Ejemplo n.º 1
0
        def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            """
            Perform one reveal-and-refine update of the belief state.

            The belief state ``si`` is mapped to observation space, masked
            inputs are built for the primary and guide policies, a latent
            step ``zi`` is drawn (reparametrized by ``zi_zmuv``), and the
            belief state is updated from that sample.

            Returns the tuple
            ``(sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g)``:
            the next belief state, the next revelation masks for the primary
            and guide policies, the NLL of the values newly revealed to the
            primary policy, and the per-step KL(q || p), KL(p || q) and
            KL(p || N(0, I)).
            """
            # view the current belief state in observation space
            x_guess = self._from_si_to_x(si)
            # residual between target and guess, scaled by softplus(ss)
            scaled_resid = T.log(1.0 + T.exp(ss)) * (self.x_out - x_guess)

            # primary policy inputs: x_out weighted by mi_p, guesses by
            # (1 - mi_p), and the residual weighted by mi_p
            p_obs = (mi_p * self.x_out) + ((1.0 - mi_p) * x_guess)
            p_resid = mi_p * scaled_resid

            # guide policy: grow its revelation mask first. only its residual
            # input uses the grown mask; its observation input is the same
            # masked belief state that the primary policy sees
            mip1_q = mi_q + ((1.0 - mi_q) * q_masks)
            q_resid = mip1_q * scaled_resid

            # Gaussian over the next zi under the primary policy
            p_mu, p_lv = self.p_zi_given_xi.apply(
                T.horizontal_stack(p_obs, p_resid), do_samples=False
            )
            p_draw = p_mu + (T.exp(0.5 * p_lv) * zi_zmuv)
            # Gaussian over the next zi under the guide policy
            q_mu, q_lv = self.q_zi_given_xi.apply(
                T.horizontal_stack(p_obs, q_resid), do_samples=False
            )
            q_draw = q_mu + (T.exp(0.5 * q_lv) * zi_zmuv)
            # train_switch[0] blends the guide sample with the primary sample
            switch = self.train_switch[0]
            zi = (switch * q_draw) + ((1.0 - switch) * p_draw)

            # per-step KL divergences between the two policies and the prior
            kldi_q2p = gaussian_kld(q_mu, q_lv, p_mu, p_lv)  # KL(q || p)
            kldi_p2q = gaussian_kld(p_mu, p_lv, q_mu, q_lv)  # KL(p || q)
            kldi_p2g = gaussian_kld(p_mu, p_lv, 0.0, 0.0)  # KL(p || N(0, I))

            # update the belief state from the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            proposal = hydra_out[0]
            if self.step_type != "jump":
                # gated, LSTM-like update of the belief state
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * proposal)
            else:
                # jump steps replace the belief state outright
                sip1 = proposal

            # grow the primary policy's revelation mask
            fresh_for_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + fresh_for_p
            # NLL is charged only on the values revealed in this step
            nlli = self._construct_nll_costs(sip1, self.x_out, fresh_for_p)
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g
        def imp_step_func(zi_zmuv, si):
            """
            Run one imputation step: sample zi, then update the guesses si.

            zi_zmuv is the noise used to reparametrize the Gaussian samples
            (zi = mean + exp(0.5 * logvar) * zi_zmuv) and si is the current
            state of the guesses.

            Returns (sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g): the updated
            state, the NLL cost of the current imputation, and the per-step
            KL(q || p), KL(p || q) and KL(p || global prior).

            Raises AssertionError if self.step_type is not one of 'jump',
            'add', 'lstm' or 'layer'.
            """
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # x_mask weights the true values, (1 - x_mask) the current guesses
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                    T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar) # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar) # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0) # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                # convex mix of the old guesses and the proposed step
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                # raise explicitly: an `assert` statement is stripped under
                # `python -O`, which would leave sip1 unbound below
                raise AssertionError("Unknown step type!")

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
Ejemplo n.º 3
0
        def imp_step_func(zi_zmuv, si):
            """
            Run one imputation step: sample zi, then update the guesses si.

            zi_zmuv is the noise used to reparametrize the Gaussian samples
            (zi = mean + exp(0.5 * logvar) * zi_zmuv) and si is the current
            state of the guesses.

            Returns (sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g): the updated
            state, the NLL cost of the current imputation, and the per-step
            KL(q || p), KL(p || q) and KL(p || global prior).

            Raises AssertionError if self.step_type is not one of 'jump',
            'add', 'lstm' or 'layer'.
            """
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # x_mask weights the true values, (1 - x_mask) the current guesses
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean,
                                    zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean,
                                    zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0,
                                    0.0)  # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                # convex mix of the old guesses and the proposed step
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                # raise explicitly: an `assert` statement is stripped under
                # `python -O`, which would leave sip1 unbound below
                raise AssertionError("Unknown step type!")

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
Ejemplo n.º 4
0
 def _construct_compute_fe_terms(self):
     """
     Build an estimator for the two terms of the variational free-energy.

     A theano function is compiled that outputs the log-likelihood term and
     the posterior KL-divergence term (summed over axis 1) for a matrix
     input. The returned callable ``fe_term_estimator(X, sample_count)``
     evaluates it ``sample_count`` times and returns
     ``[mean_nll, mean_kld]``.
     """
     # fresh symbolic input; self.Xc and self.Xm are fed zeros shaped like it
     data_in = T.matrix()
     zeros_for_xc = T.zeros_like(data_in)
     zeros_for_xm = T.zeros_like(data_in)
     # the log-likelihood term depends on the observation model
     if self.x_type != 'bernoulli':
         ll_term = log_prob_gaussian2(self.x, self.xg,
                                      log_vars=self.bounded_logvar)
     else:
         ll_term = log_prob_bernoulli(self.x, self.xg)
     # KLd between posterior and prior, summed over axis 1
     kld_term = T.sum(
         gaussian_kld(self.q_z_given_x.output_mean,
                      self.q_z_given_x.output_logvar,
                      self.prior_mean, self.prior_logvar),
         axis=1)
     # each call of this function yields one sample of both terms
     draw_terms = theano.function(
         inputs=[data_in],
         outputs=[ll_term, kld_term],
         givens={self.Xd: data_in,
                 self.Xc: zeros_for_xc,
                 self.Xm: zeros_for_xm})

     def fe_term_estimator(X, sample_count):
         """Average sample_count draws; return [mean NLL, mean KLd]."""
         row_count = X.shape[0]
         ll_total = np.zeros((row_count,))
         kld_total = np.zeros((row_count,))
         for _ in range(sample_count):
             ll_draw, kld_draw = draw_terms(X)
             ll_total = ll_total + ll_draw.ravel()
             kld_total = kld_total + kld_draw.ravel()
         denom = float(sample_count)
         # negate the summed log-likelihood to report an NLL
         return [-ll_total / denom, kld_total / denom]

     return fe_term_estimator
Ejemplo n.º 5
0
    def _construct_chain_kld_cost(self, cost_decay=0.1):
        """
        Construct the posterior KL-d from prior part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is used
        to initialize a long(ish) running markov chain.

        Step i of the chain is weighted by ``cost_decay ** i``, and the
        weighted per-step costs are normalized by the sum of the weights.

        Args:
            cost_decay: multiplicative decay of the step weights; must lie
                in [0.0, 1.0].

        Returns:
            The weighted average, over the first ``self.chain_len`` entries
            of ``self.IN_chain``, of each step's batch-mean KLd.
        """
        assert((cost_decay >= 0.0) and (cost_decay <= 1.0))
        kld_costs = []
        step_weights = []
        step_weight = 1.0
        for i in range(self.chain_len):
            IN_i = self.IN_chain[i]
            # basic variational term on KL divergence between post and prior
            kld_i = gaussian_kld(IN_i.output_mean, IN_i.output_logvar,
                                 self.prior_mean, self.prior_logvar)
            # sum over axis 1, then average the result
            step_cost = T.mean(T.sum(kld_i, axis=1))
            # reweight the KLd cost for this step in the chain
            kld_costs.append(step_weight * step_cost)
            step_weights.append(step_weight)
            step_weight = step_weight * cost_decay
        kld_cost = sum(kld_costs) / sum(step_weights)
        return kld_cost
Ejemplo n.º 6
0
    def _construct_chain_kld_cost(self, cost_decay=0.1):
        """
        Construct the posterior KL-d from prior part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is used
        to initialize a long(ish) running markov chain.

        Step i of the chain is weighted by ``cost_decay ** i``, and the
        weighted per-step costs are normalized by the sum of the weights.

        Args:
            cost_decay: multiplicative decay of the step weights; must lie
                in [0.0, 1.0].

        Returns:
            The weighted average, over the first ``self.chain_len`` entries
            of ``self.IN_chain``, of each step's batch-mean KLd.
        """
        assert ((cost_decay >= 0.0) and (cost_decay <= 1.0))
        kld_costs = []
        step_weights = []
        step_weight = 1.0
        for i in range(self.chain_len):
            IN_i = self.IN_chain[i]
            # basic variational term on KL divergence between post and prior
            kld_i = gaussian_kld(IN_i.output_mean, IN_i.output_logvar,
                                 self.prior_mean, self.prior_logvar)
            # sum over axis 1, then average the result
            step_cost = T.mean(T.sum(kld_i, axis=1))
            # reweight the KLd cost for this step in the chain
            kld_costs.append(step_weight * step_cost)
            step_weights.append(step_weight)
            step_weight = step_weight * cost_decay
        kld_cost = sum(kld_costs) / sum(step_weights)
        return kld_cost
    def _construct_compute_fe_terms(self):
        """
        Build an estimator for the two terms of the variational free-energy.

        A theano function of ``self.x_in`` is compiled that outputs the
        log-likelihood term and the posterior KL-divergence term (summed
        over axis 1). The returned callable
        ``fe_term_estimator(X, sample_count)`` evaluates it ``sample_count``
        times and returns ``[mean_nll, mean_kld]``.
        """
        # the log-likelihood term depends on the observation model
        if self.x_type != 'bernoulli':
            ll_term = log_prob_gaussian2(self.x_in, self.xg,
                                         log_vars=self.bounded_logvar)
        else:
            ll_term = log_prob_bernoulli(self.x_in, self.xg)
        # KLd between posterior and prior, summed over axis 1
        kld_term = T.sum(
            gaussian_kld(self.z_mean, self.z_logvar,
                         self.prior_mean, self.prior_logvar),
            axis=1)
        # each call of this function yields one sample of both terms
        draw_terms = theano.function(inputs=[self.x_in],
                                     outputs=[ll_term, kld_term])

        def fe_term_estimator(X, sample_count):
            """Average sample_count draws; return [mean NLL, mean KLd]."""
            X = to_fX(X)
            row_count = X.shape[0]
            ll_total = np.zeros((row_count, ))
            kld_total = np.zeros((row_count, ))
            for _ in range(sample_count):
                ll_draw, kld_draw = draw_terms(X)
                ll_total = ll_total + ll_draw.ravel()
                kld_total = kld_total + kld_draw.ravel()
            denom = float(sample_count)
            # negate the summed log-likelihood to report an NLL
            return [-ll_total / denom, kld_total / denom]

        return fe_term_estimator
 def _construct_compute_fe_terms(self):
     """
     Build an estimator for the two terms of the variational free-energy.

     A theano function of ``self.x_in`` is compiled that outputs the
     log-likelihood term and the posterior KL-divergence term (summed over
     axis 1). The returned callable ``fe_term_estimator(X, sample_count)``
     evaluates it ``sample_count`` times and returns
     ``[mean_nll, mean_kld]``.
     """
     # the log-likelihood term depends on the observation model
     if self.x_type != 'bernoulli':
         ll_term = log_prob_gaussian2(self.x_in, self.xg,
                                      log_vars=self.bounded_logvar)
     else:
         ll_term = log_prob_bernoulli(self.x_in, self.xg)
     # KLd between posterior and prior, summed over axis 1
     kld_term = T.sum(
         gaussian_kld(self.z_mean, self.z_logvar,
                      self.prior_mean, self.prior_logvar),
         axis=1)
     # each call of this function yields one sample of both terms
     draw_terms = theano.function(inputs=[self.x_in],
                                  outputs=[ll_term, kld_term])

     def fe_term_estimator(X, sample_count):
         """Average sample_count draws; return [mean NLL, mean KLd]."""
         X = to_fX(X)
         row_count = X.shape[0]
         ll_total = np.zeros((row_count,))
         kld_total = np.zeros((row_count,))
         for _ in range(sample_count):
             ll_draw, kld_draw = draw_terms(X)
             ll_total = ll_total + ll_draw.ravel()
             kld_total = kld_total + kld_draw.ravel()
         denom = float(sample_count)
         # negate the summed log-likelihood to report an NLL
         return [-ll_total / denom, kld_total / denom]

     return fe_term_estimator
Ejemplo n.º 9
0
        def imp_step_func(zi_zmuv, si):
            """
            Run one imputation step: sample zi, then update the guesses si.

            zi_zmuv is the noise used to reparametrize the Gaussian samples
            and si is the current state of the guesses.

            Returns (sip1, nlli, kldi_q2p, kldi_p2q): the updated state, the
            NLL cost of the current imputation, and two per-step KLds. When
            self.use_osm_mode is set, both KLds are KL(p || N(0, I)) and zi
            is always drawn from the primary policy.
            """
            guess_x = self.obs_transform(si)
            # x_mask weights the true values, (1 - x_mask) the current guesses
            mixed_x = (self.x_mask * self.x_out) + \
                      ((1.0 - self.x_mask) * guess_x)
            # Gaussian over the next zi under the global policy
            p_mu, p_lv = self.p_zi_given_xi.apply(mixed_x, do_samples=False)
            p_draw = p_mu + (T.exp(0.5 * p_lv) * zi_zmuv)
            # Gaussian over the next zi under the guide policy, which also
            # receives the full self.x_out
            q_mu, q_lv = self.q_zi_given_x_xi.apply(
                T.horizontal_stack(mixed_x, self.x_out), do_samples=False)
            q_draw = q_mu + (T.exp(0.5 * q_lv) * zi_zmuv)

            if not self.use_osm_mode:
                # train_switch[0] blends the guide and primary samples
                zi = ((self.train_switch[0] * q_draw) +
                      ((1.0 - self.train_switch[0]) * p_draw))
                kldi_q2p = gaussian_kld(q_mu, q_lv, p_mu, p_lv)
                kldi_p2q = gaussian_kld(p_mu, p_lv, q_mu, q_lv)
            else:
                # the guide policy is ignored; both KLds are measured from
                # the primary policy to a zero-mean, zero-logvar Gaussian
                zi = p_draw
                kldi_q2p = gaussian_kld(p_mu, p_lv, 0.0, 0.0)
                kldi_p2q = gaussian_kld(p_mu, p_lv, 0.0, 0.0)

            # update the guesses from the sampled zi
            hydra_out = self.p_xip1_given_zi.apply(zi)
            proposal = hydra_out[0]
            if self.step_type != 'jump':
                # gated, LSTM-style update of the current guesses
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * proposal)
            else:
                # jump steps do a full swap (like a standard VAE)
                sip1 = proposal
            # NLL for the current imputation; the mask passed on is zeroed
            zeroed_mask = 0.0*self.x_mask
            nlli = self._construct_nll_costs(sip1, self.x_out, zeroed_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q
        def imp_step_func(zi_zmuv, si):
            """
            Run one imputation step: sample zi, then update the guesses si.

            zi_zmuv is the noise used to reparametrize the Gaussian samples
            and si is the current state of the guesses.

            Returns (sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g): the updated
            state, the NLL cost of the current imputation, and the per-step
            KL(q || p), KL(p || q) and KL(p || global prior).
            """
            known = self.x_mask
            unknown = 1.0 - known
            guess_x = self._from_si_to_x(si)
            # residual between the target and the current guess
            resid_full = self.x_out - guess_x
            # x_mask weights the true values, (1 - x_mask) the current guesses
            mixed_x = (known * self.x_out) + (unknown * guess_x)
            # the global policy sees the residual weighted by x_mask, with
            # self.grad_null weighted by (1 - x_mask)
            resid_mixed = (known * resid_full) + (unknown * self.grad_null)

            # Gaussian over the next zi under the global policy
            p_mu, p_lv = self.p_zi_given_xi.apply(
                T.horizontal_stack(mixed_x, resid_mixed), do_samples=False)
            p_draw = p_mu + (T.exp(0.5 * p_lv) * zi_zmuv)
            # Gaussian over the next zi under the guide policy, which gets
            # the unmasked residual
            q_mu, q_lv = self.q_zi_given_xi.apply(
                T.horizontal_stack(mixed_x, resid_full), do_samples=False)
            q_draw = q_mu + (T.exp(0.5 * q_lv) * zi_zmuv)

            # train_switch[0] blends the guide and primary samples
            zi = ((self.train_switch[0] * q_draw) +
                  ((1.0 - self.train_switch[0]) * p_draw))
            # per-step KL divergences
            kldi_q2p = gaussian_kld(q_mu, q_lv, p_mu, p_lv)  # KL(q || p)
            kldi_p2q = gaussian_kld(p_mu, p_lv, q_mu, q_lv)  # KL(p || q)
            kldi_p2g = gaussian_kld(p_mu, p_lv, 0.0, 0.0)  # KL(p || global prior)

            # update the guesses from the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            proposal = hydra_out[0]
            if self.step_type != 'jump':
                # gated, LSTM-style update of the current guesses
                write_gate = T.nnet.sigmoid(3.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(3.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * proposal)
            else:
                # jump steps completely overwrite the current guesses
                sip1 = proposal
            # NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
 def _construct_kld_costs(self):
     """
     Build the KL-divergence cost between the posterior and the prior.

     The KLd is computed elementwise from (z_mean, z_logvar) against
     (prior_mean, prior_logvar), then summed over axis 1 with the reduced
     axis kept.
     """
     elementwise_kld = gaussian_kld(self.z_mean, self.z_logvar,
                                    self.prior_mean, self.prior_logvar)
     return T.sum(elementwise_kld, axis=1, keepdims=True)
Ejemplo n.º 12
0
 def _construct_kld_costs(self):
     """
     Construct the posterior KL-divergence part of cost to minimize.

     Returns:
         [kld_z, kld_hi_cond, kld_hi_glob]. Each entry is summed over
         axis 1 with keepdims=True; the two hi costs are also summed over
         the self.ir_steps refinement steps. self.l1l2_weight[0] mixes the
         plain KLd (weight w) with the squared KLd (weight 1 - w) for
         kld_z and kld_hi_cond, while kld_hi_glob uses the squared KLd only.
     """
     # construct KLd cost for the distributions over hi. the prior over
     # hi is given by a distribution conditioned on si, which we estimate
     # using self.p_hi_given_si[i]. the conditionals produced by each
     # self.p_hi_given_si[i] will also be regularized towards a shared
     # prior, e.g. a Gaussian with zero mean and unit variance.
     l1_weight = self.l1l2_weight[0]
     kld_hi_conds = []
     kld_hi_globs = []
     for i in range(self.ir_steps):
         q_hi = self.q_hi_given_x_si[i]
         p_hi = self.p_hi_given_si[i]
         # KL(q || p) between the guide and the conditional prior
         step_cond = gaussian_kld(q_hi.output_mean, q_hi.output_logvar,
                                  p_hi.output_mean, p_hi.output_logvar)
         # KLd from the conditional prior to a zero-mean, zero-logvar Gaussian
         step_glob = gaussian_kld(p_hi.output_mean, p_hi.output_logvar,
                                  0.0, 0.0)
         step_cond_l1l2 = (l1_weight * step_cond) + \
                 ((1.0 - l1_weight) * step_cond**2.0)
         kld_hi_conds.append(T.sum(step_cond_l1l2, axis=1, keepdims=True))
         kld_hi_globs.append(T.sum(step_glob**2.0, axis=1, keepdims=True))
     # compute the batch-wise costs
     kld_hi_cond = sum(kld_hi_conds)
     kld_hi_glob = sum(kld_hi_globs)
     # construct KLd cost for the distributions over z
     kld_z_all = gaussian_kld(self.q_z_given_x.output_mean,
                              self.q_z_given_x.output_logvar,
                              0.0, 0.0)
     kld_z_l1l2 = (l1_weight * kld_z_all) + \
             ((1.0 - l1_weight) * kld_z_all**2.0)
     kld_z = T.sum(kld_z_l1l2, axis=1, keepdims=True)
     return [kld_z, kld_hi_cond, kld_hi_glob]
Ejemplo n.º 13
0
 def _construct_kld_costs(self):
     """
     Construct the posterior KL-divergence part of cost to minimize.

     Returns:
         [kld_z, kld_hi_cond, kld_hi_glob]. Each entry is summed over
         axis 1 with keepdims=True; the two hi costs are also summed over
         the self.ir_steps refinement steps. self.l1l2_weight[0] mixes the
         plain KLd (weight w) with the squared KLd (weight 1 - w) for
         kld_z and kld_hi_cond, while kld_hi_glob uses the squared KLd only.
     """
     # construct KLd cost for the distributions over hi. the prior over
     # hi is given by a distribution conditioned on si, which we estimate
     # using self.p_hi_given_si[i]. the conditionals produced by each
     # self.p_hi_given_si[i] will also be regularized towards a shared
     # prior, e.g. a Gaussian with zero mean and unit variance.
     l1_weight = self.l1l2_weight[0]
     kld_hi_conds = []
     kld_hi_globs = []
     for i in range(self.ir_steps):
         q_hi = self.q_hi_given_x_si[i]
         p_hi = self.p_hi_given_si[i]
         # KL(q || p) between the guide and the conditional prior
         step_cond = gaussian_kld(q_hi.output_mean, q_hi.output_logvar,
                                  p_hi.output_mean, p_hi.output_logvar)
         # KLd from the conditional prior to a zero-mean, zero-logvar Gaussian
         step_glob = gaussian_kld(p_hi.output_mean, p_hi.output_logvar,
                                  0.0, 0.0)
         step_cond_l1l2 = (l1_weight * step_cond) + \
                 ((1.0 - l1_weight) * step_cond**2.0)
         kld_hi_conds.append(T.sum(step_cond_l1l2, axis=1, keepdims=True))
         kld_hi_globs.append(T.sum(step_glob**2.0, axis=1, keepdims=True))
     # compute the batch-wise costs
     kld_hi_cond = sum(kld_hi_conds)
     kld_hi_glob = sum(kld_hi_globs)
     # construct KLd cost for the distributions over z
     kld_z_all = gaussian_kld(self.q_z_given_x.output_mean,
                              self.q_z_given_x.output_logvar,
                              0.0, 0.0)
     kld_z_l1l2 = (l1_weight * kld_z_all) + \
             ((1.0 - l1_weight) * kld_z_all**2.0)
     kld_z = T.sum(kld_z_l1l2, axis=1, keepdims=True)
     return [kld_z, kld_hi_cond, kld_hi_glob]
 def _construct_kld_costs(self):
     """
     Build the KL-divergence cost between the posterior and the prior.

     The KLd is computed elementwise from (z_mean, z_logvar) against
     (prior_mean, prior_logvar), then summed over axis 1 with the reduced
     axis kept.
     """
     elementwise_kld = gaussian_kld(self.z_mean, self.z_logvar,
                                    self.prior_mean, self.prior_logvar)
     return T.sum(elementwise_kld, axis=1, keepdims=True)
 def _construct_compute_post_klds(self):
     """
     Compile a theano function reporting the posterior KL-divergences.

     The returned function takes a value for self.x_in and outputs the
     unreduced KLds between (z_mean, z_logvar) and
     (prior_mean, prior_logvar).
     """
     # symbolic KLds; no reduction is applied here
     posterior_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                   self.prior_mean, self.prior_logvar)
     return theano.function(inputs=[self.x_in], outputs=posterior_klds)
 def _construct_compute_post_klds(self):
     """
     Compile a theano function reporting the posterior KL-divergences.

     The returned function takes a value for self.x_in and outputs the
     unreduced KLds between (z_mean, z_logvar) and
     (prior_mean, prior_logvar).
     """
     # symbolic KLds; no reduction is applied here
     posterior_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                   self.prior_mean, self.prior_logvar)
     return theano.function(inputs=[self.x_in], outputs=posterior_klds)
Ejemplo n.º 17
0
    def _construct_compute_post_klds(self):
        """
        Build a function reporting KLds of the approximate posteriors.

        The returned callable ``post_kld_computer(X)`` returns
        ``[f_kld_z, f_kld_hi_cond, f_kld_hi_glob]``: the KLd for the
        initialization step, and the conditional and global KLds each
        accumulated over the self.ir_steps refinement steps.
        """
        x = T.matrix()
        step_ids = range(self.ir_steps)
        # per-step KLd between the guide and the conditional prior
        cond_klds = [
            gaussian_kld(self.q_hi_given_x_si[i].output_mean,
                         self.q_hi_given_x_si[i].output_logvar,
                         self.p_hi_given_si[i].output_mean,
                         self.p_hi_given_si[i].output_logvar)
            for i in step_ids
        ]
        # per-step KLd from the conditional prior to a zero-mean,
        # zero-logvar Gaussian
        glob_klds = [
            gaussian_kld(self.p_hi_given_si[i].output_mean,
                         self.p_hi_given_si[i].output_logvar, 0.0, 0.0)
            for i in step_ids
        ]
        # KLd for the initialization step
        kld_z_all = gaussian_kld(self.q_z_given_x.output_mean,
                                 self.q_z_given_x.output_logvar,
                                 0.0, 0.0)
        # output layout: conditional KLds, then global KLds, then the z KLd
        kld_outputs = cond_klds + glob_klds + [kld_z_all]
        kld_func = theano.function(inputs=[x], outputs=kld_outputs,
                                   givens={self.x: x})

        def post_kld_computer(X):
            """Evaluate all KLds on X and accumulate them over IR steps."""
            values = kld_func(X)
            acc_shape = values[0].shape
            cond_total = np.zeros(acc_shape)
            glob_total = np.zeros(acc_shape)
            for j in range(self.ir_steps):
                cond_total += values[j]
                # global KLds sit self.ir_steps slots after the conditionals
                glob_total += values[j + self.ir_steps]
            return [values[-1], cond_total, glob_total]

        return post_kld_computer
Ejemplo n.º 18
0
 def _construct_kld_cost(self):
     """
     Analytic KL divergence from each approximate posterior to the prior.

     The posterior is given by self.output_mean / self.output_logvar; the
     prior is the isotropic Gaussian with mean 0 and standard deviation
     self.prior_sigma. The KLd is summed over axis 1 with the reduced axis
     kept.
     """
     # the prior's log-variance is log(prior_sigma^2)
     zero_mean = 0.0
     log_sigma_sq = np.log(self.prior_sigma**2.0)
     elementwise_kld = gaussian_kld(self.output_mean, self.output_logvar,
                                    zero_mean, log_sigma_sq)
     return T.sum(elementwise_kld, axis=1, keepdims=True)
 def _construct_kld_costs(self):
     """
     Build the L1 and L2 KL-divergence costs between posterior and prior.

     Returns [kld_l1_costs, kld_l2_costs]: the KLd summed over axis 1
     (reduced axis kept), and the squared difference between that sum and
     self.kld_z_mean[0].
     """
     elementwise_kld = gaussian_kld(self.z_mean, self.z_logvar,
                                    self.prior_mean, self.prior_logvar)
     l1_part = T.sum(elementwise_kld, axis=1, keepdims=True)
     # squared deviation of the summed KLd from self.kld_z_mean[0]
     l2_part = (l1_part - self.kld_z_mean[0])**2.0
     return [l1_part, l2_part]
 def _construct_compute_post_klds(self):
     """
     Build a function reporting KLds of the approximate posteriors.

     The returned callable ``post_kld_computer(X)`` returns
     ``[f_kld_z, f_kld_hi_cond, f_kld_hi_glob]``: the KLd for the
     initialization step, and the conditional and global KLds each
     accumulated over the self.ir_steps refinement steps.
     """
     x = T.matrix()
     step_ids = range(self.ir_steps)
     # per-step KLd between the guide and the conditional prior
     cond_klds = [
         gaussian_kld(self.q_hi_given_x_si[i].output_mean,
                      self.q_hi_given_x_si[i].output_logvar,
                      self.p_hi_given_si[i].output_mean,
                      self.p_hi_given_si[i].output_logvar)
         for i in step_ids
     ]
     # per-step KLd from the conditional prior to a zero-mean,
     # zero-logvar Gaussian
     glob_klds = [
         gaussian_kld(self.p_hi_given_si[i].output_mean,
                      self.p_hi_given_si[i].output_logvar, 0.0, 0.0)
         for i in step_ids
     ]
     # KLd for the initialization step
     kld_z_all = gaussian_kld(self.q_z_given_x.output_mean,
                              self.q_z_given_x.output_logvar,
                              0.0, 0.0)
     # output layout: conditional KLds, then global KLds, then the z KLd
     kld_outputs = cond_klds + glob_klds + [kld_z_all]
     kld_func = theano.function(inputs=[x], outputs=kld_outputs,
                                givens={self.x: x})

     def post_kld_computer(X):
         """Evaluate all KLds on X and accumulate them over IR steps."""
         values = kld_func(X)
         acc_shape = values[0].shape
         cond_total = np.zeros(acc_shape)
         glob_total = np.zeros(acc_shape)
         for j in range(self.ir_steps):
             cond_total += values[j]
             # global KLds sit self.ir_steps slots after the conditionals
             glob_total += values[j + self.ir_steps]
         return [values[-1], cond_total, glob_total]

     return post_kld_computer
Ejemplo n.º 21
0
 def _construct_kld_costs(self, p=1.0):
     """
     Construct the posterior KL-divergence part of cost to minimize.

     Every KLd is raised to the power ``p`` and summed over axis 1 with the
     reduced axis kept.

     Returns [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q], where the two
     hi costs are additionally summed over the self.ir_steps steps of
     self.kldi_q2p / self.kldi_p2q.
     """
     def powered_row_sum(kld):
         # raise to the p-th power, then sum over axis 1
         return T.sum(kld**p, axis=1, keepdims=True)

     step_ids = range(self.ir_steps)
     # costs for the refinement steps, accumulated over all steps
     kld_hi_q2p = sum([powered_row_sum(self.kldi_q2p[i]) for i in step_ids])
     kld_hi_p2q = sum([powered_row_sum(self.kldi_p2q[i]) for i in step_ids])
     # costs for the distributions over z, in both directions
     z_q2p_all = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                              self.p_z_mean, self.p_z_logvar)
     z_p2q_all = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                              self.q_z_mean, self.q_z_logvar)
     kld_z_q2p = powered_row_sum(z_q2p_all)
     kld_z_p2q = powered_row_sum(z_p2q_all)
     return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q]
Ejemplo n.º 22
0
        def ir_step_func(hi_zmuv, sim1):
            """
            Perform one refinement step: draw hi, update the state
            sim1 -> si, and compute the step's NLL and KLds.

            hi_zmuv is the noise that gets scaled by exp(0.5 * logvar) and
            shifted by the mean to form the hi samples; sim1 is the previous
            state. Returns (si, nlli, kldi_q2p, kldi_p2q).
            """
            # map the previous state into observation space
            prev_obs = self.obs_transform(sim1)
            # difference between the target and the current reconstruction
            residual = self.x_out - prev_obs

            # mean/logvar of hi under p, given the transformed state
            p_mean, p_logvar = self.p_hi_given_si.apply(
                prev_obs, do_samples=False)
            # mean/logvar of hi under q, given the residual and the state
            q_input = T.horizontal_stack(residual, prev_obs)
            q_mean, q_logvar = self.q_hi_given_x_si.apply(
                q_input, do_samples=False)
            # both samples are built from the same noise hi_zmuv
            q_sample = (T.exp(0.5 * q_logvar) * hi_zmuv) + q_mean
            p_sample = (T.exp(0.5 * p_logvar) * hi_zmuv) + p_mean

            # blend the samples: switch = 1 selects q, switch = 0 selects p
            switch = self.train_switch[0]
            hi = (switch * q_sample) + ((1.0 - switch) * p_sample)

            # the update network is applied to hi only
            in_gate_pre, forget_gate_pre, candidate = \
                self.p_sip1_given_si_hi.apply(hi)

            # squash the gate pre-activations (shifted by +2.0)
            i_gate = 1.0 * T.nnet.sigmoid(in_gate_pre + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(forget_gate_pre + 2.0)
            # LSTM-like gated update of the state
            si = (candidate * i_gate) + (sim1 * f_gate)

            # value of log_prob_func for the updated state
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # KLds between q and p over hi, in both directions
            kldi_q2p = gaussian_kld(q_mean, q_logvar, p_mean, p_logvar)
            kldi_p2q = gaussian_kld(p_mean, p_logvar, q_mean, q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q
Ejemplo n.º 23
0
 def _construct_compute_post_klds(self):
     """
     Compile a theano function that maps an input matrix to the KLds
     between the output distribution of q_z_given_x and the prior.

     The compiled function takes one matrix, which is substituted for
     self.Xd; self.Xc and self.Xm are replaced by zero matrices of the
     same shape.
     """
     # symbolic input plus zero stand-ins for self.Xc and self.Xm
     x_sym = T.matrix()
     zeros_for_xc = T.zeros_like(x_sym)
     zeros_for_xm = T.zeros_like(x_sym)
     q_net = self.q_z_given_x
     post_klds = gaussian_kld(q_net.output_mean, q_net.output_logvar,
                              self.prior_mean, self.prior_logvar)
     substitutions = {self.Xd: x_sym, self.Xc: zeros_for_xc,
                      self.Xm: zeros_for_xm}
     return theano.function(inputs=[x_sym], outputs=post_klds,
                            givens=substitutions)
Ejemplo n.º 24
0
 def _construct_compute_post_klds(self):
     """
     Build and return a compiled theano function computing the KLds
     between the posterior given by q_z_given_x and the prior.

     The function accepts a single matrix. That matrix stands in for
     self.Xd, while self.Xc and self.Xm are both given as zeros shaped
     like the input.
     """
     data_in = T.matrix()
     # self.Xc and self.Xm are fed zeros with the same shape as the input
     xc_zeros = T.zeros_like(data_in)
     xm_zeros = T.zeros_like(data_in)
     posterior = self.q_z_given_x
     klds_expr = gaussian_kld(posterior.output_mean,
                              posterior.output_logvar,
                              self.prior_mean, self.prior_logvar)
     given_map = {self.Xd: data_in, self.Xc: xc_zeros, self.Xm: xm_zeros}
     compiled = theano.function(inputs=[data_in], outputs=klds_expr,
                                givens=given_map)
     return compiled
Ejemplo n.º 25
0
 def _construct_kld_costs(self):
     """
     Build the per-input KL-divergence penalties between the posterior
     from q_z_given_x and the prior.

     Returns [kld_l1_costs, kld_l2_costs]. The first is the per-dimension
     KLd summed over axis 1. The second is the squared deviation of that
     sum from its batch mean, kept only for rows above the mean. The mean
     and the mask are both wrapped in disconnected_grad.
     """
     q_net = self.q_z_given_x
     # KLd for each input and each latent dimension
     per_dim_klds = gaussian_kld(q_net.output_mean, q_net.output_logvar,
                                 self.prior_mean, self.prior_logvar)
     kld_l1_costs = T.sum(per_dim_klds, axis=1, keepdims=True)
     # batch mean of the summed KLds, and which rows exceed it
     mean_expr = T.mean(kld_l1_costs)
     above_mean = kld_l1_costs > mean_expr
     batch_kld_mean = theano.gradient.disconnected_grad(mean_expr)
     mask = theano.gradient.disconnected_grad(above_mean)
     # squared deviation from the mean, zeroed for rows at or below it
     sq_dev = (kld_l1_costs - batch_kld_mean)**2.0
     kld_l2_costs = T.sum(sq_dev * mask, axis=1, keepdims=True)
     return [kld_l1_costs, kld_l2_costs]
Ejemplo n.º 26
0
 def _construct_kld_costs(self):
     """
     Construct the posterior-vs-prior KLd costs for each input.

     Two costs are returned as [kld_l1_costs, kld_l2_costs]: the KLds
     summed over axis 1, and a one-sided squared penalty on how far that
     sum lies above its batch mean. The batch mean and the "above mean"
     mask are passed through disconnected_grad before use.
     """
     posterior = self.q_z_given_x
     kld_per_dim = gaussian_kld(posterior.output_mean,
                                posterior.output_logvar,
                                self.prior_mean, self.prior_logvar)
     # total KLd per input
     kld_l1_costs = T.sum(kld_per_dim, axis=1, keepdims=True)
     raw_mean = T.mean(kld_l1_costs)
     exceeds_mean = kld_l1_costs > raw_mean
     # detach the mean and the mask from the gradient computation
     detached_mean = theano.gradient.disconnected_grad(raw_mean)
     detached_mask = theano.gradient.disconnected_grad(exceeds_mean)
     excess_sq = (kld_l1_costs - detached_mean)**2.0
     kld_l2_costs = T.sum(excess_sq * detached_mask,
                          axis=1, keepdims=True)
     return [kld_l1_costs, kld_l2_costs]
Ejemplo n.º 27
0
 def _construct_compute_post_stats(self):
     """
     Compile a theano function that reports statistics of the posteriors
     produced by self.IN.

     The compiled function takes (Xd, Xc, Xm) and returns, in order:
     all_klds (KLds against a mean 0.0 / logvar 0.0 Gaussian), obs_klds
     (all_klds summed over axis 1), dim_klds (all_klds summed over axis 0,
     divided by the number of rows of Xd) and dim_vars (the squared
     posterior means summed over axis 0, divided by the same count).
     """
     inf_net = self.IN
     # number of rows in Xd, as floatX so it can be used as a divisor
     n_obs = T.cast(self.Xd.shape[0], 'floatX')
     # KLd against a Gaussian with zero mean and zero log-variance
     all_klds = gaussian_kld(inf_net.output_mean, inf_net.output_logvar,
                             0.0, 0.0)
     obs_klds = T.sum(all_klds, axis=1)
     dim_klds = T.sum(all_klds, axis=0) / n_obs
     dim_vars = T.sum(inf_net.output_mean**2.0, axis=0) / n_obs
     return theano.function(
         inputs=[self.Xd, self.Xc, self.Xm],
         outputs=[all_klds, obs_klds, dim_klds, dim_vars])
Ejemplo n.º 28
0
 def chain_step_func(zi_zmuv, xim1):
     """
     Run one step of the chain: infer z from xim1, generate the next x,
     and compute the step's NLL and KLd.

     zi_zmuv is the noise that is scaled by exp(0.5 * logvar) and shifted
     by the mean to form the z sample. Returns (xgi, nlli, kldi).
     """
     # mean and log-variance of z for this step, inferred from xim1
     z_mean, z_logvar = self.IN.apply(xim1, do_samples=False)
     # turn the supplied noise into a sample with that mean and variance
     z_sample = (T.exp(0.5 * z_logvar) * zi_zmuv) + z_mean
     # the last generator output is the pre-transformation x
     x_pre = self.GN.apply(z_sample)[-1]
     x_gen = self.xt_transform(x_pre)
     # 'walkout' chains score against self.x_d, others against xim1
     target = self.x_d if self.chain_type == 'walkout' else xim1
     step_nll = self._log_prob(target, x_gen).flatten()
     step_kld = T.sum(
         gaussian_kld(z_mean, z_logvar, self.p_z_mean, self.p_z_logvar),
         axis=1)
     return x_gen, step_nll, step_kld
Ejemplo n.º 29
0
    def _construct_compute_fe_terms(self):
        """
        Build an estimator for the two terms of the variational free-energy:
        the negative log-likelihood and the posterior KL-divergence.

        Returns a function fe_term_estimator(X, sample_count) that evaluates
        a compiled one-sample estimate sample_count times on X and returns
        [mean_nll, mean_kld], each with one entry per row of X.
        """
        # symbolic input plus zero stand-ins for self.Xc and self.Xm
        x_sym = T.matrix()
        zeros_for_xc = T.zeros_like(x_sym)
        zeros_for_xm = T.zeros_like(x_sym)
        # log-likelihood term, chosen by the observation type
        if self.x_type == 'bernoulli':
            ll_term = log_prob_bernoulli(self.x, self.xg)
        else:
            ll_term = log_prob_gaussian2(self.x, self.xg,
                                         log_vars=self.bounded_logvar)
        q_net = self.q_z_given_x
        per_dim_klds = gaussian_kld(q_net.output_mean, q_net.output_logvar,
                                    self.prior_mean, self.prior_logvar)
        kld_term = T.sum(per_dim_klds, axis=1)
        # compiled function giving a single-sample estimate of both terms
        substitutions = {self.Xd: x_sym, self.Xc: zeros_for_xc,
                         self.Xm: zeros_for_xm}
        fe_term_sample = theano.function(inputs=[x_sym],
                                         outputs=[ll_term, kld_term],
                                         givens=substitutions)

        def fe_term_estimator(X, sample_count):
            # average the one-sample estimates over sample_count evaluations
            n_rows = X.shape[0]
            ll_total = np.zeros((n_rows, ))
            kld_total = np.zeros((n_rows, ))
            for _ in range(sample_count):
                ll_sample, kld_sample = fe_term_sample(X)
                ll_total = ll_total + ll_sample.ravel()
                kld_total = kld_total + kld_sample.ravel()
            denom = float(sample_count)
            return [-ll_total / denom, kld_total / denom]

        return fe_term_estimator
Ejemplo n.º 30
0
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s0_given_z=None, \
            p_hi_given_si=None, \
            p_sip1_given_si_hi=None, \
            q_z_given_x=None, \
            q_hi_given_x_si=None, \
            obs_dim=None, \
            z_dim=None, h_dim=None, \
            ir_steps=4, params=None, \
            shared_param_dicts=None):
        """
        Build the model's symbolic graph, costs, parameter updates, and
        compiled helper functions.

        The constructor records the given networks and symbolic inputs,
        builds the initialization step (z and s0), runs the refinement step
        function through theano.scan, assembles the joint cost
        (NLL + KLd + weight regularization), derives Adam updates for the
        q and p parameter groups, and finally compiles the training and
        sampling functions.

        Parameters:
            rng: object providing randint; used once to seed this model's
                RandStream. Despite the None default it must be provided,
                since rng.randint is called unconditionally.
            x_in: symbolic input; multiplied by a dropout mask and passed
                to q_z_given_x.
            x_out: symbolic target used by the likelihood terms and to
                form the residual fed to q_hi_given_x_si.
            p_s0_given_z: network whose apply maps z to the initial state.
            p_hi_given_si: network giving mean/logvar of hi from the
                transformed state.
            p_sip1_given_si_hi: network whose apply(hi) returns the three
                values used for the gated state update.
            q_z_given_x: network giving mean/logvar/sample of z.
            q_hi_given_x_si: network giving mean/logvar of hi from the
                residual stacked with the transformed state.
            obs_dim, z_dim, h_dim: recorded on the instance; z_dim sizes
                the p_z_mean and p_z_logvar shared variables.
            ir_steps: recorded as self.ir_steps.
            params: dict that must contain 'x_type' ('bernoulli' or
                'gaussian') and may contain 'obs_transform' ('sigmoid' or
                'none').
            shared_param_dicts: None to create fresh shared parameters, or
                a dict holding 'p_z_mean', 'p_z_logvar' and 'obs_logvar'
                to reuse existing ones.
        """
        # seed this model's random stream from the provided rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # pick the state -> observation transform; the squashing sigmoid is
        # the default when 'obs_transform' is not given
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        # bernoulli observations always use the sigmoid transform, which
        # overrides any 'obs_transform' setting handled above
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            # NOTE(review): this obs_dim-sized init_vec is never used;
            # obs_logvar below is created from the length-1 zero_ary
            init_vec = to_fX( np.zeros((self.obs_dim,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            # squash the observation log-variance into (-8, 8) via tanh
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared parameters supplied by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction log likelihood
        # (both variants return the log-probability multiplied by -1.0)
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_gaussian2(xo, xh, \
                     log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p
        # (the mask takes its shape from x_out but is applied to x_in below)
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                      self.p_z_mean, self.p_z_logvar)
        # NOTE(review): log_prob_func already multiplies the log-prob by
        # -1.0, so this second negation undoes it -- confirm this is intended
        self.init_nlls =  -1.0 * \
                self.log_prob_func(self.x_out, self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                    T.horizontal_stack(grad_ll, sim1_obs), \
                    do_samples=False)
            # both samples are formed from the same noise hi_zmuv
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # NOTE(review): the name suggests conditioning on si and hi, but
            # only hi is passed to apply here; si enters via the gated
            # update below
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
                    
            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        # only the state (first output) is fed back between steps; it
        # starts from s0
        init_values = [self.s0, None, None, None]

        self.scan_results, self.scan_updates = theano.scan(ir_step_func, \
                outputs_info=init_values, sequences=self.hi_zmuv)

        # unpack the per-step outputs in the order returned by ir_step_func
        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        # NOTE(review): obs_logvar is not added to either group here, so it
        # is not part of joint_params -- confirm this is intended
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
                self._construct_kld_costs(p=1.0)
        # mix the two KLd directions using lam_kld_q2p / lam_kld_p2q
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL from the final scan step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                          self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        # (q params use lr_1, p params use lr_2; both share mom_1 / mom_2)
        self.q_updates = get_adam_updates(params=self.q_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(params=self.p_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge the q, p and scan updates into one dict
        self.joint_updates = OrderedDict()
        for k in self.q_updates:
            self.joint_updates[k] = self.q_updates[k]
        for k in self.p_updates:
            self.joint_updates[k] = self.p_updates[k]
        # add scan updates, which seem to be required
        for k in self.scan_updates:
            self.joint_updates[k] = self.scan_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        return
    def __init__(self, rng=None,
            x_in=None, x_out=None,
            p_h_given_z=None,
            p_x_given_h=None,
            q_z_given_x=None,
            q_h_given_z_x=None,
            x_dim=None,
            z_dim=None,
            h_dim=None,
            h_det_dim=None,
            params=None,
            shared_param_dicts=None):
        """
        Build the two-stage model's symbolic graph, costs, parameter
        updates, and compiled helper functions.

        The constructor samples z (from q or from the prior parameters,
        blended by train_switch), samples h given z, generates x from h,
        assembles the joint cost (NLL + KLd + weight regularization),
        derives Adam updates for all optimizable parameters, and compiles
        the training and sampling functions.

        Parameters:
            rng: object providing randint; used once to seed this model's
                RandStream. Despite the None default it must be provided,
                since rng.randint is called unconditionally.
            x_in: symbolic input passed to q_z_given_x.
            x_out: symbolic target; concatenated with the p mean of h as
                input to q_h_given_z_x and passed to _construct_nll_costs.
            p_h_given_z: network giving mean/logvar of h from z.
            p_x_given_h: network whose apply maps h to the generated x.
            q_z_given_x: network giving mean/logvar of z from x_in.
            q_h_given_z_x: network giving mean/logvar of h.
            x_dim, z_dim, h_dim: recorded on the instance; z_dim sizes
                the p_z_mean and p_z_logvar shared variables.
            h_det_dim: None to pass only stochastic h forward; otherwise
                the number of leading columns of h taken from the p mean
                instead of from the sampled h.
            params: dict that must contain 'x_type' ('bernoulli' or
                'gaussian') and may contain 'obs_transform' ('sigmoid' or
                'none').
            shared_param_dicts: None to create fresh shared parameters, or
                a dict holding 'p_z_mean', 'p_z_logvar' and 'obs_logvar'
                to reuse existing ones.
        """
        # seed this model's random stream from the provided rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # pick the observation transform; sigmoid is the default when
        # 'obs_transform' is not given
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid transform, which
        # overrides any 'obs_transform' setting handled above
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.h_det_dim = h_det_dim

        # grab handles to the relevant HydraNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_z_x = q_h_given_z_x
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            # (shape (1, z_dim), so they can be repeated along axis 0 below)
            init_vec = to_fX( np.zeros((1,self.z_dim)) )
            self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='tsm_obs_logvar')
            # squash the observation log-variance into (-8, 8) via tanh
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared parameters supplied by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of the top-level latent z (from both p and q)
        z_q_mean, z_q_logvar = self.q_z_given_x.apply(self.x_in)
        z_q = reparametrize(z_q_mean, z_q_logvar, rng=self.rng)

        # repeat the prior parameters along axis 0, once per row of z_q
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        z_p = reparametrize(z_p_mean, z_p_logvar, rng=self.rng)

        # train_switch = 1 selects the q sample, 0 selects the p sample
        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # compute relevant KLds for this step
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)
        # samples of "hidden" latent state (from both p and q)
        h_p_mean, h_p_logvar = self.p_h_given_z.apply(self.z)
        h_p = reparametrize(h_p_mean, h_p_logvar, rng=self.rng)

        # q sees the p mean of h concatenated with x_out, not z itself
        h_q_mean, h_q_logvar = self.q_h_given_z_x.apply(
                T.concatenate([h_p_mean, self.x_out], axis=1))
        h_q = reparametrize(h_q_mean, h_q_logvar, rng=self.rng)

        # compute "stochastic" and "deterministic" parts of latent state
        h_sto = (self.train_switch[0] * h_q) + \
                ((1.0 - self.train_switch[0]) * h_p)
        h_det = h_p_mean
        if self.h_det_dim is None:
            # don't pass forward any deterministic state
            self.h = h_sto
        else:
            # pass forward some deterministic state: the first h_det_dim
            # columns come from h_det, the remaining ones from h_sto
            self.h = T.concatenate([h_det[:,:self.h_det_dim],
                                    h_sto[:,self.h_det_dim:]], axis=1)
        # compute relevant KLds for this step
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)

        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        # (p_z_mean and p_z_logvar are left out, so they get no updates)
        self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_z_given_x.mlp_params)
        child_params.extend(self.q_h_given_z_x.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # mix the two KLd directions using lam_kld_q2p / lam_kld_p2q
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        # per-input KLd cost: sum over axis 1 of both the z and h terms
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(params=self.joint_params,
                grads=self.joint_grads, alpha=self.lr,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        return
    def __init__(self, rng=None,
            x_in=None, x_mask=None, x_out=None, \
            p_h_given_x=None, \
            p_s0_given_h=None, \
            p_zi_given_xi=None, \
            p_sip1_given_zi=None, \
            p_x_given_si=None, \
            q_h_given_x=None, \
            q_zi_given_xi=None, \
            params=None, \
            shared_param_dicts=None):
        """Build the symbolic graph, costs, updates and compiled functions.

        Construction proceeds in a fixed order: read the configuration from
        ``params``, create (or adopt) the shared parameters owned by this
        model, build the initialization step (h -> s0), build the iterative
        imputation loop with ``theano.scan``, assemble the NLL / KLd /
        regularization costs, derive Adam updates, and finally compile the
        helper functions. Later statements read attributes set by earlier
        ones, so the order matters.

        Args:
            rng: Generator exposing ``randint``; used only to seed this
                model's ``RandStream``. Must not be None despite the default.
            x_in: Symbolic input. It is mixed with ``x_null`` under
                ``x_mask`` before being given to ``p_h_given_x`` and is
                given unmasked to ``q_h_given_x``.
            x_mask: Symbolic mask. Entries weighted by the mask are taken
                from the data, the remainder from the model's own values
                (presumably binary 0/1 -- TODO confirm).
            x_out: Symbolic target used by the NLL terms and to form the
                ``x_out - si_as_x`` residual fed to the step policies.
            p_h_given_x: Network whose ``apply(x, do_samples=True)`` returns
                ``(mean, logvar, samples)``.
            p_s0_given_h: Network whose ``apply(h)`` result is indexable;
                element 0 is used as the initial state ``s0``.
            p_zi_given_xi: Network whose ``apply(..., do_samples=False)``
                returns ``(mean, logvar)``; the primary step policy.
            p_sip1_given_zi: Network whose ``apply(zi)`` result is indexable;
                element 0 is the proposed step and, when ``step_type`` is
                ``'add'``, elements 1 and 2 feed the write / erase gates.
            p_x_given_si: Network whose ``mlp_params`` are optimized. It is
                not called directly in this constructor.
            q_h_given_x: Guide counterpart of ``p_h_given_x``.
            q_zi_given_xi: Guide counterpart of ``p_zi_given_xi``.
            params: Dict with keys ``'x_dim'``, ``'h_dim'``, ``'z_dim'``,
                ``'s_dim'``, ``'use_p_x_given_si'``, ``'imp_steps'``,
                ``'step_type'`` (``'add'`` or ``'jump'``), ``'x_type'``
                (``'bernoulli'`` or ``'gaussian'``) and optionally
                ``'obs_transform'`` (``'sigmoid'`` or ``'none'``).
            shared_param_dicts: None to create fresh ``s_null``,
                ``grad_null`` and ``obs_logvar`` shared variables, or a dict
                holding them under the keys ``'s_null'``, ``'grad_null'`` and
                ``'obs_logvar'``.

        Raises:
            AssertionError: If ``x_type``, ``step_type`` or
                ``obs_transform`` has an unsupported value, or if
                ``use_p_x_given_si`` is false while ``s_dim != x_dim``.
        """
        # set up a private random stream for this model, seeded from the
        # caller-supplied generator
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.h_dim = self.params['h_dim']
        self.z_dim = self.params['z_dim']
        self.s_dim = self.params['s_dim']
        self.use_p_x_given_si = self.params['use_p_x_given_si']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        if self.use_p_x_given_si:
            print("Constructing hypotheses via p_x_given_si...")
        else:
            print("Constructing hypotheses directly in x-space...")
            # without a decoder the state itself is the hypothesis, so the
            # state space and the observation space must have the same size
            assert(self.s_dim == self.x_dim)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # choose the map applied to raw outputs: sigmoid unless the caller
        # explicitly asks for 'none'
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid, overriding any
        # 'obs_transform' setting chosen above
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant InfNets
        self.p_h_given_x = p_h_given_x
        self.p_s0_given_h = p_s0_given_h
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.p_x_given_si = p_x_given_si
        self.q_h_given_x = q_h_given_x
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        # 3-tensor scanned over its leading axis below; each slice supplies
        # the noise for one imputation step (presumably zero-mean,
        # unit-variance samples -- TODO confirm against callers)
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training;
        # train_switch[0] weights the guide (q) samples and
        # 1 - train_switch[0] weights the primary (p) samples
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='gpsi_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            # NOTE(review): s_null is sized by x_dim, not s_dim -- confirm
            # this is right when use_p_x_given_si is set and the two differ
            init_ary = to_fX( np.zeros((self.x_dim,)) )
            self.s_null = theano.shared(value=init_ary, name='gpis_sn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.obs_logvar = theano.shared(value=zero_ary, name='gpsi_obs_logvar')
            # 8 * tanh(v / 8) keeps the log-variance strictly inside (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            # expose the freshly created parameters under fixed keys
            self.shared_param_dicts = {}
            self.shared_param_dicts['s_null'] = self.s_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
            self.x_null = self._from_si_to_x(self.s_null)
        else:
            # grab the parameters required by this model from a given dict
            self.s_null = self.shared_param_dicts['s_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            # 8 * tanh(v / 8) keeps the log-variance strictly inside (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.x_null = self._from_si_to_x(self.s_null)

        ##############################################
        # Compute results of the initialization step #
        ##############################################
        # the primary policy sees the data only where the mask is on; the
        # remaining entries are filled with the learned x_null
        self.x_init = (self.x_mask * self.x_in) + \
                      ((1.0 - self.x_mask) * self.x_null)
        # sample from primary and guide conditionals over h; the guide
        # conditions on the unmasked x_in
        h_p_mean, h_p_logvar, h_p = \
                self.p_h_given_x.apply(self.x_init, do_samples=True)
        h_q_mean, h_q_logvar, h_q = \
                self.q_h_given_x.apply(self.x_in, do_samples=True)
        # make h samples that can be switched between h_p and h_q
        self.h = ((self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p))
        # get the emitted initial state s0 (sampled via either p or q)
        hydra_out = self.p_s0_given_h.apply(self.h)
        self.s0 = hydra_out[0]
        # compute NLL reconstruction cost for the initialization step
        self.nll0 = self._construct_nll_costs(self.s0, self.x_out, self.x_mask)
        # compute KLds for the initialization step
        self.kldh_q2p = gaussian_kld(h_q_mean, h_q_logvar, \
                                     h_p_mean, h_p_logvar) # KL(q || p)
        self.kldh_p2q = gaussian_kld(h_p_mean, h_p_logvar, \
                                     h_q_mean, h_q_logvar) # KL(p || q)
        self.kldh_p2g = gaussian_kld(h_p_mean, h_p_logvar, \
                                     0.0, 0.0) # KL(p || global prior)

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)
        # One imputation step. scan passes the current noise slice first and
        # the previous state second; the returned tuple must keep the same
        # order as init_vals and the scan_results indices used below.
        def imp_step_func(zi_zmuv, si):
            si_as_x = self._from_si_to_x(si)
            xi_unmasked = self.x_out
            # data where the mask is on, the current guess elsewhere
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # residual between the target and the current guess; the primary
            # policy only sees it where the mask is on, with the learned
            # grad_null substituted elsewhere
            grad_unmasked = self.x_out - si_as_x
            grad_masked = (self.x_mask * grad_unmasked) + \
                          ((1.0 - self.x_mask) * self.grad_null)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply( \
                    T.horizontal_stack(xi_masked, grad_masked), \
                    do_samples=False)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy; unlike
            # the primary policy it receives the full, unmasked residual
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply( \
                    T.horizontal_stack(xi_masked, grad_unmasked), \
                    do_samples=False)
            # the same noise slice drives both the p and q samples
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, \
                                    zi_p_mean, zi_p_logvar) # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, \
                                    zi_q_mean, zi_q_logvar) # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, \
                                    0.0, 0.0) # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            else:
                # additive steps update the current guesses like an LSTM;
                # the +3.0 offset biases both gates towards being open
                write_gate = T.nnet.sigmoid(3.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(3.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop; only the state
        # is recurrent (seeded with s0), the four None entries mark
        # per-step outputs that are collected but not fed back
        init_vals = [self.s0, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(imp_step_func, \
                    outputs_info=init_vals, sequences=self.zi_zmuv)

        # unpack the per-step outputs, in the order imp_step_func returns them
        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='gpsi_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in the model
        # NOTE(review): the parameters of p_h_given_x, p_s0_given_h and
        # q_h_given_x are not added here, so they are not passed to
        # get_adam_updates below -- confirm this is intended
        self.joint_params = [self.s_null, self.grad_null, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = \
                self._construct_kld_costs(p=1.0)
        # weighted sum of the four KLd terms, one weight per shared scalar
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step enters the objective
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        # unweighted bound: final-step NLL plus the q-side KLd term
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold in the updates returned by scan so that functions compiled
        # with joint_updates also apply them
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return
    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_h_given_z=None,
                 p_x_given_h=None,
                 q_z_given_x=None,
                 q_h_given_z_x=None,
                 x_dim=None,
                 z_dim=None,
                 h_dim=None,
                 params=None,
                 shared_param_dicts=None):
        """Assemble the two-stage model: graph, costs, updates, functions.

        The constructor reads the configuration, creates (or adopts) the
        shared parameters owned by the model, wires the z -> h -> x
        computation for both the generative (p) and inference (q) paths,
        builds the NLL / KLd / regularization costs, derives Adam updates
        and compiles the helper functions. Statement order matters because
        later steps read attributes set by earlier ones.

        Args:
            rng: Generator exposing ``randint``; used only to seed this
                model's ``RandStream``. Must not be None despite the default.
            x_in: Symbolic input given to ``q_z_given_x``.
            x_out: Symbolic target; stacked into the input of
                ``q_h_given_z_x`` and passed to the NLL cost builder.
            p_h_given_z: Network whose ``apply(z)`` returns
                ``(mean, logvar, samples)``.
            p_x_given_h: Network whose ``apply(h, do_samples=False)``
                returns a pair; the first element becomes ``x_gen``.
            q_z_given_x: Network whose ``apply(x, do_samples=True)``
                returns ``(mean, logvar, samples)``.
            q_h_given_z_x: Network whose ``apply(...)`` returns
                ``(mean, logvar, samples)``.
            x_dim: Stored as ``self.x_dim``.
            z_dim: Stored as ``self.z_dim``; sizes the prior over z.
            h_dim: Stored as ``self.h_dim``.
            params: Dict with key ``'x_type'`` (``'bernoulli'`` or
                ``'gaussian'``) and optionally ``'obs_transform'``
                (``'sigmoid'`` or ``'none'``).
            shared_param_dicts: None to create fresh shared variables, or a
                dict holding them under ``'p_z_mean'``, ``'p_z_logvar'`` and
                ``'obs_logvar'``.

        Raises:
            AssertionError: If ``x_type`` or ``obs_transform`` has an
                unsupported value.
        """
        # private random stream, seeded from the caller-supplied generator
        self.rng = RandStream(rng.randint(100000))

        # ---- configuration ------------------------------------------------
        self.params = params
        self.x_type = self.params['x_type']
        assert self.x_type in ('bernoulli', 'gaussian')
        if 'obs_transform' in self.params:
            transform_name = self.params['obs_transform']
            assert transform_name in ('sigmoid', 'none')
        else:
            # no explicit request: fall back to the sigmoid
            transform_name = 'sigmoid'
        # bernoulli observations always go through the sigmoid, whatever
        # transform was requested
        if (self.x_type == 'bernoulli') or (transform_name == 'sigmoid'):
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
        self.shared_param_dicts = shared_param_dicts

        # sizes of the observation and latent spaces
        self.x_dim, self.z_dim, self.h_dim = x_dim, z_dim, h_dim

        # the four networks this model is built from
        self.q_z_given_x = q_z_given_x
        self.q_h_given_z_x = q_h_given_z_x
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # symbolic inputs of the computation graph
        self.x_in = x_in
        self.x_out = x_out

        # switch between guide (q) and generative (p) samples:
        # train_switch[0] weights q, 1 - train_switch[0] weights p
        unit_zeros = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=unit_zeros,
                                          name='tsm_train_switch')
        self.set_train_switch(1.0)

        # ---- shared parameters owned by this model ------------------------
        if self.shared_param_dicts is not None:
            # adopt parameters supplied by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        else:
            # create fresh parameters and publish them under fixed keys
            prior_init = to_fX(np.zeros((1, self.z_dim)))
            self.p_z_mean = theano.shared(value=prior_init,
                                          name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=prior_init,
                                            name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=unit_zeros,
                                            name='tsm_obs_logvar')
            self.shared_param_dicts = {
                'p_z_mean': self.p_z_mean,
                'p_z_logvar': self.p_z_logvar,
                'obs_logvar': self.obs_logvar,
            }
        # 8 * tanh(v / 8) keeps the log-variance strictly inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

        # ---- main computation ---------------------------------------------
        print("Building TSM...")
        gate = self.train_switch[0]

        def blend(from_q, from_p):
            # convex mix of guide and generative samples, set by the switch
            return (gate * from_q) + ((1.0 - gate) * from_p)

        # first stage: z from the guide, and z from the learned prior
        z_q_mean, z_q_logvar, z_q = self.q_z_given_x.apply(
            self.x_in, do_samples=True)
        # tile the single-row prior parameters to one row per input
        batch_rows = z_q.shape[0]
        z_p_mean = self.p_z_mean.repeat(batch_rows, axis=0)
        z_p_logvar = self.p_z_logvar.repeat(batch_rows, axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)
        z_p_std = T.exp(0.5 * z_p_logvar)
        z_p = (z_p_std * zmuv) + z_p_mean
        self.z = blend(z_q, z_p)
        # KL divergences between guide and prior over z, both directions
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)

        # second stage: h from the generative path, and from the guide, which
        # additionally sees the generative mean / logvar and the target
        h_p_mean, h_p_logvar, h_p = self.p_h_given_z.apply(self.z)
        guide_input = T.horizontal_stack(h_p_mean, h_p_logvar, self.x_out)
        h_q_mean, h_q_logvar, h_q = self.q_h_given_z_x.apply(guide_input)
        self.h = blend(h_q, h_p)
        # KL divergences between guide and generative path over h
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)

        # decode h into an observation; the second output is not used
        self.x_gen, _unused = self.p_x_given_h.apply(self.h, do_samples=False)

        # ---- training hyper-parameters (shared scalars) -------------------
        unit_zeros = to_fX(np.zeros((1,)))

        def new_scalar(name):
            # every hyper-parameter starts from the same one-element zero
            # vector and is given its real value by a setter afterwards
            return theano.shared(value=unit_zeros, name=name)

        # learning rate and the two Adam momentum terms
        self.lr = new_scalar('tsm_lr')
        self.mom_1 = new_scalar('tsm_mom_1')
        self.mom_2 = new_scalar('tsm_mom_2')
        self.set_sgd_params()
        # weight on the reconstruction NLL
        self.lam_nll = new_scalar('tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weights on the two KLd directions
        self.lam_kld_q2p = new_scalar('tsm_lam_kld_q2p')
        self.lam_kld_p2q = new_scalar('tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # weight on the l2 parameter penalty
        self.lam_l2w = new_scalar('tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # ---- optimizable parameters ---------------------------------------
        # obs_logvar first, then each network's parameters; p_z_mean and
        # p_z_logvar are deliberately left out of the list
        self.joint_params = [self.obs_logvar]
        for net in (self.q_z_given_x, self.q_h_given_z_x,
                    self.p_h_given_z, self.p_x_given_h):
            self.joint_params.extend(net.mlp_params)

        # ---- costs --------------------------------------------------------
        w_q2p = self.lam_kld_q2p[0]
        w_p2q = self.lam_kld_p2q[0]
        self.kld_z = (w_q2p * self.kld_z_q2p) + (w_p2q * self.kld_z_p2q)
        self.kld_h = (w_q2p * self.kld_h_q2p) + (w_p2q * self.kld_h_p2q)
        # per-input KLd cost, then its mean over inputs
        self.kld_costs = T.sum(self.kld_z, axis=1) + T.sum(self.kld_h, axis=1)
        self.kld_cost = T.mean(self.kld_costs)
        # per-input NLL cost, then its weighted mean
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        # parameter regularization
        reg_term = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * reg_term
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        # per-input cost (no regularization term)
        self.obs_costs = self.nll_costs + self.kld_costs

        # ---- gradients and updates ----------------------------------------
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict(
            zip(self.joint_params, T.grad(self.joint_cost, self.joint_params)))

        adam_updates = get_adam_updates(
            params=self.joint_params, grads=self.joint_grads,
            alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        # keep our own ordered copy of the update mapping
        self.joint_updates = OrderedDict(
            (key, adam_updates[key]) for key in adam_updates)

        # ---- compiled functions -------------------------------------------
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
Ejemplo n.º 34
0
    def __init__(self, rng=None,
                 x_in=None, y_in=None,
                 q_z_given_x=None,
                 class_count=None,
                 z_dim=None,
                 use_samples=None):
        """Assemble the classifier: graph, costs, updates, functions.

        A linear classification layer is placed on top of the output of
        ``q_z_given_x.apply_shared`` (after dropout), and the objective
        combines the classification NLL, KLd terms between the guide and a
        learned prior over z, and a parameter penalty.

        Args:
            rng: Generator exposing ``randint``; used only to seed this
                model's ``RandStream``. Must not be None despite the default.
            x_in: Symbolic input given to ``q_z_given_x``.
            y_in: Symbolic labels passed to the NLL cost builder.
            q_z_given_x: Network providing ``apply``, ``apply_shared``,
                ``shared_layers`` and ``mlp_params``.
            class_count: Number of output columns of the classification
                layer.
            z_dim: Size of the learned prior mean / logvar vectors.
            use_samples: Stored as ``self.use_samples``; not consulted in
                this constructor.
        """
        # private random stream, seeded from the caller-supplied generator
        self.rng = RandStream(rng.randint(100000))

        # sizes; the classifier input width is the output width of the
        # network's last shared layer
        self.class_count = class_count
        self.z_dim = z_dim
        self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
        self.use_samples = use_samples

        # the inference network and the symbolic inputs of the graph
        self.q_z_given_x = q_z_given_x
        self.x_in = x_in
        self.y_in = y_in

        # shared scalar controlling the dropout rate
        unit_zeros = to_fX(np.zeros((1,)))
        self.drop_rate = theano.shared(value=unit_zeros, name='cm_drop_rate')
        self.set_drop_rate(0.0)

        # classification layer: small random weights, zero biases
        class_weights = to_fX(
            0.01 * npr.randn(self.shared_dim, self.class_count))
        class_biases = to_fX(np.zeros((self.class_count,)))
        self.W_class = theano.shared(value=class_weights, name='cm_W_class')
        self.b_class = theano.shared(value=class_biases, name='cm_b_class')
        # learned prior over z, started at zero mean and zero log-variance
        prior_init = to_fX(np.zeros((self.z_dim,)))
        self.p_z_mean = theano.shared(value=prior_init, name='cm_p_z_mean')
        self.p_z_logvar = theano.shared(value=prior_init,
                                        name='cm_p_z_logvar')

        # ---- features fed to the classifier -------------------------------
        # the sampled z returned by apply() is discarded: q_z_samples is
        # taken from apply_shared() instead
        self.q_z_mean, self.q_z_logvar, _discarded = \
            self.q_z_given_x.apply(self.x_in, do_samples=True)
        self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)

        # dropout: keep an entry when its uniform draw exceeds the rate, and
        # rescale the kept entries by 1 / (1 - rate)
        rate = self.drop_rate[0]
        keep_scale = 1. / (1. - rate)
        uniform_draws = self.rng.uniform(size=self.q_z_samples.shape,
                                         low=0.0, high=1.0,
                                         dtype=theano.config.floatX)
        keep_mask = keep_scale * (uniform_draws > rate)
        self.z = self.q_z_samples * keep_mask

        # class predictions from the linear layer
        self.y_out = T.dot(self.z, self.W_class) + self.b_class

        # KL divergences between the guide and the learned prior, both ways
        self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                       self.q_z_mean, self.q_z_logvar)

        # ---- training hyper-parameters (shared scalars) -------------------
        unit_zeros = to_fX(np.zeros((1,)))

        def new_scalar(name):
            # every hyper-parameter starts from the same one-element zero
            # vector and is given its real value by a setter afterwards
            return theano.shared(value=unit_zeros, name=name)

        # two learning rates and the two Adam momentum terms
        self.lr_1 = new_scalar('cm_lr_1')
        self.lr_2 = new_scalar('cm_lr_2')
        self.mom_1 = new_scalar('cm_mom_1')
        self.mom_2 = new_scalar('cm_mom_2')
        self.set_sgd_params()
        # weight on the classification NLL
        self.lam_nll = new_scalar('cm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weights on the two KLd directions
        self.lam_kld_q2p = new_scalar('cm_lam_kld_q2p')
        self.lam_kld_p2q = new_scalar('cm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
        # weight on the l2 parameter penalty
        self.lam_l2w = new_scalar('cm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # ---- optimizable parameters ---------------------------------------
        self.joint_params = [self.p_z_mean, self.p_z_logvar,
                             self.W_class, self.b_class]
        self.joint_params.extend(self.q_z_given_x.mlp_params)

        # ---- costs --------------------------------------------------------
        # here the NLL weight is applied per input, before averaging
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
        self.nll_cost = T.mean(self.nll_costs)
        # weighted KLd terms
        self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
        self.kld_costs = ((self.lam_kld_q2p[0] * self.kld_z_q2p) +
                          (self.lam_kld_p2q[0] * self.kld_z_p2q))
        self.kld_cost = T.mean(self.kld_costs)
        # parameter regularization and the final joint objective
        reg_term = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * reg_term
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        # per-input cost (no regularization term)
        self.obs_costs = self.nll_costs + self.kld_costs

        # ---- gradients and updates ----------------------------------------
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict(
            zip(self.joint_params, T.grad(self.joint_cost, self.joint_params)))

        # only lr_1 drives these updates
        self.joint_updates = get_adam_updates(
            params=self.joint_params, grads=self.joint_grads,
            alpha=self.lr_1, beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # ---- compiled functions -------------------------------------------
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling class error estimator...")
        self.class_error = self._construct_class_error()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        # convenient handle on the first shared layer's weights
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
Ejemplo n.º 35
0
    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_s_given_z=None,
                 p_h_given_s=None,
                 p_x_given_s_h=None,
                 q_z_given_x=None,
                 q_h_given_x_s=None,
                 x_dim=None,
                 z_dim=None,
                 s_dim=None,
                 h_dim=None,
                 params=None,
                 shared_param_dicts=None):
        """Build the symbolic graph, costs, updates and compiled functions.

        Although most arguments default to None, the constructor dereferences
        them unconditionally, so in practice they are all required (except
        ``shared_param_dicts``).

        Args:
            rng: object exposing ``randint``; used once to seed this model's
                private ``RandStream``.
            x_in: symbolic input fed (after dropout masking) to
                ``q_z_given_x``.
            x_out: symbolic input fed to ``q_h_given_x_s`` (stacked with s)
                and passed to ``self._construct_nll_costs``; its shape also
                determines the dropout mask shape.
            p_s_given_z, p_h_given_s, p_x_given_s_h: generator-side networks;
                each must expose ``apply(...)`` and ``mlp_params``, and
                ``p_x_given_s_h`` must also expose ``mu_layers``.
            q_z_given_x, q_h_given_x_s: inference-side networks; each must
                expose ``apply(...)`` and ``mlp_params``.
            x_dim, z_dim, s_dim, h_dim: dimensions recorded on the instance.
                Only ``z_dim`` is used here (to size the prior mean/logvar).
            params: dict with a required ``'x_type'`` entry (``'bernoulli'``
                or ``'gaussian'``) and an optional ``'obs_transform'`` entry
                (``'sigmoid'`` or ``'none'``; treated as ``'sigmoid'`` when
                absent). A ``'bernoulli'`` x_type always uses the sigmoid
                transform.
            shared_param_dicts: when None, fresh shared variables are created
                for ``p_z_mean``, ``p_z_logvar`` and ``obs_logvar`` and stored
                in a new dict; otherwise those three keys are read from the
                given dict and reused.
        """
        # seed a private random stream for this model from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert self.x_type in ('bernoulli', 'gaussian')
        # A missing 'obs_transform' behaves like 'sigmoid'; the value is still
        # validated even when x_type == 'bernoulli' forces the sigmoid.
        obs_transform = self.params.get('obs_transform', 'sigmoid')
        assert obs_transform in ('sigmoid', 'none')
        if (self.x_type == 'bernoulli') or (obs_transform == 'sigmoid'):
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.s_dim = s_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_x_s = q_h_given_x_s
        self.p_s_given_z = p_s_given_z
        self.p_h_given_s = p_h_given_s
        self.p_x_given_s_h = p_x_given_s_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.batch_reps = T.lscalar()

        # One length-1 zero array is used as the initial value for every
        # scalar-like shared variable below. This relies on theano.shared
        # copying its initial value (the default, borrow=False), so the
        # shared variables do not alias each other.
        zero_ary = to_fX(np.zeros((1,)))

        # setup switching variable for changing between sampling/training
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this model
            init_vec = to_fX(np.zeros((self.z_dim,)))
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse parameters created by another instance
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the observation log-variance into (-8, 8) via a scaled tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

        # get a drop mask that drops things with probability p; kept entries
        # are rescaled by 1 / (1 - p)
        # NOTE(review): the mask takes its shape from x_out but is applied to
        # x_in, so the two are assumed to have the same shape -- confirm.
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape,
                                    low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "first" latent state
        drop_x = drop_mask * self.x_in
        z_q_mean, z_q_logvar, self.z = self.q_z_given_x.apply(
            drop_x, do_samples=True)
        # compute relevant KLds for this step (both directions, against the
        # learned prior parameters p_z_mean / p_z_logvar)
        self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar,
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                       z_q_mean, z_q_logvar)
        # transform "first" latent state into "second" latent state
        self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)

        # get samples of h, conditioned on current s
        h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply(
            self.s, do_samples=True)
        # get variational samples of h, given s and x_out
        h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply(
            T.horizontal_stack(self.x_out, self.s),
            do_samples=True)

        # make h samples that can be switched between h_p and h_q
        # (train_switch == 1 selects h_q, train_switch == 0 selects h_p)
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)

        # compute relevant KLds for this step
        self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar,
                                       h_p_mean, h_p_logvar)
        self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar,
                                       h_q_mean, h_q_logvar)

        # p_x_given_s_h is conditioned on s and h.
        self.x_gen, _ = self.p_x_given_s_h.apply(
            T.horizontal_stack(self.s, self.h),
            do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        # (the inference networks)
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        # (the prior parameters and the generator networks)
        # NOTE(review): obs_logvar is not added to either group, so no
        # gradient/update is built for it here -- confirm this is intended.
        self.group_2_params = [self.p_z_mean, self.p_z_logvar]
        self.group_2_params.extend(self.p_s_given_z.mlp_params)
        self.group_2_params.extend(self.p_h_given_s.mlp_params)
        self.group_2_params.extend(self.p_x_given_s_h.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
            self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        # lam_kld_z scales only the z term; the h term enters unscaled
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
            self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_h_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
            ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        # Construct the updates for the generator and inferencer networks;
        # the two groups differ only in their learning rate (lr_1 vs. lr_2)
        self.group_1_updates = get_adam_updates(
            params=self.group_1_params,
            grads=self.joint_grads, alpha=self.lr_1,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.group_2_updates = get_adam_updates(
            params=self.group_2_params,
            grads=self.joint_grads, alpha=self.lr_2,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge both groups; on a key clash the group 2 entry wins
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.group_1_updates)
        self.joint_updates.update(self.group_2_updates)

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        # make easy access points for some interesting parameters
        self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W