Esempio n. 1
0
    def __init__(self, rng=None,
                 Xd=None, Yd=None, Xc=None, Xm=None,
                 g_net=None, i_net=None, p_net=None,
                 data_dim=None, prior_dim=None, label_dim=None,
                 params=None):
        """Build the joint computation graph, costs and updates for a GIStack.

        Shared-parameter clones of the inferencer (i_net), the generator
        (g_net) and the label predictor (p_net) are chained together: the
        inferencer's output feeds both the generator and the label predictor.
        Each symbolic input is first stacked on top of itself, so the cloned
        inferencer receives every row twice.

        Args:
            rng: random source exposing randint(); it seeds this object's
                RandStream and is passed to every shared_param_clone call.
            Xd, Yd, Xc, Xm: symbolic variables providing inputs to the graph.
            g_net: generator network; its use_decoder flag must be falsy.
            i_net: inferencer network; its use_encoder flag must be falsy.
            p_net: label-predicting network; its clone must expose exactly
                one proto-net and one spawn-net.
            data_dim: dimensionality of the data (recorded only).
            prior_dim: latent dimension that all three networks must share.
            label_dim: output size of the final layer of p_net's proto-net.
            params: accepted for interface compatibility; not read here.
        """
        # TODO: refactor for use with "encoded" inferencer/generator
        assert((not i_net.use_encoder) and (not g_net.use_decoder))

        # private random stream for this GIStack, seeded from the given rng
        self.rng = RandStream(rng.randint(100000))

        # keep handles to the symbolic inputs of the computation graph
        self.Xd, self.Yd, self.Xc, self.Xm = Xd, Yd, Xc, Xm

        def _twice(sym):
            # stack a symbolic input on top of itself
            return T.vertical_stack(sym, sym)

        self.Xd2 = _twice(self.Xd)
        self.Yd2 = _twice(self.Yd)
        self.Xc2 = _twice(self.Xc)
        self.Xm2 = _twice(self.Xm)
        self.obs_count = T.cast(self.Xd2.shape[0], 'floatX')

        # remember the dimensionalities this GIStack works with
        self.data_dim = data_dim
        self.label_dim = label_dim
        self.prior_dim = prior_dim

        # inferencer clone -> latent samples
        self.IN2 = i_net.shared_param_clone(
            rng=rng, Xd=self.Xd2, Xc=self.Xc2, Xm=self.Xm2)
        self.Xp2 = self.IN2.output
        # latent samples -> generator clone -> generated observations
        self.GN2 = g_net.shared_param_clone(rng=rng, Xp=self.Xp2)
        self.Xg2 = self.GN2.output
        # latent samples -> label-generator clone -> label predictions
        self.PN2 = p_net.shared_param_clone(rng=rng, Xd=self.Xp2)
        self.Yp2 = self.PN2.output_spawn[0]  # noisy predictions
        self.Yp2_proto = self.PN2.output_proto  # noise-free predictions

        # the PeaNet must have exactly one proto-net and one spawn-net
        assert(len(self.PN2.proto_nets) == 1)
        assert(len(self.PN2.spawn_nets) == 1)
        # every network has to agree on the latent variable dimension
        assert(self.prior_dim == self.IN2.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN2.sigma_layers[-1].out_dim)
        assert(self.prior_dim == self.GN2.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.PN2.proto_nets[0][0].in_dim)
        # and label_dim has to match the cardinality of the categorical
        # variable being "decoded"
        assert(self.label_dim == self.PN2.proto_nets[0][-1].out_dim)

        zero_ary = np.zeros((1,)).astype(theano.config.floatX)

        def _zero_shared(name):
            # one-element shared variable, initialised from zero_ary
            return theano.shared(value=zero_ary, name=name)

        # per-network learning rates
        self.lr_gn = _zero_shared('gis_lr_gn')
        self.lr_in = _zero_shared('gis_lr_in')
        self.lr_pn = _zero_shared('gis_lr_pn')
        # momentum parameters and iteration counter shared by all networks
        self.mom_1 = _zero_shared('gis_mom_1')
        self.mom_2 = _zero_shared('gis_mom_2')
        self.it_count = _zero_shared('gis_it_count')
        # give the learning-dynamics parameters their initial values
        self.set_all_sgd_params()
        # weight on the nll of data given a posterior sample
        self.lam_nll = _zero_shared('gis_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on the posterior KL-divergence from the prior
        self.lam_kld = _zero_shared('gis_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on the semi-supervised classification cost
        self.lam_cat = _zero_shared('gis_lam_cat')
        self.set_lam_cat(lam_cat=0.0)
        # weights on the PEA cost for supervised / unsupervised inputs
        self.lam_pea_su = _zero_shared('gis_lam_pea_su')
        self.lam_pea_un = _zero_shared('gis_lam_pea_un')
        self.set_lam_pea(lam_pea_su=1.0, lam_pea_un=1.0)
        # weight on the l2 regularization of parameters
        self.lam_l2w = _zero_shared('gis_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-3)

        # collect the "optimizable" parameters of the three networks
        self.gn_params = list(self.GN2.mlp_params)
        self.in_params = list(self.IN2.mlp_params)
        self.pn_params = list(self.PN2.proto_params)
        self.joint_params = self.pn_params + self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        pea_cost_su, pea_cost_un = self._construct_post_pea_costs()
        self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
        self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
        self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
        weighted_pea_su = self.lam_pea_su[0] * pea_cost_su
        weighted_pea_un = self.lam_pea_un[0] * pea_cost_un
        self.post_pea_cost = weighted_pea_su + weighted_pea_un
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
                self.post_cat_cost + self.post_pea_cost + self.other_reg_cost

        # element-wise clipped gradient of the joint cost for each parameter
        self.joint_grads = OrderedDict(
            (param, T.grad(self.joint_cost, param).clip(-0.1, 0.1))
            for param in self.joint_params)

        # Adam updates, one set per network so that each network can be
        # driven by its own learning rate.
        # (alternative optimizer: get_adadelta_updates(params=..., grads=
        #  self.joint_grads, alpha=<per-network lr>, beta1=0.98))
        adam_common = dict(grads=self.joint_grads,
                           beta1=self.mom_1, beta2=self.mom_2,
                           it_count=self.it_count,
                           mom2_init=1e-3, smoothing=1e-8)
        self.gn_updates = get_adam_updates(
            params=self.gn_params, alpha=self.lr_gn, **adam_common)
        self.in_updates = get_adam_updates(
            params=self.in_params, alpha=self.lr_in, **adam_common)
        self.pn_updates = get_adam_updates(
            params=self.pn_params, alpha=self.lr_pn, **adam_common)

        # merge the per-network updates into one training update set
        self.joint_updates = OrderedDict()
        for net_updates in (self.gn_updates, self.in_updates, self.pn_updates):
            for key in net_updates:
                self.joint_updates[key] = net_updates[key]

        # single training function for all parameters; training of the
        # individual networks is switched on and off via the learning rates
        self.train_joint = self._construct_train_joint()
Esempio n. 2
0
    def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None, \
                 obs_dim=None, z_dim=None, params=None):
        """Build the graph, costs, updates and compiled functions of a GCPair.

        A shared-parameter clone of the generator (g_net) is fed with Xp and
        a shared-parameter clone of the discriminator (d_net) is fed with the
        vertical stack of Xd and the transformed generator samples. On top of
        these, the moment-matching cost, the collaborative classification
        costs, small l2 penalties, Adam updates and the training/sampling
        functions are constructed.

        Args:
            rng: random source exposing randint(); it seeds this object's
                RandStream and is passed to the cloned networks.
            Xd: symbolic variable for samples from the data distribution.
            Xp: symbolic variable for samples from the generator's prior.
            d_net: discriminator network to clone.
            g_net: generator network to clone; the in_dim of its first shared
                layer must equal z_dim.
            obs_dim: dimensionality of the observation space.
            z_dim: dimensionality of the generator's input.
            params: dict of settings. Read here: 'lam_l2d', 'mom_mix_rate',
                'mom_match_weight', 'target_mean', 'target_cov' (required)
                and 'obs_transform' ('sigmoid' or 'none'; defaults to
                sigmoid), 'mom_match_proj' (defaults to identity).
        """
        self.rng = RandStream(rng.randint(100000))
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.params = params
        # check that z_dim agrees with input dim for g_net
        assert(self.z_dim == g_net.shared_layers[0].in_dim)
        # set the transform on generator's raw output
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)

        # symbolic var for inputting samples from the data distribution
        self.Xd = Xd
        # symbolic var for inputting samples from the generator's prior
        self.Xp = Xp
        # symbolic vector of indices for data inputs
        self.Id = T.lvector(name='gcp_Id')
        # symbolic vector of indices for noise inputs
        self.In = T.lvector(name='gcp_In')

        # create clones of the given generator and discriminator, after
        # rewiring their computation graphs to take the right inputs
        self.GN = g_net.shared_param_clone(rng=rng, Xd=self.Xp)
        self.out_mean, self.out_logvar, self.out_samples = \
                self.GN.apply(self.Xp, do_samples=True)
        self.Xg = self.obs_transform(self.out_samples)
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(self.Xd, self.Xg))

        # shared var learning rate for generator and discriminator
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
        self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
        # shared var momentum parameters for generator and discriminator
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
        # shared var weights for collaborative classification objective
        self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
        self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()    # init SGD rate/momentum
        self.set_disc_weights()  # init collaborative cost weights for GN/DN
        self.lam_l2d = theano.shared(value=(zero_ary + self.params['lam_l2d']), \
                name='gcp_lam_l2d')

        #######################################################
        # Welcome to: Moment Matching Cost Information Center #
        #######################################################
        #
        # Get parameters for managing the moment matching cost. The moment
        # matching is based on exponentially-decaying estimates of the mean
        # and covariance of the distribution induced by the generator network
        # and the (latent) noise being fed to it.
        #
        # We provide the option of performing moment matching with either the
        # raw generator output, or with linearly-transformed generator output.
        # Either way, the given target mean and covariance should have the
        # appropriate dimension for the space in which we'll be matching the
        # generator's 1st/2nd moments with the target's 1st/2nd moments. For
        # clarity, the computation we'll perform looks like:
        #
        #   Xm = X - np.mean(X, axis=0)
        #   XmP = np.dot(Xm, P)
        #   C = np.dot(XmP.T, XmP)
        #
        # where Xm is the mean-centered samples from the generator and P is
        # the matrix for the linear transform to apply prior to computing
        # the moment matching cost. For simplicity, the above code ignores the
        # use of an exponentially decaying average to track the estimated mean
        # and covariance of the generator's output distribution.
        #
        # The relative contribution of the current batch to these running
        # estimates is determined by self.mom_mix_rate. The mean estimate is
        # first updated based on the current batch, then the current batch
        # is centered with the updated mean, then the covariance estimate is
        # updated with the mean-centered samples in the current batch.
        #
        # Strength of the moment matching cost is given by
        # self.mom_match_weight. Target mean/covariance are given by
        # self.target_mean/self.target_cov. If a linear transform is to be
        # applied prior to matching, it is given by self.mom_match_proj.
        #
        C_init = to_fX( np.zeros((self.obs_dim, self.obs_dim)) )
        m_init = to_fX( np.zeros((self.obs_dim,)) )
        self.dist_cov = theano.shared(C_init, name='gcp_dist_cov')
        self.dist_mean = theano.shared(m_init, name='gcp_dist_mean')

        # plain numpy zeros here; every value built from it goes via to_fX
        zero_ary = np.zeros((1,))
        mmr = zero_ary + self.params['mom_mix_rate']
        self.mom_mix_rate = theano.shared(name='gcp_mom_mix_rate', \
            value=to_fX(mmr))
        mmw = zero_ary + self.params['mom_match_weight']
        self.mom_match_weight = theano.shared(name='gcp_mom_match_weight', \
            value=to_fX(mmw))
        targ_mean = to_fX( self.params['target_mean'] )
        targ_cov = to_fX( self.params['target_cov'] )
        assert(targ_mean.size == targ_cov.shape[0]) # mean and cov use same dim
        assert(targ_cov.shape[0] == targ_cov.shape[1]) # cov must be square
        self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
        self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')
        mmp = np.identity(targ_cov.shape[0]) # default to identity transform
        if 'mom_match_proj' in self.params:
            mmp = self.params['mom_match_proj'] # use a user-specified transform
        assert(mmp.shape[0] == self.obs_dim) # transform matches data dim
        assert(mmp.shape[1] == targ_cov.shape[0]) # and matches mean/cov dims
        mmp = to_fX( mmp )
        self.mom_match_proj = theano.shared(value=mmp, name='gcp_mom_map_proj')
        # finally, we can construct the moment matching cost! and the updates
        # for the running mean/covariance estimates too!
        self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()
        #########################################
        # Thank you for visiting the M.M.C.I.C. #
        #########################################

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the GCPair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this GCPair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.dn_params + self.gn_params
        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on collaborative binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # compute small l2 penalty on params
        self.dn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.dn_params])
        self.gn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.gn_params])

        # Cost w.r.t. discriminator parameters is the collaborative binary
        # classification cost plus regularization terms. Cost w.r.t. generator
        # parameters comprises a collaborative binary classification cost,
        # the (weighted) moment matching cost and an l2 penalty.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost + self.dn_l2_cost
        self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.gn_l2_cost
        self.joint_cost = self.dn_cost + self.gn_cost

        # Compute gradients on generator and discriminator parameters
        print("Computing gradients on generator...")
        self.gn_grads = OrderedDict()
        grad_list = T.grad(self.gn_cost, self.gn_params)
        for i, p in enumerate(self.gn_params):
            self.gn_grads[p] = grad_list[i]
        print("Computing gradients on discriminator...")
        self.dn_grads = OrderedDict()
        grad_list = T.grad(self.dn_cost, self.dn_params)
        for i, p in enumerate(self.dn_params):
            self.dn_grads[p] = grad_list[i]

        # Construct the Adam updates for the discriminator and generator
        self.dn_updates = get_adam_updates(params=self.dn_params, \
                grads=self.dn_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.gn_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        # The updates for the generator distribution's running first and
        # second-order moment estimates belong with the generator updates.
        # They have to be merged in *after* get_adam_updates, because that
        # call rebinds self.gn_updates; entries stored in self.gn_updates
        # before the call would be discarded.
        for var in self.mom_updates:
            self.gn_updates[var] = self.mom_updates[var]

        # Bag up all updates for joint training: moment estimates first,
        # then discriminator updates, then generator updates.
        self.joint_updates = OrderedDict()
        for var in self.mom_updates:
            self.joint_updates[var] = self.mom_updates[var]
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]

        # Construct batch-based training functions for the generator and
        # discriminator networks, as well as a joint training function.
        print("Compiling generator training function...")
        self.train_gn = self._construct_train_gn()
        print("Compiling discriminator training function...")
        self.train_dn = self._construct_train_dn()
        print("Compiling joint training function...")
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self._construct_model_sampler()
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """Set up the walk-out scan loop, costs and updates of a WalkoutModel.

        Args:
            rng: random source exposing randint(); seeds this object's
                RandStream.
            x_out: symbolic variable; used as the initial x state of the
                scan loop (described below as the target output).
            p_z_given_x: network whose apply(x, do_samples=False) returns a
                (mean, logvar) pair for z.
            p_x_given_z: network whose apply(z, do_samples=False) returns a
                (mean, logvar) pair for x.
            params: dict; keys 'x_dim', 'z_dim', 'walkout_steps' and
                'x_type' ('bernoulli' or 'gaussian') are required, and
                'x_transform' ('sigmoid' or 'none') is optional.
            shared_param_dicts: None to create fresh shared parameters, or a
                dict holding 'obs_logvar' to reuse.

        NOTE(review): as written this method reads several attributes that
        it never assigns (self.step_type, self.xi_zmuv, self.s0,
        self.step_scales, self.p_zi_given_xi, self.p_sip1_given_zi,
        self.p_x_given_si, self.q_zi_given_xi, self.nlli). Unless they are
        provided elsewhere (e.g. as class attributes), construction will
        raise AttributeError -- confirm against the rest of the class.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # choose the transform applied to the raw x means (default: sigmoid)
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli outputs always use the sigmoid transform, overriding
        # whatever 'x_transform' requested
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned anywhere in this
        # method -- confirm where it is supposed to come from.
        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out           # target output for generation
        self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX( np.zeros((1,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # tanh squashing keeps the effective log-variance in (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # Step function for theano.scan. The first two arguments are the
            # per-step slices of the noise sequences, the last two are the
            # x/z values carried over from the previous step. The incoming
            # zi_fw is not read before being overwritten below.

            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            # NOTE(review): named "nll" but assigned the direct result of
            # log_prob_gaussian2 -- confirm the intended sign.
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            # (same call, with the same input, as the one producing
            # xi_bw_mean/xi_bw_logvar above)
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: mean used to sample xi_fw
            #   xi_fw_logvar: log-variance used to sample xi_fw
            #   zi_fw_mean: mean used to sample zi_fw
            #   zi_fw_logvar: log-variance used to sample zi_fw
            #   nll_xi_bw: value computed for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: value computed for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op; only the first two outputs
        # (xi_fw, zi_fw) are fed back into the next step
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned in this method (only
        # self.zi_zmuv is) -- confirm where it is supposed to come from.
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the walk-out loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): self.s0, self.step_scales, self.p_zi_given_xi,
        # self.p_sip1_given_zi, self.p_x_given_si and self.q_zi_given_xi are
        # not assigned in this method; the networks stored above are
        # self.p_z_given_x and self.p_x_given_z -- confirm the intended set.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this method; the scan
        # above stores self.nll_xi_bw and self.nll_zi_bw -- confirm.
        self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold in the updates returned by theano.scan so they are applied
        # together with the parameter updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return
Esempio n. 4
0
    def __init__(self, rng=None, \
            Xd=None, Xc=None, Xm=None, \
            g_net=None, i_net=None, \
            data_dim=None, prior_dim=None, \
            params=None, shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))
        if params is None:
            self.params = {}
        else:
            self.params = params

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this GIPair
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        # check whether we'll be working with "encoded" inputs
        self.use_encoder = i_net.use_encoder
        print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format( \
                str(i_net.use_encoder), str(g_net.use_decoder)))
        assert(self.use_encoder == g_net.use_decoder)
        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=apply_mask(self.Xd, self.Xc, self.Xm))
        self.posterior_means = self.IN.output_mean
        self.posterior_sigmas = self.IN.output_sigma
        self.posterior_norms = T.sqrt(T.sum(self.posterior_means**2.0, axis=1, keepdims=1))
        self.posterior_klds = self.IN.kld_cost
        self.kld2_scale = self.IN.kld2_scale
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # capture a handle for sampled reconstructions from the generator
        self.Xg = self.GN.output

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # determine whether this GIPair is a clone or an original
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True

        if not self.is_clone:
            # shared var learning rate for generator and inferencer
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.lr_gn = theano.shared(value=zero_ary, name='gip_lr_gn')
            self.lr_in = theano.shared(value=zero_ary, name='gip_lr_in')
            # shared var momentum parameters for generator and inferencer
            self.mom_1 = theano.shared(value=zero_ary, name='gip_mom_1')
            self.mom_2 = theano.shared(value=zero_ary, name='gip_mom_2')
            self.it_count = theano.shared(value=zero_ary, name='gip_it_count')
            # init parameters for controlling learning dynamics
            self.set_all_sgd_params()
            # init shared var for weighting nll of data given posterior sample
            self.lam_nll = theano.shared(value=zero_ary, name='gip_lam_nll')
            self.set_lam_nll(lam_nll=1.0)
            # init shared var for weighting prior kld against reconstruction
            self.lam_kld = theano.shared(value=zero_ary, name='gip_lam_kld')
            self.set_lam_kld(lam_kld=1.0)
            # init shared var for controlling l2 regularization on params
            self.lam_l2w = theano.shared(value=zero_ary, name='gip_lam_l2w')
            self.set_lam_l2w(1e-4)
            # record shared parameters that are to be shared among clones
            self.shared_param_dicts['gip_lr_gn'] = self.lr_gn
            self.shared_param_dicts['gip_lr_in'] = self.lr_in
            self.shared_param_dicts['gip_mom_1'] = self.mom_1
            self.shared_param_dicts['gip_mom_2'] = self.mom_2
            self.shared_param_dicts['gip_it_count'] = self.it_count
            self.shared_param_dicts['gip_lam_nll'] = self.lam_nll
            self.shared_param_dicts['gip_lam_kld'] = self.lam_kld
            self.shared_param_dicts['gip_lam_l2w'] = self.lam_l2w
        else:
            # use some shared parameters that are shared among all clones of
            # some "base" GIPair
            self.lr_gn = self.shared_param_dicts['gip_lr_gn']
            self.lr_in = self.shared_param_dicts['gip_lr_in']
            self.mom_1 = self.shared_param_dicts['gip_mom_1']
            self.mom_2 = self.shared_param_dicts['gip_mom_2']
            self.it_count = self.shared_param_dicts['gip_it_count']
            self.lam_nll = self.shared_param_dicts['gip_lam_nll']
            self.lam_kld = self.shared_param_dicts['gip_lam_kld']
            self.lam_l2w = self.shared_param_dicts['gip_lam_l2w']

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
        self.post_kld_cost = self.lam_kld[0] * \
                self._construct_post_kld_cost(kld2_scale=self.kld2_scale)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()
        self.compute_costs = self._construct_compute_costs()
        self.compute_ll_bound = self._construct_compute_ll_bound()
        self.compute_post_stats = self._construct_compute_post_stats()
        return
    def __init__(self, rng=None, x_in=None,
                 p_x_given_z=None, q_z_given_x=None,
                 x_dim=None, z_dim=None,
                 params=None):
        """Assemble the graph, objective, updates and compiled functions
        for this OneStageModel.

        Arguments, as they are used by this constructor:
            rng: object exposing ``randint``; a single draw from it seeds
                the model's private RandStream.
            x_in: symbolic variable handed to ``q_z_given_x.apply`` and used
                as the input of ``transform_x_to_z``.
            p_x_given_z: generator network; this constructor uses its
                ``apply``, ``mlp_params`` and ``output_layers``.
            q_z_given_x: inference network; this constructor uses its
                ``apply``, ``mlp_params`` and ``shared_layers``.
            x_dim: recorded as ``self.x_dim``.
            z_dim: recorded as ``self.z_dim``.
            params: optional dict of settings. ``'x_type'`` is mandatory and
                must be ``'bernoulli'`` or ``'gaussian'`` (so leaving
                ``params`` as None ends in a KeyError). ``'xt_transform'``
                may be ``'sigmoid'`` or ``'none'`` (sigmoid when absent) and
                ``'logvar_bound'`` falls back to 10.0 when absent.
        """
        # private random stream, seeded by one draw from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # user settings; a missing dict is treated like an empty one
        self.params = {} if params is None else params

        # transform applied to the raw generator output for non-bernoulli
        # observations: identity only when 'none' is explicitly requested
        xt_kind = self.params.get('xt_transform', 'sigmoid')
        assert xt_kind in ('sigmoid', 'none')
        if xt_kind == 'none':
            self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)

        # magnitude limit used when squashing the output log-variance
        self.logvar_bound = self.params.get('logvar_bound', 10.0)

        # x_type selects a bernoulli or a gaussian model for observations
        self.x_type = self.params['x_type']
        assert self.x_type in ('bernoulli', 'gaussian')

        # dimensions of the spaces this model works with
        self.x_dim, self.z_dim = x_dim, z_dim

        # mean / log-variance of the isotropic Gaussian prior over z
        self.prior_mean = 0.0
        self.prior_logvar = 0.0

        # symbolic variable that feeds the computation graph built below
        self.x_in = x_in

        #####################################################################
        # Build the graph that yields the quantities used by the objective  #
        #####################################################################
        # inference network: observations -> mean and log-variance of z
        self.q_z_given_x = q_z_given_x
        q_stats = self.q_z_given_x.apply(self.x_in)
        self.z_mean, self.z_logvar = q_stats
        # latent samples, obtained by reparametrizing ZMUV Gaussian noise
        self.z = reparametrize(self.z_mean, self.z_logvar, rng=self.rng)

        # generator network: latent samples -> raw observation-space output
        self.p_x_given_z = p_x_given_z
        self.xt, _ = self.p_x_given_z.apply(self.z)

        # final generator output: bernoulli models always use a sigmoid,
        # every other case goes through the configured xt_transform
        if self.x_type != 'bernoulli':
            self.xg = self.xt_transform(self.xt)
        else:
            self.xg = T.nnet.sigmoid(self.xt)

        # learnable log-variance of the output distribution, squashed by
        # tanh so that it stays within +/- self.logvar_bound
        zero_ary = to_fX(np.zeros((1,)))
        self.output_logvar = theano.shared(value=zero_ary,
                                           name='osm_output_logvar')
        squashed = T.tanh(self.output_logvar[0] / self.logvar_bound)
        self.bounded_logvar = self.logvar_bound * squashed

        ######################################################################
        # Every symbolic variable needed by the objective now exists.        #
        ######################################################################
        zero_ary = to_fX(np.zeros((1,)))

        def new_hyper(name):
            # one-element shared variable that starts out at zero
            return theano.shared(value=zero_ary, name=name)

        # learning rate and the two momentum terms, then their real values
        self.lr = new_hyper('osm_lr')
        self.mom_1 = new_hyper('osm_mom_1')
        self.mom_2 = new_hyper('osm_mom_2')
        self.set_sgd_params()
        # weight on the NLL of the data given a posterior sample
        self.lam_nll = new_hyper('osm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on KL(q(z|x) || p(z))
        self.lam_kld = new_hyper('osm_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on the l2 regularization of the parameters
        self.lam_l2w = new_hyper('osm_lam_l2w')
        self.set_lam_l2w(1e-4)

        # everything to optimize: the output log-variance followed by the
        # parameters of the inference and generator networks (in that order)
        self.joint_params = [self.output_logvar]
        for net in (self.q_z_given_x, self.p_x_given_z):
            self.joint_params.extend(net.mlp_params)

        ###################################
        # Costs that make up the objective #
        ###################################
        # weighted NLL terms and their mean
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
        self.nll_cost = T.mean(self.nll_costs)
        # weighted KLd terms and their mean
        self.kld_costs = self.lam_kld[0] * self._construct_kld_costs()
        self.kld_cost = T.mean(self.kld_costs)
        # weighted parameter regularization
        self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
        # the joint cost is the plain sum of the three parts
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # gradient of the joint cost with respect to each parameter
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        all_grads = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, all_grads):
            self.joint_grads[param] = grad

        # ADAM updates covering all parameters at once
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # compiled helpers for training, evaluation and sampling
        print("Compiling self.train_joint...")
        self.train_joint = self._construct_train_joint()
        print("Compiling self.compute_fe_terms...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling self.compute_post_klds...")
        self.compute_post_klds = self._construct_compute_post_klds()
        print("Compiling self.sample_from_prior...")
        self.sample_from_prior = self._construct_sample_from_prior()
        # x -> posterior mean of z, and z -> generator output
        self.transform_x_to_z = theano.function(inputs=[self.x_in],
                                                outputs=self.z_mean)
        self.transform_z_to_x = theano.function(inputs=[self.z],
                                                outputs=self.xg)
        # handles to the first inference-layer weights and to the last
        # generator output-layer weights
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_weights = self.p_x_given_z.output_layers[-1].W
    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_h_given_z=None,
                 p_x_given_h=None,
                 q_z_given_x=None,
                 q_h_given_z_x=None,
                 x_dim=None,
                 z_dim=None,
                 h_dim=None,
                 params=None,
                 shared_param_dicts=None):
        """Assemble the graph, objective, updates and compiled functions
        for this two-stage model (shared variables are prefixed ``tsm_``).

        Arguments, as they are used by this constructor:
            rng: object exposing ``randint``; a single draw from it seeds
                the model's private RandStream.
            x_in: symbolic variable handed to ``q_z_given_x.apply``.
            x_out: symbolic variable stacked into the input of
                ``q_h_given_z_x`` and passed to ``_construct_nll_costs``.
            p_h_given_z, p_x_given_h, q_z_given_x, q_h_given_z_x: the four
                networks; this constructor uses their ``apply`` methods and
                their ``mlp_params``.
            x_dim, h_dim: recorded as ``self.x_dim`` / ``self.h_dim``.
            z_dim: recorded, and used as the width of the ``(1, z_dim)``
                prior mean / log-variance vectors when those are created.
            params: dict of settings (required; it is indexed directly).
                ``'x_type'`` must be ``'bernoulli'`` or ``'gaussian'``.
                ``'obs_transform'`` may be ``'sigmoid'`` or ``'none'``
                (sigmoid when absent); bernoulli models always get sigmoid.
            shared_param_dicts: when None, fresh shared variables are made
                for ``'p_z_mean'``, ``'p_z_logvar'`` and ``'obs_logvar'`` and
                stored in a new dict; otherwise they are read from the
                given dict under those keys.
        """
        # private random stream, seeded by one draw from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # user settings: observation type first, then the obs transform
        self.params = params
        self.x_type = self.params['x_type']
        assert self.x_type in ('bernoulli', 'gaussian')
        obs_kind = self.params.get('obs_transform', 'sigmoid')
        assert obs_kind in ('sigmoid', 'none')
        # identity is used only for a non-bernoulli model that asked for
        # 'none'; every other combination ends up with a sigmoid
        if (self.x_type == 'bernoulli') or (obs_kind == 'sigmoid'):
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
        self.shared_param_dicts = shared_param_dicts

        # dimensions of the spaces this model works with
        self.x_dim, self.z_dim, self.h_dim = x_dim, z_dim, h_dim

        # handles to the four underlying networks
        self.q_z_given_x = q_z_given_x
        self.q_h_given_z_x = q_h_given_z_x
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # symbolic variables that feed the computation graph built below
        self.x_in = x_in
        self.x_out = x_out

        # switch used to move between training and sampling behaviour
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is not None:
            # reuse the model-specific parameters supplied by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        else:
            # create the model-specific parameters and publish them in a
            # fresh dict
            init_vec = to_fX(np.zeros((1, self.z_dim)))
            self.p_z_mean = theano.shared(value=init_vec,
                                          name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec,
                                            name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='tsm_obs_logvar')
            self.shared_param_dicts = {
                'p_z_mean': self.p_z_mean,
                'p_z_logvar': self.p_z_logvar,
                'obs_logvar': self.obs_logvar,
            }
        # tanh squashing keeps the observation log-variance within +/- 8
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ###########################################
        # Main computation of the two-stage model #
        ###########################################
        print("Building TSM...")

        def switch_mix(from_q, from_p):
            # weight from_q by train_switch and from_p by (1 - train_switch)
            return (self.train_switch[0] * from_q) + \
                   ((1.0 - self.train_switch[0]) * from_p)

        # first latent stage: z drawn under q and under the prior p
        z_q_mean, z_q_logvar, z_q = self.q_z_given_x.apply(
            self.x_in, do_samples=True)
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)
        z_p = (T.exp(0.5*z_p_logvar) * zmuv) + z_p_mean
        self.z = switch_mix(z_q, z_p)
        # KLds between q and p for z, in both directions
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)

        # second latent stage: h drawn under p (given z) and under q, where
        # q sees p's mean / log-variance stacked together with x_out
        h_p_mean, h_p_logvar, h_p = self.p_h_given_z.apply(self.z)
        q_h_input = T.horizontal_stack(h_p_mean, h_p_logvar, self.x_out)
        h_q_mean, h_q_logvar, h_q = self.q_h_given_z_x.apply(q_h_input)
        self.h = switch_mix(h_q, h_p)
        # KLds between q and p for h, in both directions
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)

        # p_x_given_h produces an observation from the hidden state h
        self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

        ######################################################################
        # Every symbolic variable needed by the objective now exists.        #
        ######################################################################
        zero_ary = to_fX(np.zeros((1,)))

        def new_hyper(name):
            # one-element shared variable that starts out at zero
            return theano.shared(value=zero_ary, name=name)

        # learning rate and the two momentum terms, then their real values
        self.lr = new_hyper('tsm_lr')
        self.mom_1 = new_hyper('tsm_mom_1')
        self.mom_2 = new_hyper('tsm_mom_2')
        self.set_sgd_params()
        # weight on the NLL of the data given a posterior sample
        self.lam_nll = new_hyper('tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weights on the two KLd directions
        self.lam_kld_q2p = new_hyper('tsm_lam_kld_q2p')
        self.lam_kld_p2q = new_hyper('tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # weight on the l2 regularization of the parameters
        self.lam_l2w = new_hyper('tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # parameters owned by this model: only obs_logvar is optimized here
        # (p_z_mean and p_z_logvar are not part of the list)
        self_params = [self.obs_logvar]
        # parameters owned by the underlying networks, in a fixed order
        child_params = []
        for net in (self.q_z_given_x, self.q_h_given_z_x,
                    self.p_h_given_z, self.p_x_given_h):
            child_params.extend(net.mlp_params)
        # joint list of everything to optimize
        self.joint_params = self_params + child_params

        ###################
        # KLd-based costs #
        ###################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        # per-input cost: both stages summed over axis 1
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # mean over inputs
        self.kld_cost = T.mean(self.kld_costs)
        ###################
        # NLL-based costs #
        ###################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ##########################
        # Rest of the joint cost #
        ##########################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##################
        # Per-input cost #
        ##################
        self.obs_costs = self.nll_costs + self.kld_costs

        # gradient of the joint cost with respect to each parameter
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        all_grads = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, all_grads):
            self.joint_grads[param] = grad

        # ADAM updates for all parameters, copied into an OrderedDict
        all_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict(
            (key, all_updates[key]) for key in all_updates)

        # compiled helpers for training, evaluation and sampling
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s0_given_z=None, \
            p_hi_given_si=None, \
            p_sip1_given_si_hi=None, \
            q_z_given_x=None, \
            q_hi_given_x_si=None, \
            obs_dim=None, \
            z_dim=None, h_dim=None, \
            ir_steps=4, params=None, \
            shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            init_vec = to_fX( np.zeros((self.obs_dim,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction log likelihood
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_gaussian2(xo, xh, \
                     log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                      self.p_z_mean, self.p_z_logvar)
        self.init_nlls =  -1.0 * \
                self.log_prob_func(self.x_out, self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                    T.horizontal_stack(grad_ll, sim1_obs), \
                    do_samples=False)
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # p_sip1_given_si_hi is conditioned on si and  hi.
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
                    
            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        init_values = [self.s0, None, None, None]

        self.scan_results, self.scan_updates = theano.scan(ir_step_func, \
                outputs_info=init_values, sequences=self.hi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                          self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.q_updates = get_adam_updates(params=self.q_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(params=self.p_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.q_updates:
            self.joint_updates[k] = self.q_updates[k]
        for k in self.p_updates:
            self.joint_updates[k] = self.p_updates[k]
        # add scan updates, which seem to be required
        for k in self.scan_updates:
            self.joint_updates[k] = self.scan_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        return
Esempio n. 8
0
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s_given_z=None, \
            p_h_given_s=None, \
            p_x_given_s_h=None, \
            q_z_given_x=None, \
            q_h_given_x_s=None, \
            x_dim=None, \
            z_dim=None, \
            s_dim=None, \
            h_dim=None, \
            params=None, \
            shared_param_dicts=None):
        """Build the symbolic graph, costs, updates and compiled functions.

        Parameters
        ----------
        rng : object with a ``randint`` method
            Used once to draw a seed for this model's own ``RandStream``.
        x_in : symbolic variable
            Input that is dropout-masked and passed to ``q_z_given_x``.
        x_out : symbolic variable
            Passed to ``q_h_given_x_s`` (stacked with ``s``) and to
            ``_construct_nll_costs``. Its shape is also used to size the
            dropout mask applied to ``x_in``.
        p_s_given_z, p_h_given_s, p_x_given_s_h : network objects
            Generative networks; each must provide ``apply(...)`` and
            ``mlp_params``. ``p_x_given_s_h`` must also provide
            ``mu_layers``.
        q_z_given_x, q_h_given_x_s : network objects
            Inference networks; each must provide ``apply(...)`` and
            ``mlp_params``.
        x_dim, z_dim, s_dim, h_dim : int
            Recorded on the instance. ``z_dim`` also sizes the learned
            prior parameters ``p_z_mean`` / ``p_z_logvar``.
        params : dict
            Must contain ``'x_type'`` (``'bernoulli'`` or ``'gaussian'``).
            May contain ``'obs_transform'`` (``'sigmoid'`` or ``'none'``).
        shared_param_dicts : dict or None
            When None, fresh shared variables are created and stored in a
            new dict. Otherwise the dict must hold ``'p_z_mean'``,
            ``'p_z_logvar'`` and ``'obs_logvar'``, which are reused.
        """
        # setup a private random stream for this model
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            # bernoulli observations always use the sigmoid transform,
            # overriding any 'obs_transform' setting
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.s_dim = s_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_x_s = q_h_given_x_s
        self.p_s_given_z = p_s_given_z
        self.p_h_given_s = p_h_given_s
        self.p_x_given_s_h = p_x_given_s_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.batch_reps = T.lscalar()

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this model
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            # the observation log-variance is a single shared scalar-like
            # (length-1) value, not one value per observed dimension
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the parameters provided by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the observation log-variance into the range (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # get a drop mask that drops things with probability p, and rescales
        # the surviving entries by 1 / (1 - p)
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "first" latent state
        # NOTE(review): the mask is sized from x_out but applied to x_in, so
        # this presumably assumes both have the same shape -- confirm.
        drop_x = drop_mask * self.x_in
        z_q_mean, z_q_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # compute relevant KLds for this step (both directions)
        self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar, \
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \
                                       z_q_mean, z_q_logvar)
        # transform "first" latent state into "second" latent state
        self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)

        # get samples of h, conditioned on current s
        h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply( \
                self.s, do_samples=True)
        # get variational samples of h, given s and x_out
        h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply( \
                T.horizontal_stack(self.x_out, self.s), \
                do_samples=True)

        # make h samples that can be switched between h_p and h_q
        # (train_switch == 1 selects h_q, train_switch == 0 selects h_p)
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)

        # compute relevant KLds for this step (both directions)
        self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar, \
                                       h_p_mean, h_p_logvar)
        self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar, \
                                       h_q_mean, h_q_logvar)

        # p_x_given_s_h is conditioned on s and h.
        self.x_gen, _ = self.p_x_given_s_h.apply( \
                T.horizontal_stack(self.s, self.h), \
                do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        # (the inference networks; trained with learning rate lr_1)
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        # (the prior and generative networks; trained with lr_2)
        self.group_2_params = [self.p_z_mean, self.p_z_logvar]
        self.group_2_params.extend(self.p_s_given_z.mlp_params)
        self.group_2_params.extend(self.p_h_given_s.mlp_params)
        self.group_2_params.extend(self.p_x_given_s_h.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_h_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        # Construct the updates for the generator and inferencer networks
        self.group_1_updates = get_adam_updates(params=self.group_1_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.group_2_updates = get_adam_updates(params=self.group_2_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge both groups; group 2 entries win on any key collision
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.group_1_updates)
        self.joint_updates.update(self.group_2_updates)

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        # make easy access points for some interesting parameters
        self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W
        return
def test_with_model_init():
    """Build an IMoESDrawModels instance and train it on binarized MNIST.

    The function loads data from ``./data/``, assembles the model's
    component networks, compiles Theano training / evaluation / sampling
    functions and then runs 250000 minibatch updates.

    Side effects: appends cost summaries to ``TBM_ES_RESULTS.txt`` (the file
    is truncated on entry) and, every 1000 updates, writes PNG grids of
    model samples named ``TBM-ES-samples-b<batch>-<step>.png`` to the
    current directory. Returns None.
    """
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
    del Xte
    tr_samples = Xtr.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 220
    enc_dim = 260
    dec_dim = 260
    mix_dim = 20
    z_dim = 100
    n_iter = 18

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup the reader and writer
    read_dim = 2*x_dim
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)

    # setup the mixture weight sampler
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim)], \
                      name="mix_dec_mlp", **inits)
    # setup the components of the generative DRAW model
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                        name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [               z_dim, 4*dec_dim], \
                        name="dec_mlp_in", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    dec_mlp_out = CondNet([], [dec_dim, z_dim], name="dec_mlp_out", **inits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)
    enc_mlp_stop = MLP([Tanh(), None], [(x_dim + dec_dim), 500, 1], \
                       name="enc_mlp_stop", **inits)
    dec_mlp_stop = MLP([Tanh(), None], [dec_dim, 500, 1], \
                       name="dec_mlp_stop", **inits)

    draw = IMoESDrawModels(
                n_iter,
                step_type='add', # step_type can be 'add' or 'jump'
                mix_enc_mlp=mix_enc_mlp,
                mix_dec_mlp=mix_dec_mlp,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                enc_mlp_in=enc_mlp_in,
                enc_mlp_out=enc_mlp_out,
                enc_rnn=enc_rnn,
                enc_mlp_stop=enc_mlp_stop,
                dec_mlp_in=dec_mlp_in,
                dec_mlp_out=dec_mlp_out,
                dec_rnn=dec_rnn,
                dec_mlp_stop=dec_mlp_stop)
    draw.initialize()

    # some symbolic vars to represent various inputs/outputs
    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')

    # collect reconstructions of x produced by the IMoDRAW model
    vfe_cost, cost_all = draw.reconstruct(x_in_sym, x_out_sym)

    # grab handles for all the optimizable parameters in our cost
    cg = ComputationGraph([vfe_cost])
    joint_params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = (1e-5 * sum([T.sum(p**2.0) for p in joint_params]))
    reg_term.name = "reg_term"

    # compute the full cost w.r.t. which we will optimize
    total_cost = vfe_cost + reg_term
    total_cost.name = "total_cost"

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of total_cost...")
    grad_list = T.grad(total_cost, joint_params)
    joint_grads = OrderedDict(zip(joint_params, grad_list))

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    lr_shared = theano.shared(value=zero_ary, name='tbm_lr')
    # shared var momentum parameters for generator and inferencer
    mom_1_shared = theano.shared(value=zero_ary, name='tbm_mom_1')
    mom_2_shared = theano.shared(value=zero_ary, name='tbm_mom_2')
    # construct the updates for the generator and inferencer networks
    joint_updates = get_adam_updates(params=joint_params, \
            grads=joint_grads, alpha=lr_shared, \
            beta1=mom_1_shared, beta2=mom_2_shared, \
            mom2_init=1e-4, smoothing=1e-6, max_grad_norm=10.0)

    # collect the outputs to return from this function
    outputs = [total_cost, vfe_cost, reg_term]
    # compile the theano function
    print("Compiling model training/update function...")
    train_joint = theano.function(inputs=[ x_in_sym, x_out_sym ], \
                                  outputs=outputs, updates=joint_updates)
    print("Compiling NLL bound estimator function...")
    compute_nll_bound = theano.function(inputs=[ x_in_sym, x_out_sym], \
                                        outputs=outputs)
    print("Compiling model sampler...")
    n_samples = T.iscalar("n_samples")
    samples = draw.sample(n_samples)
    do_sample = theano.function([n_samples], outputs=samples, allow_input_downcast=True)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    costs = [0. for i in range(10)]
    # number of minibatches folded into `costs` since the last report
    cost_batches = 0
    learn_rate = 0.0002
    momentum = 0.9
    # start past the end of the training set so that the very first
    # iteration triggers a shuffle and resets the batch indices
    fresh_idx = np.arange(batch_size) + tr_samples
    # text mode: only str is written; `with` guarantees the log is closed
    # even if training is interrupted by an exception
    with open("TBM_ES_RESULTS.txt", 'w') as out_file:
        for i in range(250000):
            # linearly warm up learning rate and momentum over 2500 updates
            scale = min(1.0, ((i+1) / 2500.0))
            if (((i + 1) % 10000) == 0):
                learn_rate = learn_rate * 0.95
            # get the indices of training samples for this batch update
            fresh_idx += batch_size
            if (np.max(fresh_idx) >= tr_samples):
                # we finished an "epoch", so we rejumble the training set
                Xtr = row_shuffle(Xtr)
                fresh_idx = np.arange(batch_size)
            batch_idx = fresh_idx
            # set sgd and objective function hyperparams for this update
            zero_ary = np.zeros((1,))
            lr_shared.set_value(to_fX(zero_ary + scale*learn_rate))
            mom_1_shared.set_value(to_fX(zero_ary + scale*momentum))
            mom_2_shared.set_value(to_fX(zero_ary + 0.99))

            # perform a minibatch update and record the cost for this batch
            Xb = to_fX( Xtr.take(batch_idx, axis=0) )
            result = train_joint(Xb, Xb)
            # aggregate costs over multiple minibatches
            costs = [(costs[j] + result[j]) for j in range(len(result))]
            cost_batches += 1
            if ((i % 200) == 0):
                # occasionally dump information about the costs; average
                # over the batches actually accumulated (1 on the first
                # report, 200 afterwards)
                costs = [(v / float(cost_batches)) for v in costs]
                str1 = "-- batch {0:d} --".format(i)
                str2 = "    total_cost: {0:.4f}".format(costs[0])
                str3 = "    nll_bound : {0:.4f}".format(costs[1])
                str4 = "    reg_term  : {0:.4f}".format(costs[2])
                joint_str = "\n".join([str1, str2, str3, str4])
                print(joint_str)
                out_file.write(joint_str+"\n")
                out_file.flush()
                costs = [0.0 for v in costs]
                cost_batches = 0
            if ((i % 1000) == 0):
                # compute a small-sample estimate of NLL bound on validation set
                Xva = row_shuffle(Xva)
                Xb = to_fX(Xva[:5000])
                va_costs = compute_nll_bound(Xb, Xb)
                str1 = "    va_nll_bound : {}".format(va_costs[1])
                joint_str = "\n".join([str1])
                print(joint_str)
                out_file.write(joint_str+"\n")
                out_file.flush()
                # draw some independent samples from the model
                samples = do_sample(16*16)
                n_steps, n_imgs, _ = samples.shape
                # NOTE(review): assumes each sample is a flattened 28x28 image
                samples = samples.reshape( (n_steps, n_imgs, 28, 28) )
                for j in range(n_steps):
                    img = img_grid(samples[j,:,:,:])
                    img.save("TBM-ES-samples-b%06d-%03d.png" % (i, j))
Esempio n. 10
0
    def __init__(self, rng=None, Xd=None, \
            g_net=None, i_net=None, pn_seq=None, \
            data_dim=None, prior_dim=None, \
            params=None):
        """Wire an inferencer/generator pair to a target PeaNetSeq.

        Parameters
        ----------
        rng : object with a ``randint`` method
            Used to seed this object's ``RandStream`` and forwarded to the
            ``shared_param_clone`` calls.
        Xd : symbolic variable
            Data input for the inferencer clone and the PeaNetSeq clone.
        g_net : generator network
            Must provide ``out_type``, ``shared_param_clone`` and, on the
            clone, ``output``, ``output_samples``, ``mlp_layers``,
            ``mlp_params`` and ``sample_from_model``.
        i_net : inferencer network
            Must provide ``shared_param_clone`` and, on the clone,
            ``output``, ``output_mean``, ``output_logvar``,
            ``shared_layers``, ``mu_layers``, ``sigma_layers`` and
            ``mlp_params``.
        pn_seq : PeaNetSeq
            Cloned with ``seq_len=2``; the clone's ``joint_cost`` is
            differentiated with respect to ``Xd``.
        data_dim : int
            Must equal the generator's output size and the inferencer's
            input size (asserted).
        prior_dim : int
            Must equal the generator's input size and the inferencer's
            mu/sigma output sizes (asserted).
        params : dict or None
            Optional settings. Key ``'match_type'`` may be ``'grad_dir'``
            or ``'grad_sign'``; it defaults to ``'grad_sign'``.
        """
        # setup a rng for this AEDPair
        self.rng = RandStream(rng.randint(100000))

        if (params is None):
            self.params = {}
        else:
            self.params = params
        # look the option up in self.params (never None) rather than in the
        # raw argument, so that the default params=None does not raise
        if 'match_type' in self.params:
            self.match_type = self.params['match_type']
        else:
            self.match_type = 'grad_sign'
        # we can only try to match sign or direction...
        assert((self.match_type == 'grad_dir') or \
                (self.match_type == 'grad_sign'))
        if self.match_type == 'grad_dir':
            # we match the direction of the gradient under the assumption
            # of gaussian observation noise
            self.mean_transform = lambda x: max_normalize(x, axis=1)
            assert(g_net.out_type == 'gaussian')
        else:
            # we match the sign of the gradient as if it were a collection
            # of independent binary variables
            self.mean_transform = lambda x: 2.0 * (x - 0.5)
            assert(g_net.out_type == 'bernoulli')

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this AEDPair
        self.Xd = Xd
        self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq
        # all-zero Xc / Xm inputs with the same shape as Xd
        self.Xc = 0.0 * self.Xd
        self.Xm = 0.0 * self.Xd
        self.obs_count = T.cast(Xd.shape[0], 'floatX')

        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        self.policy_mean = self.IN.output_mean
        self.policy_logvar = self.IN.output_logvar
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # set up a var for controlling the max-norm bound on perturbations
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lam_mnb = theano.shared(value=zero_ary, \
                name='adp_lam_mnb')
        self.set_lam_mnb(lam_mnb=0.1)

        # get the perturbations output by the generator network
        self.Pg = self.mean_transform(self.GN.output)
        if self.match_type == 'grad_dir':
            # samples because we're matching gradient via squared error
            self.Pg_samples = self.mean_transform(self.GN.output_samples)
        else:
            # no samples, because we're matching gradient sign
            self.Pg_samples = self.mean_transform(self.GN.output)

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # make a clone of the target PeaNetSeq that takes perturbed inputs
        self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \
                seq_Xd=[self.Xd, self.Xd], seq_Yd=[self.Yd, self.Yd], \
                no_funcs=True)
        self.grad_pea_Xd = T.grad(self.PNS.joint_cost, self.Xd)
        if self.match_type == 'grad_dir':
            # turn gradient into a unit max-normalized vector
            self.match_target = max_normalize(self.grad_pea_Xd)
        else:
            # transform gradient into binary indicators of sign
            self.match_target = (self.grad_pea_Xd > 0.0)
        # get the symbolic vars for passing inputs to self.PNS
        self.Xd_seq = self.PNS.Xd_seq
        self.Yd_seq = self.PNS.Yd_seq
        self.seq_inputs = self.Xd_seq + self.Yd_seq

        # shared var learning rate for generator and inferencer
        self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting the adversarial matching cost
        self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
        self.set_lam_adv(lam_adv=1.0)
        # init shared vars for weighting a penalty on the norms of our learned
        # policies and a reward to encourage maximizing their entropy.
        self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
        self.set_lam_kld(lam_kld=0.1)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
        self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.adv_cost + self.kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters,
        # clipping every gradient element to [-0.1, 0.1]
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        # merge both update sets; inferencer entries are written last and
        # therefore win on any shared key (e.g. it_count, if present)
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self.GN.sample_from_model
        self.sample_from_Xd = self._construct_sample_from_Xd()
        return
Esempio n. 11
0
    def __init__(self, rng=None, \
            x_in=None, y_in=None, \
            q_z_given_x=None, \
            class_count=None, \
            z_dim=None, \
            use_samples=None):
        """Build the symbolic graph, costs, updates and compiled functions.

        A linear classification layer is placed on top of the output of
        ``q_z_given_x.apply_shared(x_in)`` (after a dropout-style mask), and a
        joint cost made of an NLL term, weighted KL terms between the
        inferencer's posterior and a learned prior, and an L2 term is built.

        Args:
            rng: random generator; only ``rng.randint`` is used here, to seed
                this object's RandStream.
            x_in: symbolic input passed to ``q_z_given_x``.
            y_in: symbolic targets passed to ``self._construct_nll_costs``.
            q_z_given_x: inference network; this constructor reads its
                ``shared_layers`` and ``mlp_params`` and calls its ``apply``
                and ``apply_shared`` methods.
            class_count: number of columns of the classification weights.
            z_dim: length of the prior mean / log-variance vectors.
            use_samples: stored on ``self.use_samples``; not otherwise read in
                this constructor (the branch that used it is commented out).
        """
        # setup a rng for this model, seeded from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # record the dimensions of various spaces relevant to this model
        self.class_count = class_count
        self.z_dim = z_dim
        # width of the last shared layer; this is the input width of the
        # classification layer created below
        self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
        self.use_samples = use_samples

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.y_in = y_in

        # length-1 zero array used as the initial value of shared scalars
        # NOTE(review): the same array is handed to several theano.shared
        # calls; presumably each call copies it (borrow defaults to False),
        # so the shared variables do not alias -- confirm.
        zero_ary = to_fX( np.zeros((1,)) )
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='cm_drop_rate')
        self.set_drop_rate(0.0)

        # initialize classification layer parameters
        init_mat = to_fX(0.01 * npr.randn(self.shared_dim, self.class_count))
        init_vec = to_fX( np.zeros((self.class_count,)) )
        self.W_class = theano.shared(value=init_mat, name='cm_W_class')
        self.b_class = theano.shared(value=init_vec, name='cm_b_class')
        # initialize "optimizable" parameters specific to this CM
        # (mean and log-variance of the prior over z, both starting at zero)
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='cm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='cm_p_z_logvar')

        #################
        # Setup self.z. #
        #################
        self.q_z_mean, self.q_z_logvar, self.q_z_samples = \
                self.q_z_given_x.apply(self.x_in, do_samples=True)
        # NOTE: this overwrites the samples returned just above; from here on
        # self.q_z_samples is the output of apply_shared, which is what the
        # classifier consumes (W_class has self.shared_dim rows).
        self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)

        # get a drop mask that drops things with probability p:
        # an entry survives when its uniform draw exceeds drop_rate, and
        # survivors are scaled by 1 / (1 - drop_rate)
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.q_z_samples.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        # get a droppy version of either z mean or z samples
        # (the use_samples switch is currently disabled; the masked
        # apply_shared output is always used)
        # if self.use_samples:
        #     self.z = self.q_z_samples * drop_mask
        # else:
        #     self.z = self.q_z_mean * drop_mask
        self.z = self.q_z_samples * drop_mask

        # compute class predictions (affine map of z; no nonlinearity here)
        self.y_out = T.dot(self.z, self.W_class) + self.b_class

        # compute KLds for training via variational free-energy
        # (both argument orders: posterior-vs-prior and prior-vs-posterior)
        self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \
                                       self.q_z_mean, self.q_z_logvar)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rates
        # NOTE: only lr_1 is passed to get_adam_updates below; lr_2 is created
        # but not used in this constructor.
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='cm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='cm_lr_2')
        # shared var momentum parameters (passed as beta1/beta2 below)
        self.mom_1 = theano.shared(value=zero_ary, name='cm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='cm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='cm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='cm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='cm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='cm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters: the prior, the
        # classification layer, and everything in the inferencer
        self.joint_params = [self.p_z_mean, self.p_z_logvar, \
                             self.W_class, self.b_class]
        self.joint_params.extend(self.q_z_given_x.mlp_params)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # per-input costs are weighted by lam_nll before averaging
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
        self.nll_cost = T.mean(self.nll_costs)
        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                         (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_cost = T.mean(self.kld_costs)
        ##################################
        # CONSTRUCT THE FINAL JOINT COST #
        ##################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        # (excludes the parameter regularization term)
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the model parameters
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # Construct the compiled functions exposed by this model
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling class error estimator...")
        self.class_error = self._construct_class_error()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        # make easy access points for some interesting parameters
        # (weights of the inferencer's first shared layer)
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        return
Esempio n. 12
0
    def __init__(self, rng=None, x_d=None, x_t=None, \
                 i_net=None, g_net=None, d_net=None, \
                 chain_len=None, data_dim=None, z_dim=None, \
                 params=None):
        """Build the self-looped VAE chain, the discriminator and all costs.

        A OneStageModel built from ``i_net``/``g_net`` is unrolled with
        ``theano.scan`` into a chain of ``chain_len`` generated samples. The
        generated samples are stacked under ``x_t`` and fed to a clone of
        ``d_net``. Separate costs are built for the discriminator and for the
        generator/inferencer pair, and Adam updates are created for each.

        Args:
            rng: random generator; used to seed this object's RandStream and
                passed on to the sub-models constructed here.
            x_d: symbolic input used as the initial state of the chain.
            x_t: symbolic samples from the target distribution.
            i_net: inferencer, passed to OneStageModel as ``q_z_given_x``.
            g_net: generator, passed to OneStageModel as ``p_x_given_z``.
            d_net: discriminator network; cloned via ``shared_param_clone``.
            chain_len: number of chain steps; used with ``range`` below, so it
                must be a Python integer.
            data_dim: dimension of the observations.
            z_dim: dimension of the latent space.
            params: dict of options. Required keys: ``'x_type'``
                (``'bernoulli'`` or ``'gaussian'``) and ``'lam_l2d'``.
                Optional keys: ``'cost_decay'`` (default 0.1),
                ``'chain_type'`` (``'walkback'`` or ``'walkout'``, default
                ``'walkout'``), ``'xt_transform'`` (``'sigmoid'`` or
                ``'none'``, default sigmoid) and ``'logvar_bound'``
                (default 10).

        Raises:
            KeyError: if ``'x_type'`` is missing from ``params`` (this
                includes ``params=None``, which is replaced by an empty dict).
            AssertionError: if an option has an unsupported value.
        """
        # private random stream seeded from the caller's rng
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.z_dim = z_dim
        # the prior used in the per-step KL term is fixed: zero mean and zero
        # log-variance (plain floats, not optimizable shared variables)
        self.p_z_mean = 0.0
        self.p_z_logvar = 0.0
        # read optional settings from params, falling back to defaults
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        # xt_transform maps the generator's raw output to the observation
        # "mean"; sigmoid unless 'none' is requested explicitly
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations (required key; no default is provided)
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # grab symbolic input variables
        self.x_d = x_d             # initial input for starting the chain
        self.x_t = x_t             # samples from target distribution
        self.z_zmuv = T.tensor3()  # ZMUV gaussian samples for use in scan

        # get the number of steps for chain unrolling
        self.chain_len = chain_len 

        # symbolic vector of indices for inputs from target distribution
        self.It = T.arange(self.x_t.shape[0])
        # symbolic vector of indices for noise/generated inputs; these start
        # right after the x_t rows
        self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + self.x_t.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, x_in=self.x_d, \
                                 p_x_given_z=g_net, q_z_given_x=i_net, \
                                 x_dim=self.data_dim, z_dim=self.z_dim, \
                                 params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar

        ##################################################
        # self-loop the VAE into a multi-step Markov chain.
        # NOTE(review): an earlier version of this comment described shared
        # Xc/Xm (observed-part and mask) inputs; no such inputs appear in
        # this constructor, so that description looks stale.
        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def chain_step_func(zi_zmuv, xim1):
            """One chain step: infer z from xim1, generate the next x.

            zi_zmuv is one slice of self.z_zmuv (the scan sequence) and xim1
            is the previous step's output. Returns the new generated output
            together with this step's NLL and KL terms.
            """
            # get mean and logvar of z samples for this step
            zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
            # transform ZMUV samples to get desired samples
            zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
            # get the next generated xi (pre-transformation)
            outputs = self.GN.apply(zi)
            xti = outputs[-1]
            # apply the observation "mean" transform
            xgi = self.xt_transform(xti)
            # compute NLL for this step: 'walkout' always scores against the
            # chain's starting input, otherwise against the previous state
            if self.chain_type == 'walkout':
                x_true = self.x_d
            else:
                x_true = xim1
            nlli = self._log_prob(x_true, xgi).flatten()
            # KL of this step's posterior from the fixed prior, summed over
            # axis 1
            kldi = T.sum(gaussian_kld(zi_mean, zi_logvar, \
                         self.p_z_mean, self.p_z_logvar), axis=1)
            return xgi, nlli, kldi

        # apply the scan op; only the first output (xgi) has an initial value
        # and is fed back as xim1, the two None entries are plain outputs
        init_values = [self.x_d, None, None]
        self.scan_results, self.scan_updates = \
                theano.scan(chain_step_func, outputs_info=init_values, \
                            sequences=self.z_zmuv)
        # get the outputs of the scan op
        self.xgi = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi = self.scan_results[2]
        # per-step generated outputs as a Python list (needs an int chain_len)
        self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain. Its input is x_t stacked on
        # top of every generated step, matching the It/Id index layout above.
        self.DN = d_net.shared_param_clone(rng=rng, \
                          Xd=T.vertical_stack(self.x_t, *self.xgi_list))

        # length-1 zero array used as the initial value of shared scalars
        # NOTE(review): presumably theano.shared copies this array on each
        # call (borrow defaults to False), so the variables below do not
        # alias one another -- confirm.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init adversarial cost weights for GN/DN
        self.set_disc_weights()  
        # set a shared var for regularizing the output of the discriminator
        # NOTE(review): this reads the raw `params` argument rather than
        # self.params; 'lam_l2d' is therefore a required key.
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                                     name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the VCGLoop requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this VCGLoop.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. (Presumably this also registers the new
        # layers' params for optimization -- see _construct_disc_layers.)
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based 
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat independently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # inferencer networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                        self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        print("Computing VCGLoop joint_grad...")
        # grab the gradients for all parameters to optimize; discriminator
        # params follow dn_cost only, inferencer/generator params follow
        # osm_cost only (joint_cost itself is never differentiated here)
        self.joint_grads = OrderedDict()
        for p in self.dn_params:
            self.joint_grads[p] = T.grad(self.dn_cost, p)
        for p in self.in_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)
        for p in self.gn_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)

        # construct the updates for the discriminator, generator and 
        # inferencer networks. all networks share the same first/second
        # moment momentum parameters. the networks each have their
        # own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_adam_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]

        print("Compiling VCGLoop train_joint...")
        # construct the function for training on training data
        self.train_joint = self._construct_train_joint()
        return
Esempio n. 13
0
    def __init__(self, rng=None, pea_net=None, seq_len=2, seq_Xd=None, \
            seq_Yd=None, no_noise=False, no_funcs=False, params=None, \
            shared_param_dict=None):
        """Build a sequence of shared-parameter PeaNet clones and their costs.

        Args:
            rng: random generator; used to seed this object's RandStream and
                passed to every ``pea_net.shared_param_clone`` call.
            pea_net: the base network to clone; must not be None.
            seq_len: number of clones in the sequence (at least 2).
            seq_Xd: optional list of ``seq_len`` symbolic observation inputs;
                new matrices are created when omitted.
            seq_Yd: optional list of ``seq_len`` symbolic label inputs; new
                integer columns are created when omitted.
            no_noise: when True, the clones are built from a parameter dict
                whose spawn configs have input noise, bias noise and dropout
                switched off. ``pea_net.params`` itself is left untouched.
            no_funcs: when True, skip compiling ``train_joint`` and
                ``get_pn_output`` (both are set to None).
            params: stored on ``self.params``; not otherwise read here.
            shared_param_dict: when given, this object is treated as a clone
                and takes its learning-rate / momentum / lambda shared
                variables from the dict instead of creating new ones.

        Raises:
            AssertionError: if ``rng`` or ``pea_net`` is None, ``seq_len < 2``,
                or a supplied ``seq_Xd`` / ``seq_Yd`` has the wrong length.
        """
        assert(not (rng is None))
        assert(not (pea_net is None))
        assert(seq_len >= 2)
        if not (seq_Xd is None):
            # if symbolic inputs for the sequence to receive are given when
            # the sequence is created, check if it's the right amount.
            assert(len(seq_Xd) == seq_len)
        if not (seq_Yd is None):
            # if symbolic inputs for the sequence to receive are given when
            # the sequence is created, check if it's the right amount.
            assert(len(seq_Yd) == seq_len)
        self.params = params
        # setup a rng for this PeaNetSeq
        self.rng = RandStream(rng.randint(100000))
        if shared_param_dict is None:
            self.is_clone = False
            self.shared_param_dict = {}
        else:
            print("Inititalizing a PeaNetSeq clone...")
            self.is_clone = True
            self.shared_param_dict = shared_param_dict
        # make param dict for a noiseless version of the PNSeq
        new_pn_params = pea_net.params.copy()
        if no_noise:
            # The copy above is shallow: its 'spawn_configs' entries are still
            # the dicts owned by pea_net. Edit per-config copies instead, so
            # that switching noise off here does not silently rewrite the
            # configuration of the network we were asked to clone.
            quiet_configs = []
            for sc in new_pn_params['spawn_configs']:
                quiet_sc = dict(sc)
                quiet_sc['input_noise'] = 0.0
                quiet_sc['bias_noise'] = 0.0
                quiet_sc['do_dropout'] = False
                quiet_configs.append(quiet_sc)
            new_pn_params['spawn_configs'] = quiet_configs

        # setup the sequence of PeaNet clones
        self.seq_len = seq_len
        self.Xd_seq = []
        self.Yd_seq = []
        self.PN_seq = []
        for i in range(self.seq_len):
            if seq_Xd is None:
                # make new symbolic inputs if none were given
                Xd_i = T.matrix(name="Xd_{0:d}".format(i))
            else:
                # otherwise, use the given symbolic inputs
                Xd_i = seq_Xd[i]
            if seq_Yd is None:
                # create a label vector to be associated with this clone
                Yd_i = T.icol(name="Yd_{0:d}".format(i))
            else:
                # otherwise, use the given symbolic inputs
                Yd_i = seq_Yd[i]
            # add observation/label inputs and the clone to the sequence
            self.Xd_seq.append(Xd_i)
            self.Yd_seq.append(Yd_i)
            self.PN_seq.append(pea_net.shared_param_clone(rng=rng, Xd=Xd_i, \
                    params=new_pn_params))
        # the first clone is the "main" network used for parameters/outputs
        self.PN = self.PN_seq[0]
        # create the full list of symbolic inputs required for training
        # (all observation inputs first, then all label inputs)
        self.seq_inputs = self.Xd_seq + self.Yd_seq

        if not self.is_clone:
            # shared var learning rate for the base network
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.lr_pn = theano.shared(value=zero_ary, name='pnseq_lr_pn')
            # shared var momentum parameters for the base network
            self.mom_1 = theano.shared(value=zero_ary, name='pnseq_mom_1')
            self.mom_2 = theano.shared(value=zero_ary, name='pnseq_mom_2')
            self.it_count = theano.shared(value=zero_ary, name='pnseq_it_count')
            # init parameters for controlling learning dynamics
            self.set_pn_sgd_params()
            # init shared var for weighting PEA cost on supervised inputs
            self.lam_pea_su = theano.shared(value=zero_ary, name='pnseq_lam_pea_su')
            self.set_lam_pea_su(lam_pea_su=1.0)
            # init shared var for weighting PEA cost on unsupervised inputs
            self.lam_pea_un = theano.shared(value=zero_ary, name='pnseq_lam_pea_un')
            self.set_lam_pea_un(lam_pea_un=1.0)
            # init shared var for weighting entropy cost on unsupervised inputs
            self.lam_ent = theano.shared(value=zero_ary, name='pnseq_lam_ent')
            self.set_lam_ent(lam_ent=0.0)
            # init shared var for weighting classification cost on supervised inputs
            self.lam_class = theano.shared(value=zero_ary, name='pnseq_lam_class')
            self.set_lam_class(lam_class=1.0)
            # init shared var for controlling l2 regularization on params
            self.lam_l2w = theano.shared(value=zero_ary, name='pnseq_lam_l2w')
            self.set_lam_l2w(1e-4)
            # make the dict for passing around shared lambdas, so that clones
            # built from it track the same learning settings
            self.shared_param_dict['lr_pn'] = self.lr_pn
            self.shared_param_dict['mom_1'] = self.mom_1
            self.shared_param_dict['mom_2'] = self.mom_2
            self.shared_param_dict['it_count'] = self.it_count
            self.shared_param_dict['lam_pea_su'] = self.lam_pea_su
            self.shared_param_dict['lam_pea_un'] = self.lam_pea_un
            self.shared_param_dict['lam_ent'] = self.lam_ent
            self.shared_param_dict['lam_class'] = self.lam_class
            self.shared_param_dict['lam_l2w'] = self.lam_l2w
        else:
            # copy shared lambdas from the cloning dict
            self.lr_pn = self.shared_param_dict['lr_pn']
            self.mom_1 = self.shared_param_dict['mom_1']
            self.mom_2 = self.shared_param_dict['mom_2']
            self.it_count = self.shared_param_dict['it_count']
            self.lam_pea_su = self.shared_param_dict['lam_pea_su']
            self.lam_pea_un = self.shared_param_dict['lam_pea_un']
            self.lam_ent = self.shared_param_dict['lam_ent']
            self.lam_class = self.shared_param_dict['lam_class']
            self.lam_l2w = self.shared_param_dict['lam_l2w']

        # grab the full set of "optimizable" parameters from the base network
        self.mlp_params = [p for p in self.PN.proto_params]

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.pea_su_cost, self.pea_un_cost = self._construct_pea_costs()
        self.pea_cost = (self.lam_pea_su[0] * self.pea_su_cost) + \
                (self.lam_pea_un[0] * self.pea_un_cost)
        self.ent_cost = self.lam_ent[0] * self._construct_ent_cost()
        self.class_cost = self.lam_class[0] * self._construct_class_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.pea_cost + self.ent_cost + self.class_cost + \
                self.other_reg_cost

        ######################################################
        # Construct updates for the shared PeaNet parameters #
        ######################################################
        self.mlp_grads = OrderedDict()
        for p in self.mlp_params:
            # gradients are clipped elementwise to [-1, 1]
            self.mlp_grads[p] = T.grad(self.joint_cost, p).clip(-1.0,1.0)
        # Construct the Adam updates for the base network's parameters
        self.mlp_updates = get_adam_updates(params=self.mlp_params, \
                grads=self.mlp_grads, alpha=self.lr_pn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)

        # Construct a function for training the base network to minimize the
        # sequential PEAR cost
        if no_funcs:
            self.train_joint = None
            self.get_pn_output = None
        else:
            self.train_joint = self._construct_train_joint()
            # make a function for computing outputs of the main PeaNet
            self.get_pn_output = theano.function([self.PN.Xd], \
                    outputs=self.PN.output_proto)
        return
    def __init__(self, rng=None,
            x_in=None, x_out=None,
            p_h_given_z=None,
            p_x_given_h=None,
            q_z_given_x=None,
            q_h_given_z_x=None,
            x_dim=None,
            z_dim=None,
            h_dim=None,
            h_det_dim=None,
            params=None,
            shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.h_det_dim = h_det_dim

        # grab handles to the relevant HydraNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_z_x = q_h_given_z_x
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((1,self.z_dim)) )
            self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='tsm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "hidden" latent state (from both p and q)
        z_q_mean, z_q_logvar = self.q_z_given_x.apply(self.x_in)
        z_q = reparametrize(z_q_mean, z_q_logvar, rng=self.rng)

        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        z_p = reparametrize(z_p_mean, z_p_logvar, rng=self.rng)

        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # compute relevant KLds for this step
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)
        # samples of "hidden" latent state (from both p and q)
        h_p_mean, h_p_logvar = self.p_h_given_z.apply(self.z)
        h_p = reparametrize(h_p_mean, h_p_logvar, rng=self.rng)

        h_q_mean, h_q_logvar = self.q_h_given_z_x.apply(
                T.concatenate([h_p_mean, self.x_out], axis=1))
        h_q = reparametrize(h_q_mean, h_q_logvar, rng=self.rng)

        # compute "stochastic" and "deterministic" parts of latent state
        h_sto = (self.train_switch[0] * h_q) + \
                ((1.0 - self.train_switch[0]) * h_p)
        h_det = h_p_mean
        if self.h_det_dim is None:
            # don't pass forward any deterministic state
            self.h = h_sto
        else:
            # pass forward some deterministic state
            self.h = T.concatenate([h_det[:,:self.h_det_dim],
                                    h_sto[:,self.h_det_dim:]], axis=1)
        # compute relevant KLds for this step
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)

        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_z_given_x.mlp_params)
        child_params.extend(self.q_h_given_z_x.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(params=self.joint_params,
                grads=self.joint_grads, alpha=self.lr,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        return
    def __init__(self, rng=None, x_in=None,
            p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None,
            p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None,
            obs_dim=None, z_dim=None, h_dim=None,
            model_init_obs=True, ir_steps=2,
            params=None):
        """
        Assemble the symbolic graph, cost terms, parameter updates and
        compiled functions for a multi-stage model with iterative refinement.

        Parameters:
            rng: random source; rng.randint seeds this model's RandStream and
                 rng itself is handed to every shared-parameter clone
            x_in: symbolic variable that provides the observations
            p_s0_obs_given_z_obs: network cloned on z to produce the initial
                                  observation-space state s0
            p_hi_given_si: network cloned once per refinement step on the
                           transformed current state
            p_sip1_given_si_hi: network cloned once per refinement step on hi;
                                its output_mean is added to the current state
            p_x_given_si_hi: must be None ("latent" si is not implemented)
            q_z_given_x: network cloned on x_in to produce z
            q_hi_given_x_si: network cloned once per refinement step on the
                             stacked [x - transform(si), transform(si)]
            obs_dim, z_dim, h_dim: recorded as-is on the instance
            model_init_obs: if truthy, s0 comes from the model's output mean;
                            otherwise from the bias of its last mean layer
            ir_steps: number of refinement steps to unroll
            params: dict-like with required key 'x_type' ('bernoulli' or
                    'gaussian') and optional key 'obs_transform' ('sigmoid'
                    or 'none')
        """
        # private random stream for this model
        self.rng = RandStream(rng.randint(100000))

        # TODO: implement functionality for working with "latent" si
        assert(p_x_given_si_hi is None)

        # remember whether s0 is produced by the model or by a constant
        self.model_init_obs = model_init_obs

        # pull out the user-provided settings
        self.params = params
        self.x_type = self.params['x_type']
        assert(self.x_type in ('bernoulli', 'gaussian'))
        # The observation transform is sigmoid unless the user explicitly
        # asked for 'none'; bernoulli observations always force sigmoid.
        squash_obs = True
        if 'obs_transform' in self.params:
            requested = self.params['obs_transform']
            assert(requested in ('sigmoid', 'none'))
            squash_obs = (requested == 'sigmoid')
        if self.x_type == 'bernoulli':
            squash_obs = True
        if squash_obs:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x

        # sizes of the spaces this model works in
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # symbolic inputs to the computation graph of this model
        self.x = x_in
        self.batch_reps = T.lscalar()

        # every scalar control knob below is a length-1 floatX shared vector
        zeros_fx = np.zeros((1,)).astype(theano.config.floatX)

        def _zero_shared(name):
            # fresh shared variable initialised from the zero vector
            return theano.shared(value=zeros_fx, name=name)

        # switch that blends posterior (training) and prior (sampling) hi
        self.train_switch = _zero_shared('msm_train_switch')
        self.set_train_switch(1.0)
        # weight pulling the priors over hi given si towards a shared
        # global prior -- e.g. zero mean and unit variance
        self.kzg_weight = _zero_shared('msm_kzg_weight')
        self.set_kzg_weight(0.1)
        # balance between l1 and l2 penalties on posterior KLds
        self.l1l2_weight = _zero_shared('msm_l1l2_weight')
        self.set_l1l2_weight(1.0)
        # dropout rate for the generator read function
        self.drop_rate = _zero_shared('msm_drop_rate')
        self.set_drop_rate(0.0)

        ############################
        # Build self.z and self.s0 #
        ############################
        print("Building MSM step 0...")
        # 1.0 -> s0 from the generative model, 0.0 -> s0 from a constant
        obs_scale = 1.0 if self.model_init_obs else 0.0
        self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
        self.z = self.q_z_given_x.output
        self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone(
                rng=rng, Xd=self.z)
        s0_net = self.p_s0_obs_given_z_obs
        s0_from_model = s0_net.output_mean
        s0_from_bias = s0_net.mu_layers[-1].b
        self.s0_obs = (obs_scale * s0_from_model) + \
                ((1.0 - obs_scale) * s0_from_bias)
        self.output_logvar = s0_net.sigma_layers[-1].b
        # tanh squashing keeps the log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.output_logvar)

        ##############################################################
        # Unroll the iterative refinement loop, starting at self.s0. #
        ##############################################################
        self.p_hi_given_si = []       # one p_hi_given_si clone per step
        self.p_sip1_given_si_hi = []  # one p_sip1_given_si_hi clone per step
        self.q_hi_given_x_si = []     # one q_hi_given_x_si clone per step
        self.si = [self.s0_obs]       # state before/after every step
        self.hi = []                  # latent sample used at every step
        for step in range(self.ir_steps):
            print("Building MSM step {0:d}...".format(step+1))
            si_obs = self.si[-1]
            # prior over hi, conditioned on the current state
            prior_net = p_hi_given_si.shared_param_clone(rng=rng,
                    Xd=self.obs_transform(si_obs))
            self.p_hi_given_si.append(prior_net)
            hi_p = prior_net.output
            # variational posterior over hi sees the residual and the state
            grad_ll = self.x - self.obs_transform(si_obs)
            post_net = q_hi_given_x_si.shared_param_clone(rng=rng,
                    Xd=T.horizontal_stack(grad_ll, self.obs_transform(si_obs)))
            self.q_hi_given_x_si.append(post_net)
            hi_q = post_net.output
            # train_switch == 1 selects hi_q, train_switch == 0 selects hi_p
            hi_mix = (self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)
            self.hi.append(hi_mix)
            # the state update is conditioned on hi only
            step_net = p_sip1_given_si_hi.shared_param_clone(rng=rng,
                    Xd=hi_mix)
            self.p_sip1_given_si_hi.append(step_net)
            # additive update from si to s(i+1)
            self.si.append(si_obs + step_net.output_mean)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # learning rates for parameter group 1 and group 2
        self.lr_1 = _zero_shared('msm_lr_1')
        self.lr_2 = _zero_shared('msm_lr_2')
        # momentum parameters shared by both groups
        self.mom_1 = _zero_shared('msm_mom_1')
        self.mom_2 = _zero_shared('msm_mom_2')
        # give the learning-dynamics variables their initial values
        self.set_sgd_params()
        # weight on the nll of data given posterior samples
        self.lam_nll = _zero_shared('msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weights on the KLd terms for z and for the hi
        self.lam_kld_1 = _zero_shared('msm_lam_kld_1')
        self.lam_kld_2 = _zero_shared('msm_lam_kld_2')
        self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
        # weight on the l2 parameter regularizer
        self.lam_l2w = _zero_shared('msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # "group 1": parameters of the networks that produce z and s0
        self.group_1_params = list(self.q_z_given_x.mlp_params) + \
                list(self.p_s0_obs_given_z_obs.mlp_params)
        # "group 2": parameters of the per-step refinement networks
        self.group_2_params = []
        for post_net, prior_net, step_net in zip(self.q_hi_given_x_si,
                self.p_hi_given_si, self.p_sip1_given_si_hi):
            self.group_2_params.extend(post_net.mlp_params)
            self.group_2_params.extend(prior_net.mlp_params)
            self.group_2_params.extend(step_net.mlp_params)
        # all optimizable parameters, group 1 first
        self.joint_params = self.group_1_params + self.group_2_params

        ###################
        # KLD-BASED COSTS #
        ###################
        self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \
                self._construct_kld_costs()
        kld_z_term = self.lam_kld_1[0] * T.mean(self.kld_z)
        kld_hi_term = self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \
                (self.kzg_weight[0] * T.mean(self.kld_hi_glob)))
        self.kld_cost = kld_z_term + kld_hi_term
        ###################
        # NLL-BASED COSTS #
        ###################
        self.nll_costs = self._construct_nll_costs()
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ##########################
        # REST OF THE JOINT COST #
        ##########################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # gradient of the joint cost w.r.t. every optimizable parameter
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        def _adam_for(group, rate):
            # both groups use identical Adam settings apart from the rate
            return get_adam_updates(params=group,
                    grads=self.joint_grads, alpha=rate,
                    beta1=self.mom_1, beta2=self.mom_2,
                    mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)

        # per-group updates, then their union (group 2 wins on shared keys)
        self.group_1_updates = _adam_for(self.group_1_params, self.lr_1)
        self.group_2_updates = _adam_for(self.group_2_params, self.lr_2)
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.group_1_updates)
        self.joint_updates.update(self.group_2_updates)

        # compile the functions exposed by this model
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        self.compute_post_klds = self._construct_compute_post_klds()
        self.compute_fe_terms = self._construct_compute_fe_terms()
        self.sample_from_prior = self._construct_sample_from_prior()
        # convenient handles on a few weight matrices
        self.inf_1_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W
        self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W
        self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W
        self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W
        return
Esempio n. 16
0
    def __init__(self, rng=None,
            Xd=None, Xc=None, Xm=None,
            g_net=None, i_net=None,
            data_dim=None, prior_dim=None,
            g_net_2=None, i_net_2=None,
            prior_dim_2=None,
            params=None, shared_param_dicts=None):
        """
        Build a two-level stack of generator/inferencer pairs: a bottom pair
        over the (masked) data and a top pair over the bottom pair's
        posterior samples, together with costs, gradients, Adam updates and
        compiled training functions for the bottom, top and joint objectives.

        Parameters:
            rng: random source; rng.randint seeds this object's RandStream
                 and rng itself is passed to every shared-parameter clone
            Xd, Xc, Xm: symbolic inputs combined through apply_mask before
                        being fed to the bottom inferencer
            g_net, i_net: bottom generator / inferencer to clone
            data_dim: must equal the bottom generator's output size and the
                      bottom inferencer's input size
            prior_dim: must equal the bottom generator's input size and the
                       bottom inferencer's mu/sigma output sizes
            g_net_2, i_net_2: top generator / inferencer to clone
            prior_dim_2: must equal the top generator's input size and the
                         top inferencer's mu/sigma output sizes
            params: optional settings dict (an empty dict is used if None)
            shared_param_dicts: None for an original object; for a clone,
                                the dict holding the shared control variables
        """
        # private random stream for this GIPair
        self.rng = RandStream(rng.randint(100000))
        self.params = {} if params is None else params

        # symbolic inputs to the computation graph of this GIPair
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        # inferencer and generator must agree on whether inputs are "encoded"
        self.use_encoder = i_net.use_encoder
        print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format(
                str(i_net.use_encoder), str(g_net.use_decoder)))
        assert(self.use_encoder == g_net.use_decoder)
        # bottom inferencer: a shared-parameter clone fed by the masked input
        masked_input = apply_mask(Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        self.IN = i_net.shared_param_clone(rng=rng, Xd=masked_input)
        self.posterior_means = self.IN.output_mean
        self.posterior_sigmas = self.IN.output_sigma
        self.posterior_norms = T.sqrt(
                T.sum(self.posterior_means**2.0, axis=1, keepdims=1))
        self.posterior_klds = self.IN.kld_cost
        self.kld2_scale = self.IN.kld2_scale
        # samples from the bottom variational posterior
        self.Xp = self.IN.output
        # bottom generator: a shared-parameter clone fed by those samples
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # sampled reconstructions from the bottom generator
        self.Xg = self.GN.output

        # The top GIPair models the posterior samples emitted by the bottom
        # inferencer; its control input and mask are all zeros.
        top_input = apply_mask(Xd=self.Xp,
                Xc=T.zeros_like(self.Xp), Xm=T.zeros_like(self.Xp))
        self.IN2 = i_net_2.shared_param_clone(rng=rng, Xd=top_input)
        # samples from the top variational posterior
        self.Xp2 = self.IN2.output
        # top generator consumes the top posterior samples
        self.GN2 = g_net_2.shared_param_clone(rng=rng, Xp=self.Xp2)
        # sampled (latent) reconstructions from the top generator
        self.Xg2 = self.GN2.output

        # record and validate the dimensionality settings
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_dim_2 = prior_dim_2
        # bottom generator output and bottom inferencer input: data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # bottom generator input and bottom inferencer mu/sigma: prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
        # top generator input and top inferencer mu/sigma: prior_dim_2
        assert(self.prior_dim_2 == self.GN2.mlp_layers[0].in_dim)
        assert(self.prior_dim_2 == self.IN2.mu_layers[-1].out_dim)
        assert(self.prior_dim_2 == self.IN2.sigma_layers[-1].out_dim)

        # a clone is recognised by being handed an existing parameter dict
        self.is_clone = shared_param_dicts is not None
        self.shared_param_dicts = shared_param_dicts if self.is_clone else {}
        if self.is_clone:
            # reuse the control variables owned by the "base" GIPair
            base_vars = self.shared_param_dicts
            self.lr_gn = base_vars['gip_lr_gn']
            self.lr_in = base_vars['gip_lr_in']
            self.mom_1 = base_vars['gip_mom_1']
            self.mom_2 = base_vars['gip_mom_2']
            self.it_count_bot = base_vars['gip_it_count_bot']
            self.it_count_top = base_vars['gip_it_count_top']
            self.it_count_joint = base_vars['gip_it_count_joint']
            self.lam_nll = base_vars['gip_lam_nll']
            self.lam_kld = base_vars['gip_lam_kld']
            self.lam_l2w = base_vars['gip_lam_l2w']
        else:
            # every control variable is a length-1 floatX shared vector
            zeros_fx = np.zeros((1,)).astype(theano.config.floatX)

            def _zero_shared(name):
                # fresh shared variable initialised from the zero vector
                return theano.shared(value=zeros_fx, name=name)

            # learning rates for generator and inferencer
            self.lr_gn = _zero_shared('gip_lr_gn')
            self.lr_in = _zero_shared('gip_lr_in')
            # momentum parameters and per-objective iteration counters
            self.mom_1 = _zero_shared('gip_mom_1')
            self.mom_2 = _zero_shared('gip_mom_2')
            self.it_count_bot = _zero_shared('gip_it_count_bot')
            self.it_count_top = _zero_shared('gip_it_count_top')
            self.it_count_joint = _zero_shared('gip_it_count_joint')
            # give the learning-dynamics variables their initial values
            self.set_all_sgd_params()
            # weight on the nll of data given posterior samples
            self.lam_nll = _zero_shared('gip_lam_nll')
            self.set_lam_nll(lam_nll=1.0)
            # weight on the posterior KLd
            self.lam_kld = _zero_shared('gip_lam_kld')
            self.set_lam_kld(lam_kld=1.0)
            # weight on the l2 parameter regularizer
            self.lam_l2w = _zero_shared('gip_lam_l2w')
            self.set_lam_l2w(1e-4)
            # publish the control variables so clones can share them
            self.shared_param_dicts.update([
                ('gip_lr_gn', self.lr_gn),
                ('gip_lr_in', self.lr_in),
                ('gip_mom_1', self.mom_1),
                ('gip_mom_2', self.mom_2),
                ('gip_it_count_bot', self.it_count_bot),
                ('gip_it_count_top', self.it_count_top),
                ('gip_it_count_joint', self.it_count_joint),
                ('gip_lam_nll', self.lam_nll),
                ('gip_lam_kld', self.lam_kld),
                ('gip_lam_l2w', self.lam_l2w),
            ])

        # optimizable parameters of the bottom GIPair
        self.in_params = list(self.IN.mlp_params)
        self.gn_params = list(self.GN.mlp_params)
        self.bot_params = self.in_params + self.gn_params
        # optimizable parameters of the top GIPair
        self.in2_params = list(self.IN2.mlp_params)
        self.gn2_params = list(self.GN2.mlp_params)
        self.top_params = self.in2_params + self.gn2_params
        # all optimizable parameters, top first
        self.joint_params = self.top_params + self.bot_params

        #########################
        # COSTS TO BE OPTIMIZED #
        #########################
        self.data_nll_cost_bot = self.lam_nll[0] * \
                self._construct_data_nll_cost(which_gip='bot')
        self.data_nll_cost_top = self.lam_nll[0] * \
                self._construct_data_nll_cost(which_gip='top')
        # NOTE(review): self.kld2_scale was taken from the bottom inferencer
        # (self.IN) and is passed for the top cost as well -- confirm that
        # this is intended.
        self.post_kld_cost_bot = self.lam_kld[0] * \
                self._construct_post_kld_cost(which_gip='bot',
                        kld2_scale=self.kld2_scale)
        self.post_kld_cost_top = self.lam_kld[0] * \
                self._construct_post_kld_cost(which_gip='top',
                        kld2_scale=self.kld2_scale)
        self.other_reg_cost_bot = \
                self._construct_other_reg_cost(which_gip='bot')
        self.other_reg_cost_top = \
                self._construct_other_reg_cost(which_gip='top')
        # per-level totals and their sum
        self.bot_cost = self.data_nll_cost_bot + self.post_kld_cost_bot + \
                self.other_reg_cost_bot
        self.top_cost = self.data_nll_cost_top + self.post_kld_cost_top + \
                self.other_reg_cost_top
        self.joint_cost = self.bot_cost + self.top_cost

        ###########################
        # GRADIENTS FOR THE COSTS #
        ###########################
        def _clipped_grads(cost, wrt):
            # elementwise-clipped gradient of cost for each parameter in wrt
            return OrderedDict(
                    (p, T.grad(cost, p).clip(-0.1, 0.1)) for p in wrt)

        self.bot_grads = _clipped_grads(self.bot_cost, self.bot_params)
        self.top_grads = _clipped_grads(self.top_cost, self.top_params)
        self.joint_grads = _clipped_grads(self.joint_cost, self.joint_params)

        #########################
        # UPDATES FOR THE COSTS #
        #########################
        def _adam_for(wrt, grads, rate, counter):
            # all Adam updates share momentum and smoothing settings
            return get_adam_updates(params=wrt,
                    grads=grads, alpha=rate,
                    beta1=self.mom_1, beta2=self.mom_2,
                    it_count=counter,
                    mom2_init=1e-3, smoothing=1e-8)

        # bottom GIPair, bottom cost
        self.gn_updates_bot = _adam_for(self.gn_params, self.bot_grads,
                self.lr_gn, self.it_count_bot)
        self.in_updates_bot = _adam_for(self.in_params, self.bot_grads,
                self.lr_in, self.it_count_bot)
        # top GIPair, top cost
        self.gn2_updates_top = _adam_for(self.gn2_params, self.top_grads,
                self.lr_gn, self.it_count_top)
        self.in2_updates_top = _adam_for(self.in2_params, self.top_grads,
                self.lr_in, self.it_count_top)
        # bottom GIPair, joint cost
        self.gn_updates_joint = _adam_for(self.gn_params, self.joint_grads,
                self.lr_gn, self.it_count_joint)
        self.in_updates_joint = _adam_for(self.in_params, self.joint_grads,
                self.lr_in, self.it_count_joint)
        # top GIPair, joint cost
        self.gn2_updates_joint = _adam_for(self.gn2_params, self.joint_grads,
                self.lr_gn, self.it_count_joint)
        self.in2_updates_joint = _adam_for(self.in2_params, self.joint_grads,
                self.lr_in, self.it_count_joint)

        # merged bottom updates (later sources win on shared keys)
        self.bot_updates = OrderedDict()
        self.bot_updates.update(self.gn_updates_bot)
        self.bot_updates.update(self.in_updates_bot)
        self.bot_updates[self.IN.kld_mean] = self.IN.kld_mean_update
        # merged top updates
        self.top_updates = OrderedDict()
        self.top_updates.update(self.gn2_updates_top)
        self.top_updates.update(self.in2_updates_top)
        self.top_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
        # merged joint updates
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.gn_updates_joint)
        self.joint_updates.update(self.in_updates_joint)
        self.joint_updates.update(self.gn2_updates_joint)
        self.joint_updates.update(self.in2_updates_joint)
        self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update
        self.joint_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
        # compile the training / evaluation functions
        self.train_bot = self._construct_train_bot()
        self.train_top = self._construct_train_top()
        self.train_joint = self._construct_train_joint()
        self.compute_costs = self._construct_compute_costs()
        return
Esempio n. 17
0
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        assert ((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX(np.zeros((1, )))
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated fom zi by forwards walk
            #   zi_fw: zi generated fom xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0,
                         lam_kld_q=1.0,
                         lam_kld_g=0.0,
                         lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(
            p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return
Esempio n. 18
0
    def __init__(self, rng=None,
            x_in=None, x_mask=None, x_out=None,
            p_zi_given_xi=None,
            p_sip1_given_zi=None,
            q_zi_given_xi=None,
            params=None,
            shared_param_dicts=None):
        """Build the iterative-imputation graph, its costs and functions.

        The constructor wires the given networks into a ``theano.scan``
        loop that repeatedly refines a state ``si``, derives KL and NLL
        costs from the per-step scan outputs, builds ADAM updates for all
        trainable parameters and finally compiles the helper functions.

        Args:
            rng: generator exposing ``randint``; used once to seed this
                model's own ``RandStream``. Required despite the default.
            x_in: symbolic input batch. Its first dimension sets the row
                count of the initial state and it supplies the observed
                entries of ``self.x0``.
            x_mask: symbolic mask. Entries weighted by the mask are taken
                from the data, entries weighted by ``1 - x_mask`` are
                taken from the model's current guess.
            x_out: symbolic target batch, used for the per-step NLL and
                as the extra input of the guide policy.
            p_zi_given_xi: network whose ``apply`` returns a
                ``(mean, logvar)`` pair for the step latent given the
                masked input (the "global policy").
            p_sip1_given_zi: network whose ``apply`` returns an indexable
                group of outputs; output 0 is the proposed state update,
                outputs 1 and 2 feed the gates of the 'lstm' and 'layer'
                step types.
            q_zi_given_xi: network whose ``apply`` returns a
                ``(mean, logvar)`` pair given the masked and unmasked
                inputs concatenated along axis 1 (the "guide policy").
            params: dict providing 'x_dim', 'z_dim', 'imp_steps',
                'step_type' and 'x_type'. Required despite the default.
            shared_param_dicts: ``None`` to create fresh shared
                parameters, or a dict holding 'x_null', 'grad_null',
                's0' and 'obs_logvar' to reuse existing ones.

        Raises:
            AssertionError: if ``x_type`` is neither 'bernoulli' nor
                'gaussian', or if ``step_type`` is not one of 'jump',
                'add', 'lstm' or 'layer'.
        """
        # seed a private random stream for this model from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        if self.x_type not in ('bernoulli', 'gaussian'):
            # raised explicitly (not via ``assert``) so that the check is
            # still performed when Python runs with -O
            raise AssertionError(
                "x_type must be 'bernoulli' or 'gaussian', got %r"
                % (self.x_type,))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        # per-step noise; scan iterates over its leading axis
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        # (theano.shared copies its initial value by default, so zero_ary
        # can safely seed every scalar shared variable created below)
        zero_ary = to_fX(np.zeros((1, )))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim, )))
            init_ary = to_fX(np.zeros((self.x_dim, )))
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the free log-variance parameter into (-8, 8) with tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            """One scan step: sample zi, update the state, score it."""
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # mask-weighted entries come from the data, the rest from the
            # current guess
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) +
                  ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0)  # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == 'jump':
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif self.step_type == 'add':
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif self.step_type == 'lstm':
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif self.step_type == 'layer':
                # convex blend of the old state and the proposed step
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                # explicit raise so that sip1 can never be left unbound,
                # even when asserts are stripped by "python -O"
                raise AssertionError("Unknown step type!")

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop; only the state
        # (first output) is recurrent, the others are collected per step
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # each set_* call below writes into the shared variable(s) created
        # just before it, so the creation order must be kept

        # shared var learning rate for generator and inferencer
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step is penalized
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # fold in the updates returned by scan so that compiled functions
        # using joint_updates also apply them
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
    def __init__(self, rng=None,
            x_in=None, x_mask=None, x_out=None,
            p_zi_given_xi=None,
            p_sip1_given_zi=None,
            q_zi_given_xi=None,
            params=None,
            shared_param_dicts=None):
        """Build the iterative-imputation graph, its costs and functions.

        The given networks are wired into a ``theano.scan`` loop that
        repeatedly refines a state ``si``. KL and NLL costs are derived
        from the per-step scan outputs, ADAM updates are built for all
        trainable parameters, and the helper functions are compiled.

        Args:
            rng: generator exposing ``randint``; used once to seed this
                model's own ``RandStream``. Required despite the default.
            x_in: symbolic input batch. Its first dimension sets the row
                count of the initial state and it supplies the observed
                entries of ``self.x0``.
            x_mask: symbolic mask. Entries weighted by the mask are taken
                from the data, entries weighted by ``1 - x_mask`` are
                taken from the model's current guess.
            x_out: symbolic target batch, used for the per-step NLL and
                as the extra input of the guide policy.
            p_zi_given_xi: network whose ``apply`` returns a
                ``(mean, logvar)`` pair for the step latent given the
                masked input (the "global policy").
            p_sip1_given_zi: network whose ``apply`` returns an indexable
                group of outputs; output 0 is the proposed state update,
                outputs 1 and 2 feed the gates of the 'lstm' and 'layer'
                step types.
            q_zi_given_xi: network whose ``apply`` returns a
                ``(mean, logvar)`` pair given the masked and unmasked
                inputs concatenated along axis 1 (the "guide policy").
            params: dict providing 'x_dim', 'z_dim', 'imp_steps',
                'step_type' and 'x_type'. Required despite the default.
            shared_param_dicts: ``None`` to create fresh shared
                parameters, or a dict holding 'x_null', 'grad_null',
                's0' and 'obs_logvar' to reuse existing ones.

        Raises:
            AssertionError: if ``x_type`` is neither 'bernoulli' nor
                'gaussian', or if ``step_type`` is not one of 'jump',
                'add', 'lstm' or 'layer'.
        """
        # seed a private random stream for this model from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        if self.x_type not in ('bernoulli', 'gaussian'):
            # raised explicitly (not via ``assert``) so that the check is
            # still performed when Python runs with -O
            raise AssertionError(
                "x_type must be 'bernoulli' or 'gaussian', got %r"
                % (self.x_type,))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        # per-step noise; scan iterates over its leading axis
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        # (theano.shared copies its initial value by default, so zero_ary
        # can safely seed every scalar shared variable created below)
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim,)))
            init_ary = to_fX(np.zeros((self.x_dim,)))
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the free log-variance parameter into (-8, 8) with tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            """One scan step: sample zi, update the state, score it."""
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # mask-weighted entries come from the data, the rest from the
            # current guess
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                    T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) +
                  ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar) # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar) # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0) # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == 'jump':
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif self.step_type == 'add':
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif self.step_type == 'lstm':
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif self.step_type == 'layer':
                # convex blend of the old state and the proposed step
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                # explicit raise so that sip1 can never be left unbound,
                # even when asserts are stripped by "python -O"
                raise AssertionError("Unknown step type!")

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop; only the state
        # (first output) is recurrent, the others are collected per step
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # each set_* call below writes into the shared variable(s) created
        # just before it, so the creation order must be kept

        # shared var learning rate for generator and inferencer
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step is penalized
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # fold in the updates returned by scan so that compiled functions
        # using joint_updates also apply them
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
    def __init__(self, rng=None, x_in=None, \
            p_x_given_z=None, q_z_given_x=None, \
            x_dim=None, z_dim=None, \
            params=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10.0
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim

        # set parameters for the isotropic Gaussian prior over z
        self.prior_mean = 0.0
        self.prior_logvar = 0.0

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this OneStageModel
        self.x_in = x_in

        #####################################################################
        # Setup the computation graph that provides values in our objective #
        #####################################################################
        # inferencer model for latent variables given observations
        self.q_z_given_x = q_z_given_x
        self.z_mean, self.z_logvar = self.q_z_given_x.apply(self.x_in)
        # reparametrize ZMUV Gaussian samples to get latent samples...
        self.z = reparametrize(self.z_mean, self.z_logvar, rng=self.rng)

        # generator model for observations given latent variables
        self.p_x_given_z = p_x_given_z
        self.xt, _ = self.p_x_given_z.apply(self.z)

        # construct the final output of generator, conditioned on z
        if self.x_type == 'bernoulli':
            self.xg = T.nnet.sigmoid(self.xt)
        else:
            self.xg = self.xt_transform(self.xt)

        # self.output_logvar modifies the output distribution
        zero_ary = to_fX( np.zeros((1,)) )
        self.output_logvar = theano.shared(value=zero_ary, name='osm_output_logvar')
        self.bounded_logvar = self.logvar_bound * \
                    T.tanh(self.output_logvar[0] / self.logvar_bound)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='osm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting controlling KL(q(z|x) || p(z))
        self.lam_kld = theano.shared(value=zero_ary, name='osm_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
        self.set_lam_l2w(1e-4)

        # grab a list of all the parameters to optimize
        self.joint_params = [self.output_logvar]
        self.joint_params.extend(self.q_z_given_x.mlp_params)
        self.joint_params.extend(self.p_x_given_z.mlp_params)

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        # first, do NLL
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
        self.nll_cost = T.mean(self.nll_costs)
        # second, do KLd
        self.kld_costs = self.lam_kld[0] * self._construct_kld_costs()
        self.kld_cost = T.mean(self.kld_costs)
        # third, do regularization
        self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
        # finally, combine them for the joint cost.
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # Construct a function for jointly training the generator/inferencer
        print("Compiling self.train_joint...")
        self.train_joint = self._construct_train_joint()
        print("Compiling self.compute_fe_terms...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling self.compute_post_klds...")
        self.compute_post_klds = self._construct_compute_post_klds()
        print("Compiling self.sample_from_prior...")
        self.sample_from_prior = self._construct_sample_from_prior()
        self.transform_x_to_z = theano.function(inputs=[self.x_in], \
                                                outputs=self.z_mean)
        self.transform_z_to_x = theano.function(inputs=[self.z], \
                                                outputs=self.xg)
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_weights = self.p_x_given_z.output_layers[-1].W
        return
Esempio n. 21
0
    def __init__(self, rng=None, Xd=None,
            g_net=None, i_net=None, pn_seq=None,
            data_dim=None, prior_dim=None,
            params=None):
        """Wire an inferencer/generator pair to a PeaNetSeq and build training ops.

        The constructor clones the given networks with shared parameters,
        builds a norm-bounded perturbation ``self.Xg`` from the generator
        output, feeds ``[Xd, Xd + Xg]`` through a two-step clone of
        ``pn_seq``, and then constructs the joint cost, its clipped
        gradients, Adam updates and the compiled training/sampling functions.

        Args:
            rng: object providing ``randint``; one draw seeds ``self.rng`` and
                the object itself is forwarded to every ``shared_param_clone``.
            Xd: symbolic input batch; ``Xd.shape[0]`` gives the batch size.
            g_net: generator providing ``shared_param_clone(rng=..., Xp=...)``.
                The clone must report ``out_type == 'gaussian'``.
            i_net: inferencer providing
                ``shared_param_clone(rng=..., Xd=..., Xc=..., Xm=...)``.
            pn_seq: PeaNetSeq providing
                ``shared_param_clone(rng=..., seq_len=..., seq_Xd=...)``.
            data_dim: expected generator output / inferencer input size.
            prior_dim: expected generator input / inferencer mu, sigma size.
            params: optional dict. If it contains ``'mean_transform'``, that
                callable replaces the default ``2 * (sigmoid(x) - 0.5)``
                squashing applied to the generator mean.

        Raises:
            AssertionError: if the generator clone is not 'gaussian' or the
                layer dimensions disagree with ``data_dim`` / ``prior_dim``.
        """
        # setup a rng for this ADPair
        self.rng = RandStream(rng.randint(100000))

        self.params = {} if params is None else params
        if 'mean_transform' in self.params:
            # apply a user-defined transform to the GenNet output prior to
            # rescaling by self.lam_mnb...
            self.mean_transform = self.params['mean_transform']
        else:
            # default transform is sigmoid -> shift -> scale so that
            # perturbations (for each dimension) are in range -1 --> 1.
            self.mean_transform = lambda x: 2.0 * (apply_sigmoid(x) - 0.5)

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this ADPair
        self.Xd = Xd
        self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq
        # control input and mask are all-zero tensors shaped like Xd
        self.Xc = 0.0 * self.Xd
        self.Xm = 0.0 * self.Xd
        self.obs_count = T.cast(Xd.shape[0], 'floatX')

        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng,
                Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        assert(self.GN.out_type == 'gaussian') # check for right output
        # set up a var for controlling the max-norm bound on perturbations
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lam_mnb = theano.shared(value=zero_ary,
                name='adp_lam_mnb')
        self.set_lam_mnb(lam_mnb=0.1)

        # rescale the perturbations, to make them adjustably norm-bounded
        self.Xg = self.lam_mnb[0] * self.mean_transform(self.GN.output_mean)

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # make a clone of the target PeaNetSeq that takes perturbed inputs
        self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2,
                seq_Xd=[self.Xd, (self.Xd + self.Xg)])
        # get the symbolic vars for passing inputs to self.PNS
        self.Xd_seq = self.PNS.Xd_seq
        self.Yd_seq = self.PNS.Yd_seq
        self.seq_inputs = self.Xd_seq + self.Yd_seq

        # shared var learning rate for generator and inferencer
        self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
        self.set_lam_adv(lam_adv=1.0)
        # init shared var for weighting Gaussian prior over the policy
        self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = list(self.IN.mlp_params)
        self.gn_params = list(self.GN.mlp_params)
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
        self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.adv_cost + self.kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters.
        # A single T.grad call over the whole parameter list builds the
        # backward graph once instead of once per parameter; each gradient
        # is then clipped element-wise exactly as before.
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict()
        for p, g in zip(self.joint_params, grad_list):
            self.joint_grads[p] = g.clip(-0.05, 0.05)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params,
                grads=self.joint_grads, alpha=self.lr_gn,
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates = get_adam_updates(params=self.in_params,
                grads=self.joint_grads, alpha=self.lr_in,
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
                mom2_init=1e-3, smoothing=1e-8)
        # merge both update sets; for any key present in both (e.g. one
        # keyed on the shared it_count), the inferencer's entry wins
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.gn_updates)
        self.joint_updates.update(self.in_updates)

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self.GN.sample_from_model
        self.sample_from_Xd = self._construct_sample_from_Xd()
        return
    def __init__(
        self,
        rng=None,
        x_out=None,
        p_zi_given_xi=None,
        p_sip1_given_zi=None,
        p_x_given_si=None,
        q_zi_given_xi=None,
        params=None,
        shared_param_dicts=None,
    ):
        """Build the scan loop, costs, Adam updates and compiled functions.

        Although every argument defaults to None, the body dereferences rng,
        x_out, params and the four networks unconditionally; only
        shared_param_dicts is treated as optional.

        Args:
            rng: object providing ``randint``; one draw seeds ``self.rng``.
            x_out: symbolic target for generation. Presumably a matrix with
                one row per trial (``x_out.shape[0]`` sizes the initial
                belief state) -- TODO confirm.
            p_zi_given_xi: primary policy. Must provide
                ``apply(x, do_samples=False)`` returning ``(mean, logvar)``
                and an ``mlp_params`` list.
            p_sip1_given_zi: belief-state updater. ``apply(zi)`` must return
                an indexable result; entry 0 is the proposed step and, for
                ``step_type == "add"``, entries 1 and 2 drive the write and
                erase gates. Must provide ``mlp_params``.
            p_x_given_si: only ``mlp_params`` is read in this method;
                presumably used by ``self._from_si_to_x`` -- confirm.
            q_zi_given_xi: guide policy, same interface as p_zi_given_xi.
            params: dict with required keys "x_dim", "z_dim", "s_dim",
                "use_p_x_given_si", "step_type" ("add" or "jump") and
                "x_type" ("bernoulli" or "gaussian"). Optional keys:
                "obs_transform" ("sigmoid" or "none") and "rev_masks" (a
                pair of arrays for the primary and guide policies). When
                "rev_masks" is absent or None, "rev_sched" is required: an
                iterable of blocks whose first entry is a step count in
                1..16 and whose second entry lies in [0.0, 1.01].
            shared_param_dicts: if None, fresh shared variables "s0",
                "obs_logvar" and "step_scales" are created and stored in a
                new dict; otherwise they are read from the given dict.

        Raises:
            AssertionError: on invalid "obs_transform", "x_type", "step_type"
                or "rev_sched" entries, or if ``s_dim != x_dim`` while
                hypotheses are built directly in x-space.
        """
        # setup a rng for this SRRModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        # NOTE(review): params=None (the default) fails on the first lookup.
        self.params = params
        self.x_dim = self.params["x_dim"]
        self.z_dim = self.params["z_dim"]
        self.s_dim = self.params["s_dim"]
        self.use_p_x_given_si = self.params["use_p_x_given_si"]
        self.step_type = self.params["step_type"]
        self.x_type = self.params["x_type"]
        if self.use_p_x_given_si:
            print("Constructing hypotheses indirectly in s-space...")
        else:
            print("Constructing hypotheses directly in x-space...")
            # without a decoder the belief state must already be x-shaped
            assert self.s_dim == self.x_dim
        # choose the observation transform; default is sigmoid
        if "obs_transform" in self.params:
            assert (self.params["obs_transform"] == "sigmoid") or (self.params["obs_transform"] == "none")
            if self.params["obs_transform"] == "sigmoid":
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use sigmoid, overriding any
        # "obs_transform" setting chosen above
        if self.x_type == "bernoulli":
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts
        # Deal with revelation scheduling: either explicit masks are given
        # (stored as shared vars) or a block-wise schedule is used instead
        if ("rev_masks" in self.params) and (self.params["rev_masks"] is not None):
            rmp = self.params["rev_masks"][0].astype(theano.config.floatX)
            rmq = self.params["rev_masks"][1].astype(theano.config.floatX)
            self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p")
            self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q")
            self.rev_sched = None
            self.use_rev_masks = True
        else:
            self.rev_sched = self.params["rev_sched"]
            self.rev_masks_p = None
            self.rev_masks_q = None
            self.use_rev_masks = False
            nice_nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            # "validate" the set of revelation block descriptions:
            # entry 0 is a step count (1..16), entry 1 must lie in [0, 1.01]
            for rev_block in self.rev_sched:
                assert rev_block[0] in nice_nums
                assert (rev_block[1] >= 0.0) and (rev_block[1] <= 1.01)
        assert (self.x_type == "bernoulli") or (self.x_type == "gaussian")
        assert (self.step_type == "add") or (self.step_type == "jump")

        # grab handles to the relevant networks
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.p_x_given_si = p_x_given_si
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this SRRModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for policy wobble
        self.p_masks = T.tensor3()  # revelation masks for primary policy
        self.q_masks = T.tensor3()  # revelation masks for guide policy
        # number of scan steps: leading axis of the primary mask array, or
        # the sum of the per-block step counts in the schedule
        if self.use_rev_masks:
            self.total_steps = self.params["rev_masks"][0].shape[0]
        else:
            self.total_steps = sum([rb[0] for rb in self.rev_sched])

        # setup switching variable for changing between sampling/training
        # (1.0 selects guide-policy samples inside the step function below)
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch")
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.s_dim,)))
            ss_init = to_fX(0.5 * np.ones((self.total_steps,)))
            self.s0 = theano.shared(value=s0_init, name="srrm_s0")
            self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar")
            # tanh squashing keeps the effective log-variance within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales")
            self.shared_param_dicts = {}
            self.shared_param_dicts["s0"] = self.s0
            self.shared_param_dicts["obs_logvar"] = self.obs_logvar
            self.shared_param_dicts["step_scales"] = self.step_scales
        else:
            # grab the parameters required by this model from a given dict
            self.s0 = self.shared_param_dicts["s0"]
            self.obs_logvar = self.shared_param_dicts["obs_logvar"]
            # tanh squashing keeps the effective log-variance within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = self.shared_param_dicts["step_scales"]

        ##################################################################
        # Setup the sequential revelation and refinement loop using scan #
        ##################################################################
        # The first four arguments of the step function are the per-step
        # slices of the scanned sequences; the last three are the recurrent
        # outputs carried over from the previous step.
        #
        # ss: This is a sequence of scalars that will be used to rescale the
        #     "gradient" input to the primary and guide policies.
        #
        # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be
        #          reparametrized to sample actions from the policies.
        #
        # p_masks: This is a sequence of "unmasking" masks. When one of these
        #          masking variables is 1, the corresponding value in self.x_out
        #          will be "revealed" to the primary policy. Prediction error
        #          is measured for a value only the first time it is revealed.
        #          Once revealed, a value remains "visible" to the policy.
        #          The final step should reveal all values.
        #
        # q_masks: This is a sequence of "unmasking" masks. These are similar
        #          to p_masks, but control which values are revealed to the
        #          guide policy. The guide policy masking sequence should be
        #          constructed to stay "ahead of" the primary policy's masking
        #          sequence. The guide policy needs to know which values will
        #          be revealed to the primary policy so that it can focus its
        #          reconstruction efforts on those values. Otherwise, the guide
        #          policy will immediately reconstruct the entire target.
        #
        # si: This is the current "belief state" for each trial in the training
        #     batch. The belief state is updated in each iteration, and passed
        #     forward through the recurrence.
        #
        # mi_p: This is the current revelation mask for the primary policy.
        #
        # mi_q: This is the current revelation mask for the guide policy.
        #
        def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            # transform the current belief state into an observation
            si_as_x = self._from_si_to_x(si)
            # softplus(ss) keeps the step scale positive; the "gradient" is
            # the scaled residual between target and current reconstruction
            full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)

            # get the masked belief state and gradient for primary policy:
            # revealed entries show the target, the rest show the current guess
            xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
            grad_for_p = mi_p * full_grad

            # update the guide policy's revelation mask
            new_to_q = (1.0 - mi_q) * q_masks
            mip1_q = mi_q + new_to_q
            # get the masked belief state and gradient for guide policy.
            # The guide sees the same masked state as the primary policy but
            # a gradient masked by its own, already-updated mask.
            # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x)
            xi_for_q = xi_for_p
            grad_for_q = mip1_q * full_grad

            # get samples of next zi, according to the primary policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
                T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False
            )
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            # (reparametrized with the same noise draw as the primary policy)
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False
            )
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            # (train_switch == 1 -> guide samples, 0 -> primary samples)
            zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)  # KL(p || N(0, I))

            # compute next si, given sampled zi (i.e. update the belief state)
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == "jump":
                # jump steps always do a full swap of belief state
                sip1 = si_step
            else:
                # additive steps adjust the belief state like an LSTM; the
                # +2.0 offset biases both gates towards open (sigmoid(2) ~ 0.88)
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # update the primary policy's revelation mask
            new_to_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + new_to_p
            # compute NLL only for the newly revealed values
            nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)
            # each loop iteration produces the following values:
            #   sip1: belief state at end of current step
            #   mip1_p: revealed values mask to use in next step (primary)
            #   mip1_q: revealed values mask to use in next step (guide)
            #   nlli: NLL for values revealed at end of current step
            #   kldi_q2p: KL(q || p) for the current step
            #   kldi_p2q: KL(p || q) for the current step
            #   kldi_p2g: KL(p || N(0,I)) for the current step
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # initialize belief state to self.s0 (replicated once per row of x_out)
        self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0
        # initialize revelation masks to 0 for all values in all trials
        self.m0_full = T.zeros_like(self.x_out)
        # setup initial values to pass to scan op; the three non-None entries
        # seed the recurrent outputs (si, mi_p, mi_q), while the four None
        # entries mark the NLL/KLd outputs as collected but not fed back
        outputs_init = [self.s0_full, self.m0_full, self.m0_full, None, None, None, None]
        sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            srr_step_func, outputs_info=outputs_init, sequences=sequences_init
        )

        # grab results of the scan op. all values are computed for each step
        self.si = self.scan_results[0]  # belief states
        self.mi_p = self.scan_results[1]  # primary revelation masks
        self.mi_q = self.scan_results[2]  # guide revelation masks
        self.nlli = self.scan_results[3]  # NLL on newly revealed values
        self.kldi_q2p = self.scan_results[4]  # KL(q || p)
        self.kldi_p2q = self.scan_results[5]  # KL(p || q)
        self.kldi_p2g = self.scan_results[6]  # KL(p || N(0,I))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        # NOTE(review): the same zero_ary buffer initialises every shared var
        # below; this relies on theano.shared copying its value -- confirm.
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name="srr_lr")
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1")
        self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2")
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p")
        self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q")
        self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g")
        self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s")
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w")
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters: the three variables owned
        # by this model plus the parameters of the four base networks
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # weighted sum of the four KLd terms, then averaged into a scalar
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (
            (self.lam_kld_p[0] * self.kld_p)
            + (self.lam_kld_q[0] * self.kld_q)
            + (self.lam_kld_g[0] * self.kld_g)
            + (self.lam_kld_s[0] * self.kld_s)
        )
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        # bound = summed NLL plus the *unweighted* kld_q term
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        # (one T.grad call over the whole list, then keyed by parameter)
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads,
            alpha=self.lr,
            beta1=self.mom_1,
            beta2=self.mom_2,
            mom2_init=1e-3,
            smoothing=1e-5,
            max_grad_norm=10.0,
        )
        # fold in any updates produced by the scan op itself
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        # self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return