def __init__(self, rng=None,
        Xd=None, Yd=None, Xc=None, Xm=None,
        g_net=None, i_net=None, p_net=None,
        data_dim=None, prior_dim=None, label_dim=None,
        params=None):
    """Wire an inferencer, a generator and a label predictor into one graph.

    Every symbolic input is stacked on top of itself, pushed through
    shared-parameter clones of i_net -> g_net and i_net -> p_net, and the
    resulting costs, gradients, Adam updates and the joint training
    function are built and stored on the instance.

    Args:
        rng: random source; must provide ``randint`` and is also handed to
            each ``shared_param_clone`` call.
        Xd, Yd, Xc, Xm: symbolic inputs recorded on the instance.
        g_net: generator; must have ``use_decoder`` falsy.
        i_net: latent inferencer; must have ``use_encoder`` falsy.
        p_net: label predictor with exactly one proto-net and one spawn-net.
        data_dim, prior_dim, label_dim: dimensionalities; ``prior_dim`` and
            ``label_dim`` are checked against the cloned networks.
        params: accepted for interface compatibility; this constructor does
            not read it.
    """
    # TODO: refactor for use with "encoded" inferencer/generator
    assert(not (i_net.use_encoder or g_net.use_decoder))
    # private random stream for this GIStack
    self.rng = RandStream(rng.randint(100000))

    # symbolic inputs, plus "doubled" versions made by stacking each input
    # vertically on itself
    self.Xd, self.Yd, self.Xc, self.Xm = Xd, Yd, Xc, Xm
    self.Xd2 = T.vertical_stack(self.Xd, self.Xd)
    self.Yd2 = T.vertical_stack(self.Yd, self.Yd)
    self.Xc2 = T.vertical_stack(self.Xc, self.Xc)
    self.Xm2 = T.vertical_stack(self.Xm, self.Xm)
    self.obs_count = T.cast(self.Xd2.shape[0], 'floatX')

    # dimensionality of the spaces this GIStack works with
    self.data_dim = data_dim
    self.label_dim = label_dim
    self.prior_dim = prior_dim

    # inferencer clone -> latent samples
    self.IN2 = i_net.shared_param_clone(
        rng=rng, Xd=self.Xd2, Xc=self.Xc2, Xm=self.Xm2)
    self.Xp2 = self.IN2.output
    # latent samples -> generator clone -> generated observations
    self.GN2 = g_net.shared_param_clone(rng=rng, Xp=self.Xp2)
    self.Xg2 = self.GN2.output
    # latent samples -> label predictor clone -> noisy / noise-free outputs
    self.PN2 = p_net.shared_param_clone(rng=rng, Xd=self.Xp2)
    self.Yp2 = self.PN2.output_spawn[0]
    self.Yp2_proto = self.PN2.output_proto

    # the PeaNet must have exactly one proto-net and one spawn-net
    assert(len(self.PN2.proto_nets) == 1)
    assert(len(self.PN2.spawn_nets) == 1)
    # every network must agree on the latent dimension
    assert(self.prior_dim == self.IN2.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN2.sigma_layers[-1].out_dim)
    assert(self.prior_dim == self.GN2.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.PN2.proto_nets[0][0].in_dim)
    # the label predictor must emit label_dim outputs
    assert(self.label_dim == self.PN2.proto_nets[0][-1].out_dim)

    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def zero_shared(var_name):
        # one-element shared variable initialised from zero_ary
        return theano.shared(value=zero_ary, name=var_name)

    # learning rates, momentum terms and iteration counter
    self.lr_gn = zero_shared('gis_lr_gn')
    self.lr_in = zero_shared('gis_lr_in')
    self.lr_pn = zero_shared('gis_lr_pn')
    self.mom_1 = zero_shared('gis_mom_1')
    self.mom_2 = zero_shared('gis_mom_2')
    self.it_count = zero_shared('gis_it_count')
    self.set_all_sgd_params()
    # weight on the data NLL given a posterior sample
    self.lam_nll = zero_shared('gis_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weight on the posterior KL-divergence from the prior
    self.lam_kld = zero_shared('gis_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # weight on the semi-supervised classification cost
    self.lam_cat = zero_shared('gis_lam_cat')
    self.set_lam_cat(lam_cat=0.0)
    # weights on the PEA cost for (un)supervised inputs
    self.lam_pea_su = zero_shared('gis_lam_pea_su')
    self.lam_pea_un = zero_shared('gis_lam_pea_un')
    self.set_lam_pea(lam_pea_su=1.0, lam_pea_un=1.0)
    # weight on l2 regularisation of the parameters
    self.lam_l2w = zero_shared('gis_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-3)

    # parameters that will be optimised, per network and jointly
    self.gn_params = list(self.GN2.mlp_params)
    self.in_params = list(self.IN2.mlp_params)
    self.pn_params = list(self.PN2.proto_params)
    self.joint_params = self.pn_params + self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    pea_cost_su, pea_cost_un = self._construct_post_pea_costs()
    self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
    self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
    self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
    self.post_pea_cost = (self.lam_pea_su[0] * pea_cost_su) + \
            (self.lam_pea_un[0] * pea_cost_un)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
            self.post_cat_cost + self.post_pea_cost + self.other_reg_cost

    # element-wise clipped gradient of the joint cost for each parameter
    self.joint_grads = OrderedDict(
        (p, T.grad(self.joint_cost, p).clip(-0.1, 0.1))
        for p in self.joint_params)

    # Adam updates per network; only the parameter list and learning rate
    # differ between the three calls
    adam_kwargs = dict(grads=self.joint_grads,
                       beta1=self.mom_1, beta2=self.mom_2,
                       it_count=self.it_count,
                       mom2_init=1e-3, smoothing=1e-8)
    self.gn_updates = get_adam_updates(params=self.gn_params,
                                       alpha=self.lr_gn, **adam_kwargs)
    self.in_updates = get_adam_updates(params=self.in_params,
                                       alpha=self.lr_in, **adam_kwargs)
    self.pn_updates = get_adam_updates(params=self.pn_params,
                                       alpha=self.lr_pn, **adam_kwargs)
    # NOTE: an alternative built on get_adadelta_updates(..., beta1=0.98)
    # was left commented out at this point in the original source.

    # merge the per-network updates, in generator/inferencer/predictor order
    self.joint_updates = OrderedDict()
    for net_updates in (self.gn_updates, self.in_updates, self.pn_updates):
        self.joint_updates.update(net_updates)

    # one training function for all parameters; training of individual
    # networks can be switched on and off via their learning rates
    self.train_joint = self._construct_train_joint()
def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None, \
        obs_dim=None, z_dim=None, params=None):
    """Pair a generator with a discriminator and build their training graph.

    Clones g_net and d_net (sharing parameters), feeds generator samples
    and data samples to the discriminator, and constructs the collaborative
    classification costs, the moment matching cost, the gradients, the Adam
    updates and the compiled training/sampling functions.

    Args:
        rng: random source; must provide ``randint`` and is also handed to
            the ``shared_param_clone`` calls and ``_construct_disc_layers``.
        Xd: symbolic input for samples from the data distribution.
        Xp: symbolic input for samples from the generator's prior.
        d_net: discriminator network to clone.
        g_net: generator network to clone; its first shared layer must
            take ``z_dim`` inputs.
        obs_dim: dimension of the observation space.
        z_dim: dimension of the generator's latent input.
        params: mapping that must contain 'lam_l2d', 'mom_mix_rate',
            'mom_match_weight', 'target_mean' and 'target_cov', and may
            contain 'obs_transform' ('sigmoid' or 'none'; sigmoid when
            absent) and 'mom_match_proj'.
    """
    self.rng = RandStream(rng.randint(100000))
    self.obs_dim = obs_dim
    self.z_dim = z_dim
    self.params = params
    # check that z_dim agrees with input dim for g_net
    assert(self.z_dim == g_net.shared_layers[0].in_dim)
    # set the transform on generator's raw output
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or \
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    # symbolic var for inputting samples from the data distribution
    self.Xd = Xd
    # symbolic var for inputting samples from the generator's prior
    self.Xp = Xp
    # symbolic vector of indices for data inputs
    self.Id = T.lvector(name='gcp_Id')
    # symbolic vector of indices for noise inputs
    self.In = T.lvector(name='gcp_In')
    # create clones of the given generator and discriminator, after
    # rewiring their computation graphs to take the right inputs
    self.GN = g_net.shared_param_clone(rng=rng, Xd=self.Xp)
    self.out_mean, self.out_logvar, self.out_samples = \
            self.GN.apply(self.Xp, do_samples=True)
    self.Xg = self.obs_transform(self.out_samples)
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.Xd, self.Xg))

    # shared var learning rate for generator and discriminator
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
    self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
    # shared var momentum parameters for generator and discriminator
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
    # shared var weights for collaborative classification objective
    self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
    self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
    # init parameters for controlling learning dynamics
    self.set_sgd_params() # init SGD rate/momentum
    self.set_disc_weights() # init collaborative cost weights for GN/DN
    self.lam_l2d = theano.shared(value=(zero_ary + self.params['lam_l2d']), \
            name='gcp_lam_l2d')

    #######################################################
    # Welcome to: Moment Matching Cost Information Center #
    #######################################################
    #
    # Get parameters for managing the moment matching cost. The moment
    # matching is based on exponentially-decaying estimates of the mean
    # and covariance of the distribution induced by the generator network
    # and the (latent) noise being fed to it.
    #
    # We provide the option of performing moment matching with either the
    # raw generator output, or with linearly-transformed generator output.
    # Either way, the given target mean and covariance should have the
    # appropriate dimension for the space in which we'll be matching the
    # generator's 1st/2nd moments with the target's 1st/2nd moments. For
    # clarity, the computation we'll perform looks like:
    #
    #   Xm = X - np.mean(X, axis=0)
    #   XmP = np.dot(Xm, P)
    #   C = np.dot(XmP.T, XmP)
    #
    # where Xm is the mean-centered samples from the generator and P is
    # the matrix for the linear transform to apply prior to computing
    # the moment matching cost. For simplicity, the above code ignores the
    # use of an exponentially decaying average to track the estimated mean
    # and covariance of the generator's output distribution.
    #
    # The relative contribution of the current batch to these running
    # estimates is determined by self.mom_mix_rate. The mean estimate is
    # first updated based on the current batch, then the current batch
    # is centered with the updated mean, then the covariance estimate is
    # updated with the mean-centered samples in the current batch.
    #
    # Strength of the moment matching cost is given by
    # self.mom_match_weight. Target mean/covariance are given by
    # self.target_mean/self.target_cov. If a linear transform is to be
    # applied prior to matching, it is given by self.mom_match_proj.
    #
    C_init = to_fX( np.zeros((self.obs_dim, self.obs_dim)) )
    m_init = to_fX( np.zeros((self.obs_dim,)) )
    self.dist_cov = theano.shared(C_init, name='gcp_dist_cov')
    self.dist_mean = theano.shared(m_init, name='gcp_dist_mean')
    zero_ary = np.zeros((1,))
    mmr = zero_ary + self.params['mom_mix_rate']
    self.mom_mix_rate = theano.shared(name='gcp_mom_mix_rate', \
            value=to_fX(mmr))
    mmw = zero_ary + self.params['mom_match_weight']
    self.mom_match_weight = theano.shared(name='gcp_mom_match_weight', \
            value=to_fX(mmw))
    targ_mean = to_fX( self.params['target_mean'] )
    targ_cov = to_fX( self.params['target_cov'] )
    assert(targ_mean.size == targ_cov.shape[0]) # mean and cov use same dim
    assert(targ_cov.shape[0] == targ_cov.shape[1]) # cov must be square
    self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
    self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')
    mmp = np.identity(targ_cov.shape[0]) # default to identity transform
    if 'mom_match_proj' in self.params:
        mmp = self.params['mom_match_proj'] # use a user-specified transform
    # the shape checks apply to the default identity transform as well
    assert(mmp.shape[0] == self.obs_dim) # transform matches data dim
    assert(mmp.shape[1] == targ_cov.shape[0]) # and matches mean/cov dims
    mmp = to_fX( mmp )
    self.mom_match_proj = theano.shared(value=mmp, name='gcp_mom_map_proj')
    # finally, we can construct the moment matching cost! and the updates
    # for the running mean/covariance estimates too!
    self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()
    #########################################
    # Thank you for visiting the M.M.C.I.C. #
    #########################################

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because the GCPair requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this GCPair.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.dn_params + self.gn_params
    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on collaborative binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # compute small l2 penalty on params
    self.dn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.dn_params])
    self.gn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.gn_params])

    # Cost w.r.t. discriminator parameters is the collaborative binary
    # classification cost plus the discriminator-layer activation penalty
    # and the small l2 penalty. Cost w.r.t. generator parameters comprises
    # a collaborative binary classification cost, the (weighted) moment
    # matching cost and the small l2 penalty.
    self.dn_cost = self.disc_cost_dn + self.disc_reg_cost + self.dn_l2_cost
    self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.gn_l2_cost
    self.joint_cost = self.dn_cost + self.gn_cost

    # Compute gradients on generator and discriminator parameters
    print("Computing gradients on generator...")
    self.gn_grads = OrderedDict()
    grad_list = T.grad(self.gn_cost, self.gn_params)
    for i, p in enumerate(self.gn_params):
        self.gn_grads[p] = grad_list[i]
    print("Computing gradients on discriminator...")
    self.dn_grads = OrderedDict()
    grad_list = T.grad(self.dn_cost, self.dn_params)
    for i, p in enumerate(self.dn_params):
        self.dn_grads[p] = grad_list[i]

    # Construct the Adam updates for the discriminator and generator
    self.dn_updates = get_adam_updates(params=self.dn_params, \
            grads=self.dn_grads, alpha=self.lr_dn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.gn_updates = get_adam_updates(params=self.gn_params, \
            grads=self.gn_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    # Add the updates for the generator distribution's running first and
    # second-order moment estimates to the generator's updates. This has
    # to happen AFTER the Adam updates are built: previously the moment
    # updates were stored in self.gn_updates first and then discarded when
    # that attribute was rebound to the result of get_adam_updates.
    for var in self.mom_updates:
        self.gn_updates[var] = self.mom_updates[var]

    # Bag up everything needed for joint training (moment updates first,
    # then discriminator, then generator)
    self.joint_updates = OrderedDict()
    for var in self.mom_updates:
        self.joint_updates[var] = self.mom_updates[var]
    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]

    # Construct batch-based training functions for the generator and
    # discriminator networks, as well as a joint training function.
    print("Compiling generator training function...")
    self.train_gn = self._construct_train_gn()
    print("Compiling discriminator training function...")
    self.train_dn = self._construct_train_dn()
    print("Compiling joint training function...")
    self.train_joint = self._construct_train_joint()

    # Construct a function for computing the outputs of the generator
    # network for a batch of noise. Presumably, the noise will be drawn
    # from the same distribution that was used in training....
    self.sample_from_gn = self._construct_model_sampler()
    return
def __init__(self, rng=None, x_out=None, \
        p_z_given_x=None, \
        p_x_given_z=None, \
        params=None, \
        shared_param_dicts=None):
    """Build the symbolic walk-out loop and the training graph around it.

    Args:
        rng: random source; must provide ``randint``.
        x_out: symbolic variable used as the initial x state of the loop.
        p_z_given_x: network whose ``apply(x, do_samples=False)`` returns a
            (mean, logvar) pair for z.
        p_x_given_z: network whose ``apply(z, do_samples=False)`` returns a
            (mean, logvar) pair for x.
        params: mapping that must contain 'x_dim', 'z_dim', 'walkout_steps'
            and 'x_type' ('bernoulli' or 'gaussian'), and may contain
            'x_transform' ('sigmoid' or 'none'; sigmoid when absent).
        shared_param_dicts: when None, a new 'obs_logvar' shared variable
            is created and recorded; otherwise 'obs_logvar' is read from
            the given dict.

    NOTE(review): this method reads several attributes that it never
    assigns (self.step_type, self.xi_zmuv, self.s0, self.step_scales,
    self.p_zi_given_xi, self.p_sip1_given_zi, self.p_x_given_si,
    self.q_zi_given_xi, self.nlli). Unless they are provided elsewhere on
    the class, construction will fail -- confirm before relying on it.
    """
    # setup a rng for this WalkoutModel
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.walkout_steps = self.params['walkout_steps']
    self.x_type = self.params['x_type']
    self.shared_param_dicts = shared_param_dicts
    # pick the transform applied to the raw mean produced by p_x_given_z
    if 'x_transform' in self.params:
        assert((self.params['x_transform'] == 'sigmoid') or \
                (self.params['x_transform'] == 'none'))
        if self.params['x_transform'] == 'sigmoid':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.x_transform = lambda x: x
    else:
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    # bernoulli observations always use sigmoid, overriding 'x_transform'
    if self.x_type == 'bernoulli':
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # NOTE(review): self.step_type is not assigned anywhere in this method;
    # presumably it was meant to come from params -- confirm.
    assert((self.step_type == 'add') or (self.step_type == 'jump'))

    # grab handles to the relevant networks
    self.p_z_given_x = p_z_given_x
    self.p_x_given_z = p_x_given_z

    # record the symbolic variables that will provide inputs to the
    # computation graph created for this WalkoutModel
    self.x_out = x_out # target output for generation
    self.zi_zmuv = T.tensor3() # ZMUV gauss noise for walk-out wobble

    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        zero_ary = to_fX( np.zeros((1,)) )
        self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
        # tanh squashing keeps the effective log-variance within (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

    ###############################################################
    # Setup the forwards (i.e. training) walk-out loop using scan #
    ###############################################################
    # Step function for theano.scan. The first two arguments are the
    # per-step slices of the noise sequences; the last two are the x/z
    # states fed back from the previous step (only the first two entries
    # of outputs_init below are non-None).
    def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
        # get samples of next zi, according to the forwards model
        zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
        # check reverse direction probability p(xi_fw | zi_fw)
        xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_bw_mean = self.x_transform(xi_bw_mean)
        # NOTE(review): the result is named "nll" but is taken directly
        # from log_prob_gaussian2 with no negation here -- confirm the
        # sign convention of that helper.
        nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                log_vars=xi_bw_logvar, mask=None)
        nll_xi_bw = nll_xi_bw.flatten()
        # get samples of next xi, according to the forwards model
        xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_fw_mean = self.x_transform(xi_fw_mean)
        xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
        # check reverse direction probability p(zi_fw | xi_fw)
        zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                log_vars=zi_bw_logvar, mask=None)
        nll_zi_bw = nll_zi_bw.flatten()
        # each loop iteration produces the following values:
        #   xi_fw: xi generated from zi by forwards walk
        #   zi_fw: zi generated from xi by forwards walk
        #   xi_fw_mean: ----
        #   xi_fw_logvar: ----
        #   zi_fw_mean: ----
        #   zi_fw_logvar: ----
        #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
        #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
        return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

    # initialize states for x/z
    self.x0 = self.x_out
    self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
    # setup initial values to pass to scan op
    outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
    # NOTE(review): only self.zi_zmuv is created in this method;
    # self.xi_zmuv is not assigned here -- confirm where it comes from.
    sequences_init = [self.xi_zmuv, self.zi_zmuv]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
            outputs_info=outputs_init, \
            sequences=sequences_init)
    # grab results of the scan op. all values are computed for each step
    # (indices follow the order of forwards_loop's return tuple)
    self.xi = self.scan_results[0]
    self.zi = self.scan_results[1]
    self.xi_fw_mean = self.scan_results[2]
    self.xi_fw_logvar = self.scan_results[3]
    self.zi_fw_mean = self.scan_results[4]
    self.zi_fw_logvar = self.scan_results[5]
    self.nll_xi_bw = self.scan_results[6]
    self.nll_zi_bw = self.scan_results[7]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr = theano.shared(value=zero_ary, name='srr_lr')
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
    self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    # NOTE(review): self.s0, self.step_scales and the four networks read
    # below (p_zi_given_xi, p_sip1_given_zi, p_x_given_si, q_zi_given_xi)
    # are not assigned in this method, while the two networks stored
    # above (p_z_given_x, p_x_given_z) are not added to joint_params --
    # confirm this is intended.
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.p_x_given_si.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
    # weighted sum of the four KLD terms, then averaged into a scalar
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
            (self.lam_kld_q[0] * self.kld_q) + \
            (self.lam_kld_g[0] * self.kld_g) + \
            (self.lam_kld_s[0] * self.kld_s)
    self.kld_cost = T.mean(self.kld_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NOTE(review): self.nlli is not assigned in this method -- confirm.
    self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # the updates returned by theano.scan are merged into the training
    # updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None,
        Xd=None, Xc=None, Xm=None,
        g_net=None, i_net=None,
        data_dim=None, prior_dim=None,
        params=None, shared_param_dicts=None):
    """Couple an inferencer with a generator and build the training graph.

    The masked input is pushed through a shared-parameter clone of i_net;
    its output feeds a shared-parameter clone of g_net. Costs, gradients,
    Adam updates and the compiled helper functions are stored on the
    instance.

    Args:
        rng: random source; must provide ``randint`` and is also handed to
            each ``shared_param_clone`` call.
        Xd, Xc, Xm: symbolic inputs, combined through ``apply_mask``.
        g_net: generator network to clone.
        i_net: inferencer network to clone; its ``use_encoder`` flag must
            equal ``g_net.use_decoder``.
        data_dim: expected generator output / inferencer input dimension.
        prior_dim: expected generator input / inferencer mu and sigma
            output dimension.
        params: optional parameter dict; an empty dict is used when None.
        shared_param_dicts: None for an original instance; otherwise the
            dict of shared variables owned by the instance being cloned.
    """
    # private random stream for this GIPair
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params

    # symbolic inputs of the computation graph
    self.Xd, self.Xc, self.Xm = Xd, Xc, Xm

    # both networks must agree on whether "encoded" inputs are in use
    self.use_encoder = i_net.use_encoder
    print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format( \
            str(i_net.use_encoder), str(g_net.use_decoder)))
    assert(self.use_encoder == g_net.use_decoder)

    # inferencer clone, reading the masked input
    masked_input = apply_mask(self.Xd, self.Xc, self.Xm)
    self.IN = i_net.shared_param_clone(rng=rng, Xd=masked_input)
    self.posterior_means = self.IN.output_mean
    self.posterior_sigmas = self.IN.output_sigma
    squared_means = self.posterior_means**2.0
    self.posterior_norms = T.sqrt(T.sum(squared_means, axis=1, keepdims=1))
    self.posterior_klds = self.IN.kld_cost
    self.kld2_scale = self.IN.kld2_scale
    # samples from the variational posterior
    self.Xp = self.IN.output
    # generator clone, reading those samples
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # sampled reconstructions from the generator
    self.Xg = self.GN.output

    # record the dimensions, then validate them against the clones
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    # generator output and inferencer input live in data space
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # generator input and inferencer mu/sigma outputs live in prior space
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

    # a given shared_param_dicts marks this instance as a clone
    self.is_clone = shared_param_dicts is not None
    self.shared_param_dicts = shared_param_dicts if self.is_clone else {}

    # (attribute name, dict key) for every variable shared among clones
    shared_names = (
        ('lr_gn', 'gip_lr_gn'),
        ('lr_in', 'gip_lr_in'),
        ('mom_1', 'gip_mom_1'),
        ('mom_2', 'gip_mom_2'),
        ('it_count', 'gip_it_count'),
        ('lam_nll', 'gip_lam_nll'),
        ('lam_kld', 'gip_lam_kld'),
        ('lam_l2w', 'gip_lam_l2w'),
    )
    if self.is_clone:
        # reuse the variables owned by the "base" GIPair
        for attr_name, dict_key in shared_names:
            setattr(self, attr_name, self.shared_param_dicts[dict_key])
    else:
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)

        def zero_shared(var_name):
            # one-element shared variable initialised from zero_ary
            return theano.shared(value=zero_ary, name=var_name)

        # learning rates, momentum terms and iteration counter
        self.lr_gn = zero_shared('gip_lr_gn')
        self.lr_in = zero_shared('gip_lr_in')
        self.mom_1 = zero_shared('gip_mom_1')
        self.mom_2 = zero_shared('gip_mom_2')
        self.it_count = zero_shared('gip_it_count')
        self.set_all_sgd_params()
        # weight on the data NLL given a posterior sample
        self.lam_nll = zero_shared('gip_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on the prior KLD relative to reconstruction
        self.lam_kld = zero_shared('gip_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on l2 regularisation of the parameters
        self.lam_l2w = zero_shared('gip_lam_l2w')
        self.set_lam_l2w(1e-4)
        # publish the variables so that clones can pick them up
        for attr_name, dict_key in shared_names:
            self.shared_param_dicts[dict_key] = getattr(self, attr_name)

    # parameters that will be optimised, per network and jointly
    self.in_params = list(self.IN.mlp_params)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
    post_kld = self._construct_post_kld_cost(kld2_scale=self.kld2_scale)
    self.post_kld_cost = self.lam_kld[0] * post_kld
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
            self.other_reg_cost

    # gradient of the joint cost for each optimisable parameter
    self.joint_grads = OrderedDict(
        (p, T.grad(self.joint_cost, p)) for p in self.joint_params)

    # Adam updates per network; only the parameter list and learning rate
    # differ between the two calls
    adam_kwargs = dict(grads=self.joint_grads,
                       beta1=self.mom_1, beta2=self.mom_2,
                       it_count=self.it_count,
                       mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.gn_updates = get_adam_updates(params=self.gn_params,
                                       alpha=self.lr_gn, **adam_kwargs)
    self.in_updates = get_adam_updates(params=self.in_params,
                                       alpha=self.lr_in, **adam_kwargs)
    # merge generator then inferencer updates, plus the inferencer's
    # kld_mean update
    self.joint_updates = OrderedDict()
    for net_updates in (self.gn_updates, self.in_updates):
        self.joint_updates.update(net_updates)
    self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update

    # compiled functions for training and diagnostics
    self.train_joint = self._construct_train_joint()
    self.compute_costs = self._construct_compute_costs()
    self.compute_ll_bound = self._construct_compute_ll_bound()
    self.compute_post_stats = self._construct_compute_post_stats()
def __init__(self, rng=None, x_in=None,
             p_x_given_z=None, q_z_given_x=None,
             x_dim=None, z_dim=None,
             params=None):
    """
    Build the symbolic graph, costs, parameter updates and compiled
    functions for a one-stage model (shared variables are named 'osm_*').

    Inputs:
        rng: random source exposing randint(); seeds this model's RandStream
        x_in: symbolic variable handed to q_z_given_x.apply()
        p_x_given_z: generator; apply(z) is unpacked as (xt, <ignored>)
        q_z_given_x: inferencer; apply(x) is unpacked as (z_mean, z_logvar)
        x_dim: stored as self.x_dim
        z_dim: stored as self.z_dim
        params: dict of options (None is treated as an empty dict)
            'x_type': required, 'bernoulli' or 'gaussian'; since it is
                      required, leaving params as None raises KeyError
            'xt_transform': optional, 'sigmoid' (default) or 'none'
            'logvar_bound': optional, defaults to 10.0
    """
    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # user options; a missing dict behaves like an empty one
    self.params = {} if params is None else params

    # transform applied to the generator output for non-bernoulli data
    xt_kind = self.params.get('xt_transform', 'sigmoid')
    assert xt_kind in ('sigmoid', 'none')
    if xt_kind == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x

    # magnitude bound applied to the learned output log-variance
    self.logvar_bound = self.params.get('logvar_bound', 10.0)

    # x_type selects a bernoulli or gaussian model for the observations
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # dimensions of the observation and latent spaces
    self.x_dim = x_dim
    self.z_dim = z_dim

    # parameters of the isotropic Gaussian prior over z
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # symbolic input to the computation graph of this model
    self.x_in = x_in

    #####################################################################
    # Setup the computation graph that provides values in our objective #
    #####################################################################
    # inferencer: observations -> posterior mean/logvar over z
    self.q_z_given_x = q_z_given_x
    self.z_mean, self.z_logvar = self.q_z_given_x.apply(self.x_in)
    # turn the posterior statistics into latent samples
    self.z = reparametrize(self.z_mean, self.z_logvar, rng=self.rng)
    # generator: latent samples -> pre-transform observations
    self.p_x_given_z = p_x_given_z
    self.xt, _ = self.p_x_given_z.apply(self.z)
    # bernoulli outputs always go through a sigmoid; otherwise the
    # user-selected transform is used
    self.xg = (T.nnet.sigmoid(self.xt) if self.x_type == 'bernoulli'
               else self.xt_transform(self.xt))

    # self.output_logvar modifies the output distribution; the tanh keeps
    # the value actually used within +/- logvar_bound
    zero_ary = to_fX(np.zeros((1, )))
    self.output_logvar = theano.shared(value=zero_ary,
                                       name='osm_output_logvar')
    self.bounded_logvar = self.logvar_bound * \
        T.tanh(self.output_logvar[0] / self.logvar_bound)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    zero_ary = to_fX(np.zeros((1, )))
    # learning rate and momentum shared vars for generator and inferencer
    self.lr = theano.shared(value=zero_ary, name='osm_lr')
    self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
    # presumably fills lr/mom_1/mom_2 with defaults (setter defined elsewhere)
    self.set_sgd_params()
    # weight on the NLL of the data given a posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weight on KL(q(z|x) || p(z))
    self.lam_kld = theano.shared(value=zero_ary, name='osm_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # weight on the l2 regularization of the parameters
    self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
    self.set_lam_l2w(1e-4)

    # everything we optimize: the output logvar, then inferencer params,
    # then generator params
    self.joint_params = [self.output_logvar]
    for net in (self.q_z_given_x, self.p_x_given_z):
        self.joint_params.extend(net.mlp_params)

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    # reconstruction term
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    # KL term
    self.kld_costs = self.lam_kld[0] * self._construct_kld_costs()
    self.kld_cost = T.mean(self.kld_costs)
    # parameter regularization term
    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    # the objective is the sum of the three
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # map every optimizable parameter to its gradient of the joint cost
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict(
        zip(self.joint_params, T.grad(self.joint_cost, self.joint_params)))

    # updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # compile the functions exposed by this model
    print("Compiling self.train_joint...")
    self.train_joint = self._construct_train_joint()
    print("Compiling self.compute_fe_terms...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling self.compute_post_klds...")
    self.compute_post_klds = self._construct_compute_post_klds()
    print("Compiling self.sample_from_prior...")
    self.sample_from_prior = self._construct_sample_from_prior()
    # x -> posterior mean of z, and z -> generated observation
    self.transform_x_to_z = theano.function(inputs=[self.x_in],
                                            outputs=self.z_mean)
    self.transform_z_to_x = theano.function(inputs=[self.z],
                                            outputs=self.xg)
    # handles on the first inferencer / last generator weight matrices
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_weights = self.p_x_given_z.output_layers[-1].W
    return
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_h_given_z=None,
             p_x_given_h=None,
             q_z_given_x=None,
             q_h_given_z_x=None,
             x_dim=None,
             z_dim=None,
             h_dim=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the symbolic graph, costs, parameter updates and compiled
    functions for a two-stage model (shared variables are named 'tsm_*').

    Inputs:
        rng: random source exposing randint(); seeds this model's RandStream
        x_in: symbolic variable handed to q_z_given_x.apply()
        x_out: symbolic variable stacked into the input of q_h_given_z_x
               and passed to self._construct_nll_costs()
        p_h_given_z: apply(z) is unpacked as (mean, logvar, samples)
        p_x_given_h: apply(h, do_samples=False) is unpacked as (x_gen, _)
        q_z_given_x: apply(x, do_samples=True) -> (mean, logvar, samples)
        q_h_given_z_x: apply(...) is unpacked as (mean, logvar, samples)
        x_dim, z_dim, h_dim: stored on self; z_dim also sizes p_z_mean and
                             p_z_logvar when they are created here
        params: dict of options; must contain 'x_type' ('bernoulli' or
                'gaussian'), may contain 'obs_transform' ('sigmoid' or
                'none'; sigmoid when absent)
        shared_param_dicts: optional dict with keys 'p_z_mean',
                'p_z_logvar' and 'obs_logvar' to reuse; when None, fresh
                shared variables are created and the dict is built here
    """
    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # user options
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # observation transform: bernoulli data always uses a sigmoid,
    # otherwise the (validated) 'obs_transform' option decides
    obs_kind = self.params.get('obs_transform', 'sigmoid')
    assert obs_kind in ('sigmoid', 'none')
    if (self.x_type == 'bernoulli') or (obs_kind == 'sigmoid'):
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.obs_transform = lambda x: x

    self.shared_param_dicts = shared_param_dicts

    # dimensions of the spaces relevant to this model
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.h_dim = h_dim

    # handles to the inference (q) and generative (p) networks
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h

    # symbolic inputs to the computation graph of this model
    self.x_in = x_in
    self.x_out = x_out

    # switch between training (1.0: use q samples) and sampling (0.0: use
    # p samples); it weights the two sample sources below
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # create the "optimizable" parameters specific to this model
        init_vec = to_fX( np.zeros((1,self.z_dim)) )
        self.shared_param_dicts = {
            'p_z_mean': theano.shared(value=init_vec, name='tsm_p_z_mean'),
            'p_z_logvar': theano.shared(value=init_vec, name='tsm_p_z_logvar'),
            'obs_logvar': theano.shared(value=zero_ary, name='tsm_obs_logvar'),
        }
    # whether freshly created or handed in, read the parameters back out
    self.p_z_mean = self.shared_param_dicts['p_z_mean']
    self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
    self.obs_logvar = self.shared_param_dicts['obs_logvar']
    # tanh keeps the log-variance actually used within +/- 8
    self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

    ##############################################
    # Setup the TwoStageModels main computation. #
    ##############################################
    print("Building TSM...")
    # z from the posterior q(z|x)
    zq_mu, zq_lv, zq_smp = \
        self.q_z_given_x.apply(self.x_in, do_samples=True)
    # z from the prior: repeat the prior stats once per row of zq_smp
    zp_mu = self.p_z_mean.repeat(zq_smp.shape[0], axis=0)
    zp_lv = self.p_z_logvar.repeat(zq_smp.shape[0], axis=0)
    zmuv = self.rng.normal(size=zq_smp.shape, avg=0.0, std=1.0,
                           dtype=theano.config.floatX)
    zp_smp = (T.exp(0.5*zp_lv) * zmuv) + zp_mu
    # blend of q and p samples, weighted by the train switch
    self.z = (self.train_switch[0] * zq_smp) + \
        ((1.0 - self.train_switch[0]) * zp_smp)
    # KL terms for z, in both directions
    self.kld_z_q2p = gaussian_kld(zq_mu, zq_lv, zp_mu, zp_lv)
    self.kld_z_p2q = gaussian_kld(zp_mu, zp_lv, zq_mu, zq_lv)

    # h from p(h|z), and from q conditioned on p's stats and x_out
    hp_mu, hp_lv, hp_smp = self.p_h_given_z.apply(self.z)
    hq_mu, hq_lv, hq_smp = self.q_h_given_z_x.apply(
        T.horizontal_stack(hp_mu, hp_lv, self.x_out))
    self.h = (self.train_switch[0] * hq_smp) + \
        ((1.0 - self.train_switch[0]) * hp_smp)
    # KL terms for h, in both directions
    self.kld_h_q2p = gaussian_kld(hq_mu, hq_lv, hp_mu, hp_lv)
    self.kld_h_p2q = gaussian_kld(hp_mu, hp_lv, hq_mu, hq_lv)

    # p_x_given_h generates an observation x conditioned on h
    self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    zero_ary = to_fX( np.zeros((1,)) )
    # learning rate and momentum shared vars for generator and inferencer
    self.lr = theano.shared(value=zero_ary, name='tsm_lr')
    self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
    # presumably fills lr/mom_1/mom_2 with defaults (setter defined elsewhere)
    self.set_sgd_params()
    # weight on the NLL of the data given a posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # weight on the l2 regularization of the parameters
    self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # parameters owned by this model; only obs_logvar is optimized here
    # (p_z_mean / p_z_logvar are not part of joint_params)
    self_params = [self.obs_logvar]
    # parameters owned by the underlying networks, q nets first
    child_params = []
    for net in (self.q_z_given_x, self.q_h_given_z_x,
                self.p_h_given_z, self.p_x_given_h):
        child_params.extend(net.mlp_params)
    # joint list of all optimizable parameters
    self.joint_params = self_params + child_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_z_p2q)
    self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_h_p2q)
    # per-input KL cost: sum each term over axis 1
    self.kld_costs = T.sum(self.kld_z, axis=1) + \
        T.sum(self.kld_h, axis=1)
    # "mean" (rather than per-input) cost
    self.kld_cost = T.mean(self.kld_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # map every optimizable parameter to its gradient of the joint cost
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict(
        zip(self.joint_params, T.grad(self.joint_cost, self.joint_params)))

    # updates for the generator and inferencer networks, copied into an
    # OrderedDict owned by this model
    all_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
    self.joint_updates = OrderedDict((k, all_updates[k]) for k in all_updates)

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    return
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_s0_given_z=None,
             p_hi_given_si=None,
             p_sip1_given_si_hi=None,
             q_z_given_x=None,
             q_hi_given_x_si=None,
             obs_dim=None,
             z_dim=None, h_dim=None,
             ir_steps=4, params=None,
             shared_param_dicts=None):
    """
    Build the symbolic graph (including the scan-based refinement loop),
    costs, parameter updates and compiled functions for a multi-stage
    model (shared variables are named 'msm_*').

    Inputs:
        rng: random source exposing randint(); seeds this model's RandStream
        x_in: symbolic variable; a dropout-masked copy feeds q_z_given_x
        x_out: symbolic variable used as the reconstruction target
        p_s0_given_z: apply(z, do_samples=False) is unpacked as (s0, _)
        p_hi_given_si: apply(obs, do_samples=False) -> (mean, logvar)
        p_sip1_given_si_hi: apply(hi) is unpacked into three values used
                            as input-gate, forget-gate and input values
        q_z_given_x: apply(x, do_samples=True) -> (mean, logvar, samples)
        q_hi_given_x_si: apply(..., do_samples=False) -> (mean, logvar)
        obs_dim, z_dim, h_dim, ir_steps: stored on self; z_dim also sizes
                p_z_mean / p_z_logvar when they are created here. The
                length of the scan is set by self.hi_zmuv, not ir_steps.
        params: dict of options; must contain 'x_type' ('bernoulli' or
                'gaussian'), may contain 'obs_transform' ('sigmoid' or
                'none'; sigmoid when absent)
        shared_param_dicts: optional dict with keys 'p_z_mean',
                'p_z_logvar' and 'obs_logvar' to reuse; when None, fresh
                shared variables are created and the dict is built here
    """
    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # state -> observation transform. The "sigmoid" variant first squashes
    # its input through 20*tanh(0.05*x), so the sigmoid argument stays
    # within +/- 20. Bernoulli data always uses this variant.
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or
               (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
    if self.x_type == 'bernoulli':
        self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
    self.shared_param_dicts = shared_param_dicts
    # record the dimensions of various spaces relevant to this model
    self.obs_dim = obs_dim
    self.z_dim = z_dim
    self.h_dim = h_dim
    self.ir_steps = ir_steps
    # grab handles to the relevant InfNets
    self.q_z_given_x = q_z_given_x
    self.q_hi_given_x_si = q_hi_given_x_si
    self.p_s0_given_z = p_s0_given_z
    self.p_hi_given_si = p_hi_given_si
    self.p_sip1_given_si_hi = p_sip1_given_si_hi
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    # ZMUV Gaussian samples consumed by scan, one slice per step
    self.hi_zmuv = T.tensor3()
    # setup switching variable for changing between sampling/training
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
    self.set_train_switch(1.0)
    # setup a variable for controlling dropout noise
    self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
    self.set_drop_rate(0.0)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
    self.set_lam_kld_l1l2(1.0)

    if self.shared_param_dicts is None:
        # initialize "optimizable" parameters specific to this MSM
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
        # obs_logvar is a single shared value (built from zero_ary), not
        # one entry per observation dimension
        self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
        self.shared_param_dicts = {}
        self.shared_param_dicts['p_z_mean'] = self.p_z_mean
        self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # reuse parameters handed in by the caller
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

    # Reconstruction cost function. NOTE: despite its name, log_prob_func
    # returns the NEGATED log-probability, i.e. an NLL to be minimized.
    if self.x_type == 'bernoulli':
        self.log_prob_func = lambda xo, xh: \
            (-1.0 * log_prob_bernoulli(xo, xh))
    else:
        self.log_prob_func = lambda xo, xh: \
            (-1.0 * log_prob_gaussian2(xo, xh,
                                       log_vars=self.bounded_logvar))

    # get a drop mask that drops things with probability p; kept entries
    # are rescaled by 1 / (1 - p)
    drop_scale = 1. / (1. - self.drop_rate[0])
    drop_rnd = self.rng.uniform(size=self.x_out.shape,
                                low=0.0, high=1.0, dtype=theano.config.floatX)
    drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

    #############################
    # Setup self.z and self.s0. #
    #############################
    print("Building MSM step 0...")
    drop_x = drop_mask * self.x_in
    self.q_z_mean, self.q_z_logvar, self.z = \
        self.q_z_given_x.apply(drop_x, do_samples=True)
    # get initial observation state
    self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)
    # gather KLd and NLL for the initialization step
    self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                  self.p_z_mean, self.p_z_logvar)
    # log_prob_func already yields an NLL, so it is used without a further
    # sign flip -- the same convention as nlli inside the scan step
    self.init_nlls = \
        self.log_prob_func(self.x_out, self.obs_transform(self.s0))

    ##################################################
    # Setup the iterative generation loop using scan #
    ##################################################
    def ir_step_func(hi_zmuv, sim1):
        # One refinement step: hi_zmuv is this step's slice of the noise
        # sequence, sim1 is the state produced by the previous step.
        sim1_obs = self.obs_transform(sim1) # transform state -> obs
        grad_ll = self.x_out - sim1_obs # residual between target and obs

        # prior over hi, conditioned on the current observation estimate
        hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply(
            sim1_obs, do_samples=False)
        # variational posterior over hi, given the residual and estimate
        hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply(
            T.horizontal_stack(grad_ll, sim1_obs),
            do_samples=False)
        # reparametrize the same noise through both distributions
        hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
        hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

        # make hi samples that can be switched between hi_p and hi_q
        hi = ( ((self.train_switch[0] * hi_q) +
                ((1.0 - self.train_switch[0]) * hi_p)) )

        # only hi is passed to p_sip1_given_si_hi here; the previous
        # state enters through the gated update below
        ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
        # gate activations (for an LSTM style update), with a +2.0 offset
        i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
        f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)

        # perform an LSTM-like update of the state sim1 -> si
        si = (in_vals * i_gate) + (sim1 * f_gate)

        # compute generator NLL for this step
        nlli = self.log_prob_func(self.x_out, self.obs_transform(si))

        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar,
                                hi_p_mean, hi_p_logvar)
        kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar,
                                hi_q_mean, hi_q_logvar)
        return si, nlli, kldi_q2p, kldi_p2q

    # only the state is recurrent; the other three outputs are collected
    init_values = [self.s0, None, None, None]

    self.scan_results, self.scan_updates = theano.scan(
        ir_step_func,
        outputs_info=init_values, sequences=self.hi_zmuv)

    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rates: lr_1 drives the q-network updates and
    # lr_2 the p-network updates below
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
    self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1" (q networks)
    self.q_params = []
    self.q_params.extend(self.q_z_given_x.mlp_params)
    self.q_params.extend(self.q_hi_given_x_si.mlp_params)
    # Grab all of the "optimizable" parameters in "group 2" (prior stats
    # and p networks); obs_logvar is in neither group
    self.p_params = [self.p_z_mean, self.p_z_logvar]
    self.p_params.extend(self.p_hi_given_si.mlp_params)
    self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
    self.p_params.extend(self.p_s0_given_z.mlp_params)

    # Make a joint list of parameters group 1/2
    self.joint_params = self.q_params + self.p_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
        self._construct_kld_costs(p=1.0)
    self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_z_p2q)
    self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_hi_p2q)
    self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
    # now do l2 KLd costs
    self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
        self._construct_kld_costs(p=2.0)
    self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
        (self.lam_kld_p2q[0] * self.kl2_z_p2q)
    self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
        (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
    self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
    # compute joint l1/l2 KLd cost
    self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
        ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    self.kl2_cost = T.mean(self.kl2_costs)
    self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL after the final refinement step enters the objective
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
        self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_l1l2_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.q_updates = get_adam_updates(
        params=self.q_params,
        grads=self.joint_grads, alpha=self.lr_1,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.p_updates = get_adam_updates(
        params=self.p_params,
        grads=self.joint_grads, alpha=self.lr_2,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.joint_updates = OrderedDict()
    for k in self.q_updates:
        self.joint_updates[k] = self.q_updates[k]
    for k in self.p_updates:
        self.joint_updates[k] = self.p_updates[k]
    # add scan updates, which seem to be required
    for k in self.scan_updates:
        self.joint_updates[k] = self.scan_updates[k]

    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_klds = self._construct_raw_klds()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    print("Compiling data-guided model sampler...")
    self.sample_from_input = self._construct_sample_from_input()
    return
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_s_given_z=None,
             p_h_given_s=None,
             p_x_given_s_h=None,
             q_z_given_x=None,
             q_h_given_x_s=None,
             x_dim=None,
             z_dim=None,
             s_dim=None,
             h_dim=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the symbolic graph, costs, parameter updates and compiled
    functions for a two-stage (z -> s, then h) model; shared variables
    are named 'msm_*'.

    Inputs:
        rng: random source exposing randint(); seeds this model's RandStream
        x_in: symbolic variable; a dropout-masked copy feeds q_z_given_x
        x_out: symbolic variable stacked into the input of q_h_given_x_s
               and passed to self._construct_nll_costs()
        p_s_given_z: apply(z, do_samples=False) is unpacked as (s, _)
        p_h_given_s: apply(s, do_samples=True) -> (mean, logvar, samples)
        p_x_given_s_h: apply(..., do_samples=False) -> (x_gen, _)
        q_z_given_x: apply(x, do_samples=True) -> (mean, logvar, samples)
        q_h_given_x_s: apply(..., do_samples=True) -> (mean, logvar, samples)
        x_dim, z_dim, s_dim, h_dim: stored on self; z_dim also sizes
                p_z_mean / p_z_logvar when they are created here
        params: dict of options; must contain 'x_type' ('bernoulli' or
                'gaussian'), may contain 'obs_transform' ('sigmoid' or
                'none'; sigmoid when absent)
        shared_param_dicts: optional dict with keys 'p_z_mean',
                'p_z_logvar' and 'obs_logvar' to reuse; when None, fresh
                shared variables are created and the dict is built here
    """
    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # observation transform; bernoulli data always uses the sigmoid
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or
               (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    if self.x_type == 'bernoulli':
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    self.shared_param_dicts = shared_param_dicts
    # record the dimensions of various spaces relevant to this model
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.s_dim = s_dim
    self.h_dim = h_dim
    # grab handles to the relevant InfNets
    self.q_z_given_x = q_z_given_x
    self.q_h_given_x_s = q_h_given_x_s
    self.p_s_given_z = p_s_given_z
    self.p_h_given_s = p_h_given_s
    self.p_x_given_s_h = p_x_given_s_h
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    # symbolic integer scalar; not used further in this constructor
    self.batch_reps = T.lscalar()
    # setup switching variable for changing between sampling/training
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
    self.set_train_switch(1.0)
    # setup a variable for controlling dropout noise
    self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
    self.set_drop_rate(0.0)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
    self.set_lam_kld_l1l2(1.0)

    if self.shared_param_dicts is None:
        # initialize "optimizable" parameters specific to this MSM
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
        # obs_logvar is a single shared value (built from zero_ary), not
        # one entry per observation dimension
        self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
        self.shared_param_dicts = {}
        self.shared_param_dicts['p_z_mean'] = self.p_z_mean
        self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # reuse parameters handed in by the caller
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
    # tanh keeps the log-variance actually used within +/- 8
    self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

    # get a drop mask that drops things with probability p; kept entries
    # are rescaled by 1 / (1 - p)
    drop_scale = 1. / (1. - self.drop_rate[0])
    drop_rnd = self.rng.uniform(size=self.x_out.shape,
                                low=0.0, high=1.0, dtype=theano.config.floatX)
    drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

    ##############################################
    # Setup the TwoStageModels main computation. #
    ##############################################
    print("Building TSM...")
    # samples of "first" latent state
    drop_x = drop_mask * self.x_in
    z_q_mean, z_q_logvar, self.z = \
        self.q_z_given_x.apply(drop_x, do_samples=True)
    # compute relevant KLds for this step
    self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar,
                                   self.p_z_mean, self.p_z_logvar)
    self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                   z_q_mean, z_q_logvar)
    # transform "first" latent state into "second" latent state
    self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)

    # get samples of h, conditioned on current s
    h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply(
        self.s, do_samples=True)
    # get variational samples of h, given s and x_out
    h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply(
        T.horizontal_stack(self.x_out, self.s),
        do_samples=True)

    # make h samples that can be switched between h_p and h_q
    self.h = (self.train_switch[0] * h_q) + \
        ((1.0 - self.train_switch[0]) * h_p)

    # compute relevant KLds for this step
    self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar,
                                   h_p_mean, h_p_logvar)
    self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar,
                                   h_q_mean, h_q_logvar)

    # p_x_given_s_h is conditioned on s and h.
    self.x_gen, _ = self.p_x_given_s_h.apply(
        T.horizontal_stack(self.s, self.h),
        do_samples=False)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rates: lr_1 drives the group 1 updates and
    # lr_2 the group 2 updates below
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
    self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1" (q networks)
    self.group_1_params = []
    self.group_1_params.extend(self.q_z_given_x.mlp_params)
    self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
    # Grab all of the "optimizable" parameters in "group 2" (prior stats
    # and p networks); obs_logvar is in neither group
    self.group_2_params = [self.p_z_mean, self.p_z_logvar]
    self.group_2_params.extend(self.p_s_given_z.mlp_params)
    self.group_2_params.extend(self.p_h_given_s.mlp_params)
    self.group_2_params.extend(self.p_x_given_s_h.mlp_params)

    # Make a joint list of parameters group 1/2
    self.joint_params = self.group_1_params + self.group_2_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
        self._construct_kld_costs(p=1.0)
    self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_z_p2q)
    self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
        (self.lam_kld_p2q[0] * self.kld_h_p2q)
    self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
    # now do l2 KLd costs
    self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
        self._construct_kld_costs(p=2.0)
    self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
        (self.lam_kld_p2q[0] * self.kl2_z_p2q)
    self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \
        (self.lam_kld_p2q[0] * self.kl2_h_p2q)
    self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
    # compute joint l1/l2 KLd cost
    self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
        ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    self.kl2_cost = T.mean(self.kl2_costs)
    self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_l1l2_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.group_1_updates = get_adam_updates(
        params=self.group_1_params,
        grads=self.joint_grads, alpha=self.lr_1,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.group_2_updates = get_adam_updates(
        params=self.group_2_params,
        grads=self.joint_grads, alpha=self.lr_2,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.joint_updates = OrderedDict()
    for k in self.group_1_updates:
        self.joint_updates[k] = self.group_1_updates[k]
    for k in self.group_2_updates:
        self.joint_updates[k] = self.group_2_updates[k]

    # Construct a function for jointly training the generator/inferencer
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    print("Compiling data-guided model sampler...")
    self.sample_from_input = self._construct_sample_from_input()
    # make easy access points for some interesting parameters
    self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W
    return
def test_with_model_init():
    """Build an IMoESDrawModels instance and train it on binarized MNIST.

    The function assembles the reader/writer, mixture, encoder and decoder
    components, compiles Theano functions for training, for evaluating the
    cost terms without updates, and for sampling, and then runs minibatch
    updates. Costs are printed and appended to ``TBM_ES_RESULTS.txt``; every
    1000 updates a validation estimate is logged and grids of model samples
    are saved as PNG files in the working directory.

    Takes no arguments and returns nothing; all results are side effects.
    """
    ##########################
    # Get some training data #
    ##########################
    Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
    del Xte
    tr_samples = Xtr.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 220
    enc_dim = 260
    dec_dim = 260
    mix_dim = 20
    z_dim = 100
    n_iter = 18

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup the reader and writer
    read_dim = 2*x_dim
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim],
                     name="writer_mlp", **inits)

    # setup the mixture weight sampler
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim],
                          name="mix_enc_mlp", **inits)
    mix_dec_mlp = MLP([Tanh(), Tanh()],
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim)],
                      name="mix_dec_mlp", **inits)
    # setup the components of the generative DRAW model
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim],
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4*dec_dim],
                     name="dec_mlp_in", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    dec_mlp_out = CondNet([], [dec_dim, z_dim], name="dec_mlp_out", **inits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0,
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0,
                         name="dec_rnn", **rnninits)
    enc_mlp_stop = MLP([Tanh(), None], [(x_dim + dec_dim), 500, 1],
                       name="enc_mlp_stop", **inits)
    dec_mlp_stop = MLP([Tanh(), None], [dec_dim, 500, 1],
                       name="dec_mlp_stop", **inits)

    draw = IMoESDrawModels(
        n_iter,
        step_type='add',  # step_type can be 'add' or 'jump'
        mix_enc_mlp=mix_enc_mlp,
        mix_dec_mlp=mix_dec_mlp,
        reader_mlp=reader_mlp,
        writer_mlp=writer_mlp,
        enc_mlp_in=enc_mlp_in,
        enc_mlp_out=enc_mlp_out,
        enc_rnn=enc_rnn,
        enc_mlp_stop=enc_mlp_stop,
        dec_mlp_in=dec_mlp_in,
        dec_mlp_out=dec_mlp_out,
        dec_rnn=dec_rnn,
        dec_mlp_stop=dec_mlp_stop)
    draw.initialize()

    # some symbolic vars to represent various inputs/outputs
    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')

    # collect reconstructions of x produced by the IMoDRAW model
    vfe_cost, cost_all = draw.reconstruct(x_in_sym, x_out_sym)

    # grab handles for all the optimizable parameters in our cost
    cg = ComputationGraph([vfe_cost])
    joint_params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = (1e-5 * sum([T.sum(p**2.0) for p in joint_params]))
    reg_term.name = "reg_term"

    # compute the full cost w.r.t. which we will optimize
    total_cost = vfe_cost + reg_term
    total_cost.name = "total_cost"

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of total_cost...")
    joint_grads = OrderedDict()
    grad_list = T.grad(total_cost, joint_params)
    for i, p in enumerate(joint_params):
        joint_grads[p] = grad_list[i]

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    lr_shared = theano.shared(value=zero_ary, name='tbm_lr')
    # shared var momentum parameters for generator and inferencer
    mom_1_shared = theano.shared(value=zero_ary, name='tbm_mom_1')
    mom_2_shared = theano.shared(value=zero_ary, name='tbm_mom_2')
    # construct the updates for the generator and inferencer networks
    joint_updates = get_adam_updates(params=joint_params,
            grads=joint_grads, alpha=lr_shared,
            beta1=mom_1_shared, beta2=mom_2_shared,
            mom2_init=1e-4, smoothing=1e-6, max_grad_norm=10.0)

    # collect the outputs to return from this function
    outputs = [total_cost, vfe_cost, reg_term]

    # compile the theano function
    print("Compiling model training/update function...")
    train_joint = theano.function(inputs=[ x_in_sym, x_out_sym ],
                                  outputs=outputs, updates=joint_updates)
    print("Compiling NLL bound estimator function...")
    compute_nll_bound = theano.function(inputs=[ x_in_sym, x_out_sym],
                                        outputs=outputs)
    print("Compiling model sampler...")
    n_samples = T.iscalar("n_samples")
    samples = draw.sample(n_samples)
    do_sample = theano.function([n_samples], outputs=samples,
                                allow_input_downcast=True)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    costs = [0.0] * 10
    learn_rate = 0.0002
    momentum = 0.9
    # start past the end of the data so the first pass triggers a shuffle
    fresh_idx = np.arange(batch_size) + tr_samples
    # The log is opened in text mode because only str is written to it, and
    # the context manager guarantees it is closed even if training raises.
    with open("TBM_ES_RESULTS.txt", 'w') as out_file:
        for i in range(250000):
            # linear warm-up of learning rate and momentum over 2500 updates
            scale = min(1.0, ((i+1) / 2500.0))
            if (((i + 1) % 10000) == 0):
                learn_rate = learn_rate * 0.95
            # get the indices of training samples for this batch update
            fresh_idx += batch_size
            if (np.max(fresh_idx) >= tr_samples):
                # we finished an "epoch", so we rejumble the training set
                Xtr = row_shuffle(Xtr)
                fresh_idx = np.arange(batch_size)
            batch_idx = fresh_idx
            # set sgd and objective function hyperparams for this update
            zero_ary = np.zeros((1,))
            lr_shared.set_value(to_fX(zero_ary + scale*learn_rate))
            mom_1_shared.set_value(to_fX(zero_ary + scale*momentum))
            mom_2_shared.set_value(to_fX(zero_ary + 0.99))

            # perform a minibatch update and record the cost for this batch
            Xb = to_fX( Xtr.take(batch_idx, axis=0) )
            result = train_joint(Xb, Xb)

            # aggregate costs over multiple minibatches
            costs = [(c + r) for c, r in zip(costs, result)]
            if ((i % 200) == 0):
                # occasionally dump information about the costs
                # NOTE(review): the sums are divided by 200 even for the
                # first report (i == 0), when only one batch has been seen.
                costs = [(v / 200.0) for v in costs]
                str1 = "-- batch {0:d} --".format(i)
                str2 = " total_cost: {0:.4f}".format(costs[0])
                str3 = " nll_bound : {0:.4f}".format(costs[1])
                str4 = " reg_term : {0:.4f}".format(costs[2])
                joint_str = "\n".join([str1, str2, str3, str4])
                print(joint_str)
                out_file.write(joint_str+"\n")
                out_file.flush()
                costs = [0.0] * len(costs)
            if ((i % 1000) == 0):
                # compute a small-sample estimate of NLL bound on validation set
                Xva = row_shuffle(Xva)
                Xb = to_fX(Xva[:5000])
                va_costs = compute_nll_bound(Xb, Xb)
                str1 = " va_nll_bound : {}".format(va_costs[1])
                joint_str = "\n".join([str1])
                print(joint_str)
                out_file.write(joint_str+"\n")
                out_file.flush()
                # draw some independent samples from the model
                samples = do_sample(16*16)
                n_steps, n_obs, _ = samples.shape
                samples = samples.reshape( (n_steps, n_obs, 28, 28) )
                for j in range(n_steps):
                    img = img_grid(samples[j,:,:,:])
                    img.save("TBM-ES-samples-b%06d-%03d.png" % (i, j))
def __init__(self, rng=None, Xd=None, \
        g_net=None, i_net=None, pn_seq=None, \
        data_dim=None, prior_dim=None, \
        params=None):
    """Wire an inferencer/generator pair to a PeaNetSeq and build its costs.

    Parameters (as used by this constructor):
        rng: object with ``randint``; seeds this object's RandStream and is
            forwarded to every ``shared_param_clone`` call.
        Xd: symbolic input; also the variable the PeaNetSeq cost is
            differentiated with respect to.
        g_net: generator; must expose ``out_type`` and ``shared_param_clone``.
        i_net: inferencer; must expose ``shared_param_clone``.
        pn_seq: PeaNetSeq to clone with ``seq_len=2`` and ``no_funcs=True``.
        data_dim: expected generator output / inferencer input dimension.
        prior_dim: expected generator input / inferencer output dimension.
        params: optional dict; ``'match_type'`` may be ``'grad_dir'`` or
            ``'grad_sign'`` (default ``'grad_sign'``).

    Raises:
        AssertionError: if ``match_type`` is invalid, if ``g_net.out_type``
            does not fit the chosen match type, or if layer dimensions
            disagree with ``data_dim`` / ``prior_dim``.
    """
    # setup a rng for this AEDPair
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params
    # Look the option up in self.params (not the raw argument) so that
    # params=None falls back to the default instead of raising TypeError.
    self.match_type = self.params.get('match_type', 'grad_sign')
    # we can only try to match sign or direction...
    assert((self.match_type == 'grad_dir') or \
           (self.match_type == 'grad_sign'))
    if self.match_type == 'grad_dir':
        # we match the direction of the gradient under the assumption
        # of gaussian observation noise
        self.mean_transform = lambda x: max_normalize(x, axis=1)
        assert(g_net.out_type == 'gaussian')
    else:
        # we match the sign of the gradient as if it were a collection
        # of independent binary variables
        self.mean_transform = lambda x: 2.0 * (x - 0.5)
        assert(g_net.out_type == 'bernoulli')

    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this AEDPair
    self.Xd = Xd
    self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq
    # the inferencer's Xc and Xm inputs are fed all-zero tensors shaped like Xd
    self.Xc = 0.0 * self.Xd
    self.Xm = 0.0 * self.Xd
    self.obs_count = T.cast(Xd.shape[0], 'floatX')

    # create a "shared-parameter" clone of the inferencer, set up to
    # receive input from the appropriate symbolic variables.
    self.IN = i_net.shared_param_clone(rng=rng, \
            Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
    self.policy_mean = self.IN.output_mean
    self.policy_logvar = self.IN.output_logvar
    # capture a handle for samples from the variational posterior
    self.Xp = self.IN.output
    # create a "shared-parameter" clone of the generator, set up to
    # receive input from samples from the variational posterior
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # set up a var for controlling the max-norm bound on perturbations
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    self.lam_mnb = theano.shared(value=zero_ary, \
            name='adp_lam_mnb')
    self.set_lam_mnb(lam_mnb=0.1)
    # get the perturbations output by the generator network
    self.Pg = self.mean_transform(self.GN.output)
    if self.match_type == 'grad_dir':
        # samples because we're matching gradient via squared error
        self.Pg_samples = self.mean_transform(self.GN.output_samples)
    else:
        # no samples, because we're matching gradient sign
        self.Pg_samples = self.mean_transform(self.GN.output)

    # record and validate the data dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    # output of the generator and input to the inferencer should both be
    # equal to self.data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # input of the generator and mu/sigma outputs of the inferencer should
    # both be equal to self.prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

    # make a clone of the target PeaNetSeq that takes perturbed inputs
    self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \
            seq_Xd=[self.Xd, self.Xd], seq_Yd=[self.Yd, self.Yd], \
            no_funcs=True)
    self.grad_pea_Xd = T.grad(self.PNS.joint_cost, self.Xd)
    if self.match_type == 'grad_dir':
        # turn gradient into a unit max-normalized vector
        self.match_target = max_normalize(self.grad_pea_Xd)
    else:
        # transform gradient into binary indicators of sign
        self.match_target = (self.grad_pea_Xd > 0.0)
    # get the symbolic vars for passing inputs to self.PNS
    self.Xd_seq = self.PNS.Xd_seq
    self.Yd_seq = self.PNS.Yd_seq
    self.seq_inputs = self.Xd_seq + self.Yd_seq

    # shared var learning rate for generator and inferencer
    self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    # init shared var for weighting the adversarial cost term
    self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
    self.set_lam_adv(lam_adv=1.0)
    # init shared vars for weighting a penalty on the norms of our learned
    # policies and a reward to encourage maximizing their entropy.
    self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
    self.set_lam_kld(lam_kld=0.1)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
    self.set_lam_l2w(1e-4)

    # Grab the full set of "optimizable" parameters from the generator
    # and inferencer networks that we'll be working with.
    self.in_params = [p for p in self.IN.mlp_params]
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
    self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.adv_cost + self.kld_cost + \
            self.other_reg_cost

    # Get the gradient of the joint cost for all optimizable parameters;
    # each gradient is clipped elementwise to [-0.1, 0.1]
    self.joint_grads = OrderedDict()
    for p in self.joint_params:
        self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1)

    # Construct the updates for the generator and inferencer networks
    self.gn_updates = get_adam_updates(params=self.gn_params, \
            grads=self.joint_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8)
    self.in_updates = get_adam_updates(params=self.in_params, \
            grads=self.joint_grads, alpha=self.lr_in, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8)
    self.joint_updates = OrderedDict()
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]
    for k in self.in_updates:
        self.joint_updates[k] = self.in_updates[k]

    # Construct a function for jointly training the generator/inferencer
    self.train_joint = self._construct_train_joint()

    # Construct a function for computing the outputs of the generator
    # network for a batch of noise. Presumably, the noise will be drawn
    # from the same distribution that was used in training....
    self.sample_from_gn = self.GN.sample_from_model
    self.sample_from_Xd = self._construct_sample_from_Xd()
    return
def __init__(self, rng=None, \
        x_in=None, y_in=None, \
        q_z_given_x=None, \
        class_count=None, \
        z_dim=None, \
        use_samples=None):
    """Build a linear classifier on top of an inference network.

    The constructor applies ``q_z_given_x`` to ``x_in``, multiplies the
    result of ``apply_shared`` by a rescaled dropout mask, adds a linear
    classification layer, and assembles the NLL, KL and regularisation
    costs together with their Adam updates and compiled functions.

    Parameters (as used here):
        rng: object with ``randint``; seeds this model's RandStream.
        x_in: symbolic input passed to the inference network.
        y_in: symbolic target passed to ``_construct_nll_costs``.
        q_z_given_x: inference network; must provide ``shared_layers``,
            ``apply``, ``apply_shared`` and ``mlp_params``.
        class_count: number of columns of the classification weights.
        z_dim: length of the learned prior mean / log-variance vectors.
        use_samples: stored on the instance; not consulted by this
            constructor.
    """
    def _unit_shared(var_name):
        # fresh one-element floatX shared variable initialised to zero
        return theano.shared(value=to_fX( np.zeros((1,)) ), name=var_name)

    # setup a rng for this model
    self.rng = RandStream(rng.randint(100000))

    # inputs, the inference network, and the relevant dimensions
    self.x_in = x_in
    self.y_in = y_in
    self.q_z_given_x = q_z_given_x
    self.class_count = class_count
    self.z_dim = z_dim
    self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
    self.use_samples = use_samples

    # shared variable controlling the dropout noise
    self.drop_rate = _unit_shared('cm_drop_rate')
    self.set_drop_rate(0.0)

    # classification layer: small random weights, zero biases
    class_weights = to_fX(0.01 * npr.randn(self.shared_dim, self.class_count))
    class_biases = to_fX( np.zeros((self.class_count,)) )
    self.W_class = theano.shared(value=class_weights, name='cm_W_class')
    self.b_class = theano.shared(value=class_biases, name='cm_b_class')

    # learned prior over z, initialised to zero mean and zero log-variance
    prior_init = to_fX( np.zeros((self.z_dim,)) )
    self.p_z_mean = theano.shared(value=prior_init, name='cm_p_z_mean')
    self.p_z_logvar = theano.shared(value=prior_init, name='cm_p_z_logvar')

    #################
    # Setup self.z. #
    #################
    posterior = self.q_z_given_x.apply(self.x_in, do_samples=True)
    self.q_z_mean, self.q_z_logvar, self.q_z_samples = posterior
    # the samples returned above are replaced by the apply_shared output
    self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)

    # dropout mask: keep entries whose uniform draw exceeds the drop rate,
    # and rescale the kept entries by 1 / (1 - drop rate)
    keep_scale = 1. / (1. - self.drop_rate[0])
    uniform_draws = self.rng.uniform(size=self.q_z_samples.shape, \
            low=0.0, high=1.0, dtype=theano.config.floatX)
    keep_mask = keep_scale * (uniform_draws > self.drop_rate[0])
    # the masked apply_shared output is always used, whatever use_samples is
    self.z = self.q_z_samples * keep_mask

    # compute class predictions
    self.y_out = T.dot(self.z, self.W_class) + self.b_class

    # KL divergences between posterior and prior, in both directions
    self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                   self.p_z_mean, self.p_z_logvar)
    self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \
                                   self.q_z_mean, self.q_z_logvar)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # learning rates and momentum terms for the optimiser
    self.lr_1 = _unit_shared('cm_lr_1')
    self.lr_2 = _unit_shared('cm_lr_2')
    self.mom_1 = _unit_shared('cm_mom_1')
    self.mom_2 = _unit_shared('cm_mom_2')
    self.set_sgd_params()
    # weight on the NLL term
    self.lam_nll = _unit_shared('cm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = _unit_shared('cm_lam_kld_q2p')
    self.lam_kld_p2q = _unit_shared('cm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
    # weight on the l2 parameter regulariser
    self.lam_l2w = _unit_shared('cm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # everything the optimiser is allowed to change
    self.joint_params = [self.p_z_mean, self.p_z_logvar, \
                         self.W_class, self.b_class] + \
                        list(self.q_z_given_x.mlp_params)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
    self.nll_cost = T.mean(self.nll_costs)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
    weighted_q2p = self.lam_kld_q2p[0] * self.kld_z_q2p
    weighted_p2q = self.lam_kld_p2q[0] * self.kld_z_p2q
    self.kld_costs = weighted_q2p + weighted_p2q
    self.kld_cost = T.mean(self.kld_costs)

    ##################################
    # CONSTRUCT THE FINAL JOINT COST #
    ##################################
    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    all_grads = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, all_grads))

    # Construct the updates for the model parameters
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr_1, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # Compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling class error estimator...")
    self.class_error = self._construct_class_error()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    # make easy access points for some interesting parameters
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
    return
def __init__(self, rng=None, x_d=None, x_t=None, \
        i_net=None, g_net=None, d_net=None, \
        chain_len=None, data_dim=None, z_dim=None, \
        params=None):
    """Unroll a VAE into a Markov chain and pair it with a discriminator.

    A OneStageModel built from ``i_net`` and ``g_net`` is looped through
    ``theano.scan`` starting from ``x_d``. The first ``chain_len`` chain
    outputs are stacked beneath ``x_t`` and fed to a shared-parameter clone
    of ``d_net``. Costs, gradients and Adam updates are then assembled for
    the discriminator, inferencer and generator.

    Parameters (as used here):
        rng: object with ``randint``; seeds the RandStream and is forwarded
            to the sub-models.
        x_d: symbolic input that starts the chain.
        x_t: symbolic samples from the target distribution.
        i_net, g_net: inferencer and generator passed to OneStageModel.
        d_net: discriminator; must provide ``shared_param_clone``.
        chain_len: Python int, number of chain outputs given to ``d_net``.
        data_dim, z_dim: dimensions forwarded to OneStageModel.
        params: dict of options. ``'x_type'`` (``'bernoulli'`` or
            ``'gaussian'``) and ``'lam_l2d'`` are required; ``'cost_decay'``
            (default 0.1), ``'chain_type'`` (``'walkback'``/``'walkout'``,
            default ``'walkout'``), ``'xt_transform'``
            (``'sigmoid'``/``'none'``, default ``'sigmoid'``) and
            ``'logvar_bound'`` (default 10) are optional.

    Raises:
        KeyError: if a required option is missing from ``params``.
        AssertionError: if an option holds an unsupported value.
    """
    # Do some stuff!
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.z_dim = z_dim
    self.p_z_mean = 0.0
    self.p_z_logvar = 0.0
    self.params = {} if params is None else params
    self.cost_decay = self.params.get('cost_decay', 0.1)
    self.chain_type = self.params.get('chain_type', 'walkout')
    assert((self.chain_type == 'walkback') or \
           (self.chain_type == 'walkout'))
    xt_kind = self.params.get('xt_transform', 'sigmoid')
    assert((xt_kind == 'sigmoid') or (xt_kind == 'none'))
    if xt_kind == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x
    self.logvar_bound = self.params.get('logvar_bound', 10)
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    #         the observations
    #
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

    # grab symbolic input variables
    self.x_d = x_d             # initial input for starting the chain
    self.x_t = x_t             # samples from target distribution
    self.z_zmuv = T.tensor3()  # ZMUV gaussian samples for use in scan

    # get the number of steps for chain unrolling
    self.chain_len = chain_len

    # symbolic matrix of indices for inputs from target distribution
    self.It = T.arange(self.x_t.shape[0])
    # symbolic matrix of indices for noise/generated inputs
    self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + self.x_t.shape[0]

    # get a clone of the desired VAE, for easy access
    self.OSM = OneStageModel(rng=rng, x_in=self.x_d, \
            p_x_given_z=g_net, q_z_given_x=i_net, \
            x_dim=self.data_dim, z_dim=self.z_dim, \
            params=self.params)
    self.IN = self.OSM.q_z_given_x
    self.GN = self.OSM.p_x_given_z
    self.transform_x_to_z = self.OSM.transform_x_to_z
    self.transform_z_to_x = self.OSM.transform_z_to_x
    self.bounded_logvar = self.OSM.bounded_logvar

    ##################################################
    # Setup the iterative generation loop using scan #
    ##################################################
    def chain_step_func(zi_zmuv, xim1):
        # get mean and logvar of z samples for this step
        zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
        # transform ZMUV samples to get desired samples
        zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
        # get the next generated xi (pre-transformation)
        outputs = self.GN.apply(zi)
        xti = outputs[-1]
        # apply the observation "mean" transform
        xgi = self.xt_transform(xti)
        # 'walkout' scores every step against the chain's starting input;
        # otherwise each step is scored against the previous chain state
        if self.chain_type == 'walkout':
            x_true = self.x_d
        else:
            x_true = xim1
        nlli = self._log_prob(x_true, xgi).flatten()
        kldi = T.sum(gaussian_kld(zi_mean, zi_logvar, \
                     self.p_z_mean, self.p_z_logvar), axis=1)
        return xgi, nlli, kldi

    # apply the scan op; only the generated x is fed back into the next step
    init_values = [self.x_d, None, None]
    self.scan_results, self.scan_updates = \
            theano.scan(chain_step_func, outputs_info=init_values, \
                        sequences=self.z_zmuv)
    # get the outputs of the scan op
    self.xgi = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi = self.scan_results[2]
    self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

    # make a clone of the desired discriminator network, which will try
    # to discriminate between samples from the training data and samples
    # generated by the self-looped VAE chain.
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.x_t, *self.xgi_list))

    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    # init shared var for weighting nll of data given posterior sample
    self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # init shared var for weighting posterior KL-div from prior
    self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # shared var learning rates for all networks
    self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
    self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
    # shared var momentum parameters for all networks
    self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
    # shared var weights for adversarial classification objective
    self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    # init adversarial cost weights for GN/DN
    self.set_disc_weights()
    # set a shared var for regularizing the output of the discriminator;
    # read from self.params like every other option in this constructor
    self.lam_l2d = theano.shared( \
            value=(zero_ary + self.params['lam_l2d']), \
            name='vcg_lam_l2d')

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because this model requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this model.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.in_params = [p for p in self.IN.mlp_params]
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on adversarial binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # first, build the cost to be optimized by the discriminator network,
    # in general this will be treated somewhat independently of the
    # optimization of the generator and inferencer networks.
    self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

    # construct costs relevant to the optimization of the generator and
    # inferencer networks
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.other_reg_cost
    # compute total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    print("Computing VCGLoop joint_grad...")
    # Grab the gradients for all parameters to optimize. Each group is
    # differentiated with a single T.grad call instead of one call per
    # parameter; the discriminator uses dn_cost, the other two osm_cost.
    self.joint_grads = OrderedDict()
    cost_groups = ((self.dn_cost, self.dn_params),
                   (self.osm_cost, self.in_params),
                   (self.osm_cost, self.gn_params))
    for group_cost, group_params in cost_groups:
        group_grads = T.grad(group_cost, group_params)
        for p, g in zip(group_params, group_grads):
            self.joint_grads[p] = g

    # construct the updates for the discriminator, generator and
    # inferencer networks. all networks share the same first/second
    # moment momentum. the networks each have their own learning rates,
    # which lets you turn their learning on/off.
    self.dn_updates = get_adam_updates(params=self.dn_params, \
            grads=self.joint_grads, alpha=self.lr_dn, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    self.in_updates = get_adam_updates(params=self.in_params, \
            grads=self.joint_grads, alpha=self.lr_in, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    self.gn_updates = get_adam_updates(params=self.gn_params, \
            grads=self.joint_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.in_updates:
        self.joint_updates[k] = self.in_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]

    print("Compiling VCGLoop train_joint...")
    # construct the function for training on training data
    self.train_joint = self._construct_train_joint()
    return
def __init__(self, rng=None, pea_net=None, seq_len=2, seq_Xd=None, \
        seq_Yd=None, no_noise=False, no_funcs=False, params=None, \
        shared_param_dict=None):
    """Build a sequence of shared-parameter PeaNet clones and their costs.

    Parameters (as used here):
        rng: object with ``randint``; required. Seeds the RandStream and is
            forwarded to every ``pea_net.shared_param_clone`` call.
        pea_net: network to clone; required. Must provide ``params`` (a dict
            with a ``'spawn_configs'`` entry) and ``shared_param_clone``.
        seq_len: number of clones in the sequence; must be at least 2.
        seq_Xd: optional list of ``seq_len`` symbolic observation inputs;
            fresh matrices are created when omitted.
        seq_Yd: optional list of ``seq_len`` symbolic label inputs; fresh
            integer columns are created when omitted.
        no_noise: when True, the clones are built from spawn configs with
            input noise, bias noise and dropout switched off.
        no_funcs: when True, no Theano functions are compiled and
            ``train_joint`` / ``get_pn_output`` are set to None.
        params: stored as ``self.params``.
        shared_param_dict: when given, this object is treated as a clone and
            reuses the learning-rate, momentum and cost-weight shared
            variables stored in the dict instead of creating new ones.

    Raises:
        AssertionError: if ``rng`` or ``pea_net`` is missing, ``seq_len`` is
            below 2, or a provided input list has the wrong length.
    """
    assert(not (rng is None))
    assert(not (pea_net is None))
    assert(seq_len >= 2)
    if not (seq_Xd is None):
        # if symbolic inputs for the sequence to receive are given when
        # the sequence is created, check if it's the right amount.
        assert(len(seq_Xd) == seq_len)
    if not (seq_Yd is None):
        # if symbolic inputs for the sequence to receive are given when
        # the sequence is created, check if it's the right amount.
        assert(len(seq_Yd) == seq_len)
    self.params = params
    # setup a rng for this PeaNetSeq
    self.rng = RandStream(rng.randint(100000))

    if shared_param_dict is None:
        self.is_clone = False
        self.shared_param_dict = {}
    else:
        print("Inititalizing a PeaNetSeq clone...")
        self.is_clone = True
        self.shared_param_dict = shared_param_dict

    # make param dict for a noiseless version of the PNSeq
    new_pn_params = pea_net.params.copy()
    if no_noise:
        # The copy above is shallow, so the spawn config dicts are still
        # shared with pea_net. Copy each one before editing it, otherwise
        # disabling noise here would also change pea_net's own params.
        # (assumes each spawn config is a plain dict -- TODO confirm)
        quiet_configs = []
        for sc in new_pn_params['spawn_configs']:
            quiet_sc = dict(sc)
            quiet_sc['input_noise'] = 0.0
            quiet_sc['bias_noise'] = 0.0
            quiet_sc['do_dropout'] = False
            quiet_configs.append(quiet_sc)
        new_pn_params['spawn_configs'] = quiet_configs

    # setup the sequence of PeaNet clones
    self.seq_len = seq_len
    self.Xd_seq = []
    self.Yd_seq = []
    self.PN_seq = []
    for i in range(self.seq_len):
        if seq_Xd is None:
            # make new symbolic inputs if none were given
            Xd_i = T.matrix(name="Xd_{0:d}".format(i))
        else:
            # otherwise, use the given symbolic inputs
            Xd_i = seq_Xd[i]
        if seq_Yd is None:
            # create a label vector to be associated with this clone
            Yd_i = T.icol(name="Yd_{0:d}".format(i))
        else:
            # otherwise, use the given symbolic inputs
            Yd_i = seq_Yd[i]
        # add observation/label inputs and the clone to the sequence
        self.Xd_seq.append(Xd_i)
        self.Yd_seq.append(Yd_i)
        self.PN_seq.append(pea_net.shared_param_clone(rng=rng, Xd=Xd_i, \
                params=new_pn_params))
    self.PN = self.PN_seq[0]
    # create the full list of symbolic inputs required for training
    self.seq_inputs = self.Xd_seq + self.Yd_seq

    if not self.is_clone:
        # shared var learning rate for the base network
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lr_pn = theano.shared(value=zero_ary, name='pnseq_lr_pn')
        # shared var momentum parameters for the base network
        self.mom_1 = theano.shared(value=zero_ary, name='pnseq_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='pnseq_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='pnseq_it_count')
        # init parameters for controlling learning dynamics
        self.set_pn_sgd_params()
        # init shared var for weighting PEA cost on supervised inputs
        self.lam_pea_su = theano.shared(value=zero_ary, name='pnseq_lam_pea_su')
        self.set_lam_pea_su(lam_pea_su=1.0)
        # init shared var for weighting PEA cost on unsupervised inputs
        self.lam_pea_un = theano.shared(value=zero_ary, name='pnseq_lam_pea_un')
        self.set_lam_pea_un(lam_pea_un=1.0)
        # init shared var for weighting entropy cost on unsupervised inputs
        self.lam_ent = theano.shared(value=zero_ary, name='pnseq_lam_ent')
        self.set_lam_ent(lam_ent=0.0)
        # init shared var for weighting classification cost on supervised inputs
        self.lam_class = theano.shared(value=zero_ary, name='pnseq_lam_class')
        self.set_lam_class(lam_class=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='pnseq_lam_l2w')
        self.set_lam_l2w(1e-4)
        # make the dict for passing around shared lambdas
        self.shared_param_dict['lr_pn'] = self.lr_pn
        self.shared_param_dict['mom_1'] = self.mom_1
        self.shared_param_dict['mom_2'] = self.mom_2
        self.shared_param_dict['it_count'] = self.it_count
        self.shared_param_dict['lam_pea_su'] = self.lam_pea_su
        self.shared_param_dict['lam_pea_un'] = self.lam_pea_un
        self.shared_param_dict['lam_ent'] = self.lam_ent
        self.shared_param_dict['lam_class'] = self.lam_class
        self.shared_param_dict['lam_l2w'] = self.lam_l2w
    else:
        # copy shared lambdas from the cloning dict
        self.lr_pn = self.shared_param_dict['lr_pn']
        self.mom_1 = self.shared_param_dict['mom_1']
        self.mom_2 = self.shared_param_dict['mom_2']
        self.it_count = self.shared_param_dict['it_count']
        self.lam_pea_su = self.shared_param_dict['lam_pea_su']
        self.lam_pea_un = self.shared_param_dict['lam_pea_un']
        self.lam_ent = self.shared_param_dict['lam_ent']
        self.lam_class = self.shared_param_dict['lam_class']
        self.lam_l2w = self.shared_param_dict['lam_l2w']

    # grab the full set of "optimizable" parameters from the base network
    self.mlp_params = [p for p in self.PN.proto_params]

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.pea_su_cost, self.pea_un_cost = self._construct_pea_costs()
    self.pea_cost = (self.lam_pea_su[0] * self.pea_su_cost) + \
            (self.lam_pea_un[0] * self.pea_un_cost)
    self.ent_cost = self.lam_ent[0] * self._construct_ent_cost()
    self.class_cost = self.lam_class[0] * self._construct_class_cost()
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.pea_cost + self.ent_cost + self.class_cost + \
            self.other_reg_cost

    ######################################################
    # Construct updates for the shared PeaNet parameters #
    ######################################################
    # each gradient is clipped elementwise to [-1, 1]
    self.mlp_grads = OrderedDict()
    for p in self.mlp_params:
        self.mlp_grads[p] = T.grad(self.joint_cost, p).clip(-1.0,1.0)
    # Construct the updates for the base network parameters
    self.mlp_updates = get_adam_updates(params=self.mlp_params, \
            grads=self.mlp_grads, alpha=self.lr_pn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8)

    # Construct a function for training the base network to minimize the
    # sequential PEAR cost
    if no_funcs:
        self.train_joint = None
        self.get_pn_output = None
    else:
        self.train_joint = self._construct_train_joint()
        # make a function for computing outputs of the main PeaNet
        self.get_pn_output = theano.function([self.PN.Xd], \
                outputs=self.PN.output_proto)
    return
def __init__(self, rng=None, x_in=None, x_out=None,
             p_h_given_z=None, p_x_given_h=None,
             q_z_given_x=None, q_h_given_z_x=None,
             x_dim=None, z_dim=None, h_dim=None, h_det_dim=None,
             params=None, shared_param_dicts=None):
    """Build the symbolic graph, costs, updates and compiled functions.

    Args:
        rng: randomness source; only ``rng.randint`` is used here, to seed
            this model's own ``RandStream``.
        x_in: symbolic input handed to ``q_z_given_x.apply``.
        x_out: symbolic target; concatenated into the input of
            ``q_h_given_z_x`` and passed to ``_construct_nll_costs``.
        p_h_given_z, p_x_given_h, q_z_given_x, q_h_given_z_x: networks
            providing ``apply(...)`` (returning a pair) and ``mlp_params``.
        x_dim, z_dim, h_dim: recorded as attributes; ``z_dim`` also sizes
            the prior mean/log-variance created here.
        h_det_dim: when not None, the first ``h_det_dim`` columns of ``h``
            come from the mean of ``p_h_given_z`` rather than a sample.
        params: mapping with required key ``'x_type'`` ('bernoulli' or
            'gaussian') and optional key ``'obs_transform'`` ('sigmoid'
            or 'none').
        shared_param_dicts: optional dict providing ``'p_z_mean'``,
            ``'p_z_logvar'`` and ``'obs_logvar'``; a new one is created
            (and stored on ``self``) when this is None.
    """
    def _sigmoid(x):
        return T.nnet.sigmoid(x)

    def _identity(x):
        return x

    def _zero_scalar(name):
        # one-element zero vector (cast by to_fX) wrapped as a shared var
        return theano.shared(value=to_fX(np.zeros((1,))), name=name)

    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # user-provided configuration
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # choose the output transform: sigmoid unless 'none' was requested,
    # and always sigmoid for bernoulli observations
    squash_output = True
    if 'obs_transform' in self.params:
        requested = self.params['obs_transform']
        assert requested in ('sigmoid', 'none')
        squash_output = (requested == 'sigmoid')
    if self.x_type == 'bernoulli':
        squash_output = True
    self.obs_transform = _sigmoid if squash_output else _identity

    self.shared_param_dicts = shared_param_dicts

    # dimensions of the spaces this model works with
    self.x_dim, self.z_dim = x_dim, z_dim
    self.h_dim, self.h_det_dim = h_dim, h_det_dim

    # handles to the underlying networks
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h

    # symbolic inputs of the computation graph
    self.x_in = x_in
    self.x_out = x_out

    # switch between training (1.0) and sampling (0.0) behaviour
    self.train_switch = _zero_scalar('tsm_train_switch')
    self.set_train_switch(1.0)

    # parameters owned by the model itself: created on first construction,
    # otherwise taken from the dict that was passed in
    if self.shared_param_dicts is None:
        prior_init = to_fX(np.zeros((1, self.z_dim)))
        self.shared_param_dicts = {
            'p_z_mean': theano.shared(value=prior_init,
                                      name='tsm_p_z_mean'),
            'p_z_logvar': theano.shared(value=prior_init,
                                        name='tsm_p_z_logvar'),
            'obs_logvar': _zero_scalar('tsm_obs_logvar'),
        }
    self.p_z_mean = self.shared_param_dicts['p_z_mean']
    self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
    self.obs_logvar = self.shared_param_dicts['obs_logvar']
    # tanh squashing keeps the log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

    ###########################
    # Main model computation. #
    ###########################
    print("Building TSM...")
    q_weight = self.train_switch[0]
    p_weight = 1.0 - self.train_switch[0]

    # first latent stage: z from the posterior (q) and from the prior (p)
    zq_mu, zq_logvar = self.q_z_given_x.apply(self.x_in)
    zq_sample = reparametrize(zq_mu, zq_logvar, rng=self.rng)
    batch_size = zq_sample.shape[0]
    zp_mu = self.p_z_mean.repeat(batch_size, axis=0)
    zp_logvar = self.p_z_logvar.repeat(batch_size, axis=0)
    zp_sample = reparametrize(zp_mu, zp_logvar, rng=self.rng)
    self.z = (q_weight * zq_sample) + (p_weight * zp_sample)
    # KL divergences for the first stage, in both directions
    self.kld_z_q2p = gaussian_kld(zq_mu, zq_logvar, zp_mu, zp_logvar)
    self.kld_z_p2q = gaussian_kld(zp_mu, zp_logvar, zq_mu, zq_logvar)

    # second latent stage: h from p(h|z) and from q(h|z,x)
    hp_mu, hp_logvar = self.p_h_given_z.apply(self.z)
    hp_sample = reparametrize(hp_mu, hp_logvar, rng=self.rng)
    hq_input = T.concatenate([hp_mu, self.x_out], axis=1)
    hq_mu, hq_logvar = self.q_h_given_z_x.apply(hq_input)
    hq_sample = reparametrize(hq_mu, hq_logvar, rng=self.rng)
    # stochastic part is switchable; deterministic part is the p-mean
    h_sto = (q_weight * hq_sample) + (p_weight * hp_sample)
    h_det = hp_mu
    if self.h_det_dim is None:
        # no deterministic state is passed forward
        self.h = h_sto
    else:
        # leading columns deterministic, remaining columns stochastic
        self.h = T.concatenate([h_det[:, :self.h_det_dim],
                                h_sto[:, self.h_det_dim:]], axis=1)
    # KL divergences for the second stage, in both directions
    self.kld_h_q2p = gaussian_kld(hq_mu, hq_logvar, hp_mu, hp_logvar)
    self.kld_h_p2q = gaussian_kld(hp_mu, hp_logvar, hq_mu, hq_logvar)

    # observation generated from the hidden state h
    self.x_gen, _ = self.p_x_given_h.apply(self.h)

    ######################################################
    # Shared variables controlling optimization/weights. #
    ######################################################
    self.lr = _zero_scalar('tsm_lr')
    self.mom_1 = _zero_scalar('tsm_mom_1')
    self.mom_2 = _zero_scalar('tsm_mom_2')
    self.set_sgd_params()
    # weight on the data NLL
    self.lam_nll = _zero_scalar('tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = _zero_scalar('tsm_lam_kld_q2p')
    self.lam_kld_p2q = _zero_scalar('tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # weight on the parameter regularizer
    self.lam_l2w = _zero_scalar('tsm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # optimizable parameters: obs_logvar (p_z_mean / p_z_logvar are left
    # out of the optimized set) followed by every network's mlp_params
    own_params = [self.obs_logvar]
    net_params = []
    for net in (self.q_z_given_x, self.q_h_given_z_x,
                self.p_h_given_z, self.p_x_given_h):
        net_params.extend(net.mlp_params)
    self.joint_params = own_params + net_params

    ##########
    # Costs. #
    ##########
    w_q2p = self.lam_kld_q2p[0]
    w_p2q = self.lam_kld_p2q[0]
    self.kld_z = (w_q2p * self.kld_z_q2p) + (w_p2q * self.kld_z_p2q)
    self.kld_h = (w_q2p * self.kld_h_q2p) + (w_p2q * self.kld_h_p2q)
    self.kld_costs = T.sum(self.kld_z, axis=1) + T.sum(self.kld_h, axis=1)
    self.kld_cost = T.mean(self.kld_costs)

    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # per-input cost (no parameter regularization)
    self.obs_costs = self.nll_costs + self.kld_costs

    ##########################
    # Gradients and updates. #
    ##########################
    print("Computing gradients of self.joint_cost...")
    grad_values = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_values))

    adam_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
    self.joint_updates = OrderedDict(
        (key, adam_updates[key]) for key in adam_updates)

    #######################
    # Compiled functions. #
    #######################
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
def __init__(self, rng=None, x_in=None,
             p_s0_obs_given_z_obs=None, p_hi_given_si=None,
             p_sip1_given_si_hi=None, p_x_given_si_hi=None,
             q_z_given_x=None, q_hi_given_x_si=None,
             obs_dim=None, z_dim=None, h_dim=None,
             model_init_obs=True, ir_steps=2,
             params=None):
    """Build the multi-stage model graph, its costs and compiled functions.

    Args:
        rng: randomness source; ``rng.randint`` seeds this model's own
            ``RandStream`` and ``rng`` is forwarded to every
            ``shared_param_clone`` call.
        x_in: symbolic input, stored as ``self.x``.
        p_s0_obs_given_z_obs, p_hi_given_si, p_sip1_given_si_hi,
        q_z_given_x, q_hi_given_x_si: template networks; this constructor
            works on shared-parameter clones of them.
        p_x_given_si_hi: must be None (asserted; not implemented yet).
        obs_dim, z_dim, h_dim: recorded as attributes.
        model_init_obs: if truthy, the initial state uses the output mean
            of ``p_s0_obs_given_z_obs``; otherwise the bias of its last
            mu layer.
        ir_steps: number of refinement steps to unroll.
        params: mapping with required key ``'x_type'`` ('bernoulli' or
            'gaussian') and optional key ``'obs_transform'`` ('sigmoid'
            or 'none').
    """
    def _sigmoid(x):
        return T.nnet.sigmoid(x)

    def _identity(x):
        return x

    def _zero_scalar(name):
        # one-element floatX zero vector wrapped as a shared variable
        zeros = np.zeros((1,)).astype(theano.config.floatX)
        return theano.shared(value=zeros, name=name)

    # private random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    # TODO: implement functionality for working with "latent" si
    assert p_x_given_si_hi is None
    # whether the initial state comes from a model or from a "constant"
    self.model_init_obs = model_init_obs

    # user-provided configuration
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # choose the output transform: sigmoid unless 'none' was requested,
    # and always sigmoid for bernoulli observations
    squash_output = True
    if 'obs_transform' in self.params:
        requested = self.params['obs_transform']
        assert requested in ('sigmoid', 'none')
        squash_output = (requested == 'sigmoid')
    if self.x_type == 'bernoulli':
        squash_output = True
    self.obs_transform = _sigmoid if squash_output else _identity

    # dimensions and unrolling depth
    self.obs_dim, self.z_dim, self.h_dim = obs_dim, z_dim, h_dim
    self.ir_steps = ir_steps

    # symbolic inputs of the computation graph
    self.x = x_in
    self.batch_reps = T.lscalar()

    # switch between training (1.0) and sampling (0.0) behaviour
    self.train_switch = _zero_scalar('msm_train_switch')
    self.set_train_switch(1.0)
    # weight pulling the priors over hi given si towards a shared
    # global prior
    self.kzg_weight = _zero_scalar('msm_kzg_weight')
    self.set_kzg_weight(0.1)
    # balance between l1 and l2 penalty on posterior KLds
    self.l1l2_weight = _zero_scalar('msm_l1l2_weight')
    self.set_l1l2_weight(1.0)
    # dropout rate in the generator read function
    self.drop_rate = _zero_scalar('msm_drop_rate')
    self.set_drop_rate(0.0)

    ###############################
    # Step 0: self.z and self.s0. #
    ###############################
    print("Building MSM step 0...")
    obs_scale = 1.0 if self.model_init_obs else 0.0
    self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
    self.z = self.q_z_given_x.output
    self.p_s0_obs_given_z_obs = \
        p_s0_obs_given_z_obs.shared_param_clone(rng=rng, Xd=self.z)
    s0_net = self.p_s0_obs_given_z_obs
    s0_from_model = s0_net.output_mean
    s0_from_bias = s0_net.mu_layers[-1].b
    # obs_scale is 0.0 or 1.0, so this selects one of the two terms
    self.s0_obs = (obs_scale * s0_from_model) + \
        ((1.0 - obs_scale) * s0_from_bias)
    self.output_logvar = s0_net.sigma_layers[-1].b
    # tanh squashing keeps the log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.output_logvar)

    ##################################################
    # Iterative refinement, starting from self.s0_obs #
    ##################################################
    self.p_hi_given_si = []       # clone of p_hi_given_si per step
    self.p_sip1_given_si_hi = []  # clone of p_sip1_given_si_hi per step
    self.q_hi_given_x_si = []     # clone of q_hi_given_x_si per step
    self.si = [self.s0_obs]       # state before/after each step
    self.hi = []                  # latent sample used at each step
    for step in range(self.ir_steps):
        print("Building MSM step {0:d}...".format(step+1))
        state = self.si[step]
        state_obs = self.obs_transform(state)

        # prior over the next hi, conditioned on the current state
        prior_net = p_hi_given_si.shared_param_clone(rng=rng, Xd=state_obs)
        self.p_hi_given_si.append(prior_net)
        hi_p = prior_net.output

        # variational posterior over hi, conditioned on the residual
        # between x and the current transformed state
        residual = self.x - state_obs
        post_net = q_hi_given_x_si.shared_param_clone(
            rng=rng, Xd=T.horizontal_stack(residual, state_obs))
        self.q_hi_given_x_si.append(post_net)
        hi_q = post_net.output

        # hi switches between posterior and prior samples
        hi_mix = (self.train_switch[0] * hi_q) + \
            ((1.0 - self.train_switch[0]) * hi_p)
        self.hi.append(hi_mix)

        # additive state update computed from hi
        step_net = p_sip1_given_si_hi.shared_param_clone(rng=rng, Xd=hi_mix)
        self.p_sip1_given_si_hi.append(step_net)
        self.si.append(state + step_net.output_mean)

    ######################################################
    # Shared variables controlling optimization/weights. #
    ######################################################
    self.lr_1 = _zero_scalar('msm_lr_1')
    self.lr_2 = _zero_scalar('msm_lr_2')
    self.mom_1 = _zero_scalar('msm_mom_1')
    self.mom_2 = _zero_scalar('msm_mom_2')
    self.set_sgd_params()
    # weight on the data NLL
    self.lam_nll = _zero_scalar('msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the KL terms
    self.lam_kld_1 = _zero_scalar('msm_lam_kld_1')
    self.lam_kld_2 = _zero_scalar('msm_lam_kld_2')
    self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
    # weight on the parameter regularizer
    self.lam_l2w = _zero_scalar('msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # "group 1": parameters of the step-0 networks
    self.group_1_params = list(self.q_z_given_x.mlp_params) + \
        list(self.p_s0_obs_given_z_obs.mlp_params)
    # "group 2": parameters of the refinement-step networks
    self.group_2_params = []
    per_step_nets = zip(self.q_hi_given_x_si, self.p_hi_given_si,
                        self.p_sip1_given_si_hi)
    for post_net, prior_net, step_net in per_step_nets:
        self.group_2_params.extend(post_net.mlp_params)
        self.group_2_params.extend(prior_net.mlp_params)
        self.group_2_params.extend(step_net.mlp_params)
    self.joint_params = self.group_1_params + self.group_2_params

    ##########
    # Costs. #
    ##########
    kld_parts = self._construct_kld_costs()
    self.kld_z, self.kld_hi_cond, self.kld_hi_glob = kld_parts
    self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \
        (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) +
                              (self.kzg_weight[0] * T.mean(self.kld_hi_glob))))

    self.nll_costs = self._construct_nll_costs()
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##########################
    # Gradients and updates. #
    ##########################
    print("Computing gradients of self.joint_cost...")
    grad_values = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_values))

    # both groups share every Adam setting except the learning rate
    adam_settings = dict(beta1=self.mom_1, beta2=self.mom_2,
                         mom2_init=1e-3, smoothing=1e-5,
                         max_grad_norm=10.0)
    self.group_1_updates = get_adam_updates(
        params=self.group_1_params, grads=self.joint_grads,
        alpha=self.lr_1, **adam_settings)
    self.group_2_updates = get_adam_updates(
        params=self.group_2_params, grads=self.joint_grads,
        alpha=self.lr_2, **adam_settings)
    self.joint_updates = OrderedDict()
    for group_updates in (self.group_1_updates, self.group_2_updates):
        for key in group_updates:
            self.joint_updates[key] = group_updates[key]

    #######################
    # Compiled functions. #
    #######################
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    self.compute_post_klds = self._construct_compute_post_klds()
    self.compute_fe_terms = self._construct_compute_fe_terms()
    self.sample_from_prior = self._construct_sample_from_prior()

    # easy access points for some interesting parameters
    self.inf_1_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W
    self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W
    self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W
    self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W
def __init__(self, rng=None,
             Xd=None, Xc=None, Xm=None,
             g_net=None, i_net=None,
             data_dim=None, prior_dim=None,
             g_net_2=None, i_net_2=None,
             prior_dim_2=None,
             params=None, shared_param_dicts=None):
    """Build a stacked pair of generator/inferencer models.

    The "bottom" pair (``i_net`` / ``g_net``) models the masked input; the
    "top" pair (``i_net_2`` / ``g_net_2``) models the posterior samples
    emitted by the bottom inferencer. Costs, clipped gradients, Adam
    updates and training functions are built for the bottom pair, the top
    pair, and both jointly.

    Args:
        rng: randomness source; ``rng.randint`` seeds this object's own
            ``RandStream`` and ``rng`` is forwarded to every
            ``shared_param_clone`` call.
        Xd, Xc, Xm: symbolic inputs combined by ``apply_mask`` before
            entering the bottom inferencer.
        g_net, i_net: bottom generator / inferencer templates.
        data_dim: must equal the bottom generator's output size and the
            bottom inferencer's input size (asserted).
        prior_dim: must equal the bottom generator's input size and the
            bottom inferencer's mu/sigma output sizes (asserted).
        g_net_2, i_net_2: top generator / inferencer templates.
        prior_dim_2: must equal the top generator's input size and the
            top inferencer's mu/sigma output sizes (asserted).
        params: optional configuration; stored as ``{}`` when None.
        shared_param_dicts: when given, this object is treated as a clone
            and reuses the learning-rate / momentum / weighting shared
            variables stored under the ``'gip_*'`` keys of that dict.
    """
    # attribute names of the shared variables that clones reuse; each is
    # stored in shared_param_dicts under the key 'gip_' + attribute name
    shared_attrs = ('lr_gn', 'lr_in', 'mom_1', 'mom_2',
                    'it_count_bot', 'it_count_top', 'it_count_joint',
                    'lam_nll', 'lam_kld', 'lam_l2w')

    def _zero_scalar(name):
        # one-element floatX zero vector wrapped as a shared variable
        zeros = np.zeros((1,)).astype(theano.config.floatX)
        return theano.shared(value=zeros, name=name)

    def _clipped_grads(cost, wrt):
        # One T.grad call for the whole parameter list (matching the other
        # model constructors in this file) rather than one call per
        # parameter; each gradient is clipped elementwise to [-0.1, 0.1].
        grad_values = T.grad(cost, wrt)
        return OrderedDict((p, g.clip(-0.1, 0.1))
                           for p, g in zip(wrt, grad_values))

    def _adam(wrt, grads, step_size, counter):
        # all Adam updates here share momentum and smoothing settings
        return get_adam_updates(params=wrt, grads=grads, alpha=step_size,
                                beta1=self.mom_1, beta2=self.mom_2,
                                it_count=counter,
                                mom2_init=1e-3, smoothing=1e-8)

    def _merged(*update_dicts):
        # later dicts win on duplicate keys, as with sequential assignment
        merged = OrderedDict()
        for updates in update_dicts:
            for key in updates:
                merged[key] = updates[key]
        return merged

    # private random stream for this object, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params

    # symbolic inputs of the computation graph
    self.Xd = Xd
    self.Xc = Xc
    self.Xm = Xm

    # check whether we'll be working with "encoded" inputs
    self.use_encoder = i_net.use_encoder
    print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format(
        str(i_net.use_encoder), str(g_net.use_decoder)))
    assert(self.use_encoder == g_net.use_decoder)

    # bottom inferencer: shared-parameter clone fed with the masked input
    self.IN = i_net.shared_param_clone(
        rng=rng, Xd=apply_mask(Xd=self.Xd, Xc=self.Xc, Xm=self.Xm))
    self.posterior_means = self.IN.output_mean
    self.posterior_sigmas = self.IN.output_sigma
    self.posterior_norms = T.sqrt(
        T.sum(self.posterior_means**2.0, axis=1, keepdims=1))
    self.posterior_klds = self.IN.kld_cost
    self.kld2_scale = self.IN.kld2_scale
    # samples from the bottom variational posterior
    self.Xp = self.IN.output
    # bottom generator: shared-parameter clone fed with those samples
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # sampled reconstructions from the bottom generator
    self.Xg = self.GN.output

    # top pair, stacked on the bottom pair: it models the posterior
    # samples emitted by the bottom inferencer (all-zero Xc and Xm are
    # passed to apply_mask here)
    self.IN2 = i_net_2.shared_param_clone(
        rng=rng, Xd=apply_mask(Xd=self.Xp, Xc=T.zeros_like(self.Xp),
                               Xm=T.zeros_like(self.Xp)))
    # samples from the top variational posterior
    self.Xp2 = self.IN2.output
    self.GN2 = g_net_2.shared_param_clone(rng=rng, Xp=self.Xp2)
    # sampled (latent) reconstructions from the top generator
    self.Xg2 = self.GN2.output

    # record and validate the dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_dim_2 = prior_dim_2
    # bottom generator output and bottom inferencer input: data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # bottom generator input and bottom inferencer mu/sigma outputs:
    # prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
    # top generator input and top inferencer mu/sigma outputs:
    # prior_dim_2
    assert(self.prior_dim_2 == self.GN2.mlp_layers[0].in_dim)
    assert(self.prior_dim_2 == self.IN2.mu_layers[-1].out_dim)
    assert(self.prior_dim_2 == self.IN2.sigma_layers[-1].out_dim)

    # an object built without shared_param_dicts is an original and
    # creates the dict; otherwise it is a clone and reads from it
    self.is_clone = shared_param_dicts is not None
    self.shared_param_dicts = shared_param_dicts if self.is_clone else {}

    if not self.is_clone:
        # learning rates for generator and inferencer
        self.lr_gn = _zero_scalar('gip_lr_gn')
        self.lr_in = _zero_scalar('gip_lr_in')
        # momentum parameters and per-objective iteration counters
        self.mom_1 = _zero_scalar('gip_mom_1')
        self.mom_2 = _zero_scalar('gip_mom_2')
        self.it_count_bot = _zero_scalar('gip_it_count_bot')
        self.it_count_top = _zero_scalar('gip_it_count_top')
        self.it_count_joint = _zero_scalar('gip_it_count_joint')
        self.set_all_sgd_params()
        # weight on the NLL of data given posterior samples
        self.lam_nll = _zero_scalar('gip_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on the posterior KL divergence
        self.lam_kld = _zero_scalar('gip_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on the parameter regularizer
        self.lam_l2w = _zero_scalar('gip_lam_l2w')
        self.set_lam_l2w(1e-4)
        # publish the shared variables so clones can reuse them
        for attr in shared_attrs:
            self.shared_param_dicts['gip_' + attr] = getattr(self, attr)
    else:
        # reuse the shared variables published by the base object
        for attr in shared_attrs:
            setattr(self, attr, self.shared_param_dicts['gip_' + attr])

    # optimizable parameters of the bottom pair, the top pair, and both
    self.in_params = list(self.IN.mlp_params)
    self.gn_params = list(self.GN.mlp_params)
    self.bot_params = self.in_params + self.gn_params
    self.in2_params = list(self.IN2.mlp_params)
    self.gn2_params = list(self.GN2.mlp_params)
    self.top_params = self.in2_params + self.gn2_params
    self.joint_params = self.top_params + self.bot_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.data_nll_cost_bot = self.lam_nll[0] * \
        self._construct_data_nll_cost(which_gip='bot')
    self.data_nll_cost_top = self.lam_nll[0] * \
        self._construct_data_nll_cost(which_gip='top')
    self.post_kld_cost_bot = self.lam_kld[0] * \
        self._construct_post_kld_cost(which_gip='bot',
                                      kld2_scale=self.kld2_scale)
    # NOTE(review): the top KLD cost also uses the bottom inferencer's
    # kld2_scale (self.kld2_scale is self.IN.kld2_scale) -- confirm that
    # this is intended.
    self.post_kld_cost_top = self.lam_kld[0] * \
        self._construct_post_kld_cost(which_gip='top',
                                      kld2_scale=self.kld2_scale)
    self.other_reg_cost_bot = \
        self._construct_other_reg_cost(which_gip='bot')
    self.other_reg_cost_top = \
        self._construct_other_reg_cost(which_gip='top')
    # summed costs for bottom, top, and joint objectives
    self.bot_cost = self.data_nll_cost_bot + self.post_kld_cost_bot + \
        self.other_reg_cost_bot
    self.top_cost = self.data_nll_cost_top + self.post_kld_cost_top + \
        self.other_reg_cost_top
    self.joint_cost = self.bot_cost + self.top_cost

    #########################################
    # CONSTRUCT THE GRADIENTS FOR THE COSTS #
    #########################################
    self.bot_grads = _clipped_grads(self.bot_cost, self.bot_params)
    self.top_grads = _clipped_grads(self.top_cost, self.top_params)
    self.joint_grads = _clipped_grads(self.joint_cost, self.joint_params)

    #######################################
    # CONSTRUCT THE UPDATES FOR THE COSTS #
    #######################################
    # bottom pair, bottom cost
    self.gn_updates_bot = _adam(self.gn_params, self.bot_grads,
                                self.lr_gn, self.it_count_bot)
    self.in_updates_bot = _adam(self.in_params, self.bot_grads,
                                self.lr_in, self.it_count_bot)
    # top pair, top cost
    self.gn2_updates_top = _adam(self.gn2_params, self.top_grads,
                                 self.lr_gn, self.it_count_top)
    self.in2_updates_top = _adam(self.in2_params, self.top_grads,
                                 self.lr_in, self.it_count_top)
    # bottom pair, joint cost
    self.gn_updates_joint = _adam(self.gn_params, self.joint_grads,
                                  self.lr_gn, self.it_count_joint)
    self.in_updates_joint = _adam(self.in_params, self.joint_grads,
                                  self.lr_in, self.it_count_joint)
    # top pair, joint cost
    self.gn2_updates_joint = _adam(self.gn2_params, self.joint_grads,
                                   self.lr_gn, self.it_count_joint)
    self.in2_updates_joint = _adam(self.in2_params, self.joint_grads,
                                   self.lr_in, self.it_count_joint)

    # merge the per-network updates for easier application; each merged
    # set also carries the kld_mean update(s) of the inferencer(s) involved
    self.bot_updates = _merged(self.gn_updates_bot, self.in_updates_bot)
    self.bot_updates[self.IN.kld_mean] = self.IN.kld_mean_update
    self.top_updates = _merged(self.gn2_updates_top, self.in2_updates_top)
    self.top_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
    self.joint_updates = _merged(
        self.gn_updates_joint, self.in_updates_joint,
        self.gn2_updates_joint, self.in2_updates_joint)
    self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update
    self.joint_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update

    # compile the training / evaluation functions
    self.train_bot = self._construct_train_bot()
    self.train_top = self._construct_train_top()
    self.train_joint = self._construct_train_joint()
    self.compute_costs = self._construct_compute_costs()
def __init__(self, rng=None, x_out=None, \
        p_z_given_x=None, \
        p_x_given_z=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build the symbolic walk-out graph, its costs and updates, and compile
    the helper functions for this model.

    Parameters (as used by the code below):
      rng: object exposing randint(); only used to seed this model's own
           RandStream.
      x_out: symbolic variable used as the initial x state of the walk
             (self.x0); the original author describes it as the target
             output for generation.
      p_z_given_x, p_x_given_z: networks whose
             apply(..., do_samples=False) is unpacked as (mean, logvar).
      params: mapping with keys 'x_dim', 'z_dim', 'walkout_steps',
              'x_type' and, optionally, 'x_transform'
              ('sigmoid' or 'none').
      shared_param_dicts: None to create a fresh 'obs_logvar' shared
              variable, otherwise a mapping that already holds it under
              the key 'obs_logvar'.

    NOTE(review): this constructor reads several attributes that it never
    assigns: self.step_type, self.xi_zmuv, self.s0, self.step_scales,
    self.nlli, self.p_zi_given_xi, self.p_sip1_given_zi,
    self.p_x_given_si and self.q_zi_given_xi. Unless they are provided
    somewhere outside this method (not visible here), the constructor
    fails with AttributeError. The second half of the body looks like it
    was carried over from a different model -- TODO confirm and fix.
    """
    # setup a rng for this WalkoutModel
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.walkout_steps = self.params['walkout_steps']
    self.x_type = self.params['x_type']
    self.shared_param_dicts = shared_param_dicts
    # pick the transform applied to the mean produced by p_x_given_z;
    # sigmoid is the default when 'x_transform' is not given
    if 'x_transform' in self.params:
        assert((self.params['x_transform'] == 'sigmoid') or \
                (self.params['x_transform'] == 'none'))
        if self.params['x_transform'] == 'sigmoid':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.x_transform = lambda x: x
    else:
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    # bernoulli observations always use the sigmoid transform, overriding
    # whatever was selected above
    if self.x_type == 'bernoulli':
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # NOTE(review): self.step_type is not assigned in this constructor, so
    # this assert presumably raises AttributeError -- confirm.
    assert ((self.step_type == 'add') or (self.step_type == 'jump'))

    # grab handles to the relevant networks
    self.p_z_given_x = p_z_given_x
    self.p_x_given_z = p_x_given_z

    # record the symbolic variables that will provide inputs to the
    # computation graph created for this WalkoutModel
    self.x_out = x_out           # target output for generation
    self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        zero_ary = to_fX(np.zeros((1, )))
        self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
        # 8 * tanh(v / 8) keeps the effective log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0])

    ###############################################################
    # Setup the forwards (i.e. training) walk-out loop using scan #
    ###############################################################
    def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
        # One step of the walk. The first two arguments are the per-step
        # slices of the scanned noise sequences; the last two are the
        # x/z states returned by the previous step.
        # get samples of next zi, according to the forwards model
        zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
        # check reverse direction probability p(xi_fw | zi_fw)
        # NOTE(review): the value is stored under an "nll" name but comes
        # from a helper called log_prob_gaussian2 -- confirm the sign.
        xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_bw_mean = self.x_transform(xi_bw_mean)
        nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                log_vars=xi_bw_logvar, mask=None)
        nll_xi_bw = nll_xi_bw.flatten()
        # get samples of next xi, according to the forwards model
        # (same p_x_given_z call on the same zi_fw as just above)
        xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_fw_mean = self.x_transform(xi_fw_mean)
        xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
        # check reverse direction probability p(zi_fw | xi_fw)
        zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                log_vars=zi_bw_logvar, mask=None)
        nll_zi_bw = nll_zi_bw.flatten()
        # each loop iteration produces the following values:
        #   xi_fw: xi generated from zi by forwards walk
        #   zi_fw: zi generated from xi by forwards walk
        #   xi_fw_mean: ----
        #   xi_fw_logvar: ----
        #   zi_fw_mean: ----
        #   zi_fw_logvar: ----
        #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
        #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
        return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

    # initialize states for x/z
    self.x0 = self.x_out
    self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
    # setup initial values to pass to scan op: only the first two outputs
    # (x and z states) are given initial values; the remaining six are
    # declared with None
    outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
    # NOTE(review): self.xi_zmuv is never created in this constructor (only
    # self.zi_zmuv is) -- presumably a missing symbolic input; confirm.
    sequences_init = [self.xi_zmuv, self.zi_zmuv]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
            outputs_info=outputs_init, \
            sequences=sequences_init)
    # grab results of the scan op. all values are computed for each step
    self.xi = self.scan_results[0]
    self.zi = self.scan_results[1]
    self.xi_fw_mean = self.scan_results[2]
    self.xi_fw_logvar = self.scan_results[3]
    self.zi_fw_mean = self.scan_results[4]
    self.zi_fw_logvar = self.scan_results[5]
    self.nll_xi_bw = self.scan_results[6]
    self.nll_zi_bw = self.scan_results[7]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='srr_lr')
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
    self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    # NOTE(review): self.s0, self.step_scales and the four networks named
    # below are not set by this constructor (it stores p_z_given_x and
    # p_x_given_z instead) -- confirm which parameters should be optimized.
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.p_x_given_si.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs( p=1.0)
    # weighted sum of the four KLd terms, then averaged into a scalar
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
            (self.lam_kld_q[0] * self.kld_q) + \
            (self.lam_kld_g[0] * self.kld_g) + \
            (self.lam_kld_s[0] * self.kld_s)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NOTE(review): self.nlli is not produced by the scan above (which
    # yields nll_xi_bw / nll_zi_bw) -- confirm the intended NLL source.
    self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # fold the updates returned by scan into the training updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None,
             p_zi_given_xi=None,
             p_sip1_given_zi=None,
             q_zi_given_xi=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the iterative-imputation graph, its costs and updates, and
    compile the helper functions for this model.

    Parameters (as used by the code below):
      rng: object exposing randint(); only used to seed this model's own
           RandStream.
      x_in: symbolic input; provides the batch size (x_in.shape[0]) and
            the values mixed into the initial imputation state self.x0.
      x_mask: symbolic mask; each state is formed as
              x_mask * data + (1 - x_mask) * current guess.
      x_out: symbolic target; used for the NLL cost and as the "unmasked"
             input seen by the guide policy.
      p_zi_given_xi: network whose apply(x) is unpacked as (mean, logvar);
             the author's comments call it the "global policy".
      p_sip1_given_zi: network whose apply(z) returns an indexable result;
             item 0 is the proposed step, items 1 and 2 are gate inputs
             read for the 'lstm' / 'layer' step types.
      q_zi_given_xi: network applied to the axis-1 concatenation of the
             masked and unmasked inputs (the "guide policy").
      params: mapping with keys 'x_dim', 'z_dim', 'imp_steps',
              'step_type' ('jump', 'add', 'lstm' or 'layer') and 'x_type'
              ('bernoulli' or 'gaussian').
      shared_param_dicts: None to create fresh shared variables, otherwise
              a mapping holding 'x_null', 'grad_null', 's0' and
              'obs_logvar'.

    Raises:
      AssertionError: for an unsupported x_type (only while asserts are
          enabled) or an unknown step_type (always).
    """
    # setup a rng for this model
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.imp_steps = self.params['imp_steps']
    self.step_type = self.params['step_type']
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    self.shared_param_dicts = shared_param_dicts

    # grab handles to the relevant InfNets
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.q_zi_given_xi = q_zi_given_xi

    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this model
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    # noise for the latent samples; scan consumes one slice per step
    self.zi_zmuv = T.tensor3()

    # setup switching variable for changing between sampling/training
    zero_ary = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=zero_ary,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # initialize parameters "owned" by this model
        s0_init = to_fX(np.zeros((self.x_dim,)))
        init_ary = to_fX(np.zeros((self.x_dim,)))
        self.x_null = theano.shared(value=init_ary, name='gpis_xn')
        self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
        self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
        self.obs_logvar = theano.shared(value=zero_ary,
                                        name='gpsi_obs_logvar')
        # 8 * tanh(v / 8) keeps the effective log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['x_null'] = self.x_null
        self.shared_param_dicts['grad_null'] = self.grad_null
        self.shared_param_dicts['s0'] = self.s0
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.x_null = self.shared_param_dicts['x_null']
        self.grad_null = self.shared_param_dicts['grad_null']
        self.s0 = self.shared_param_dicts['s0']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

    ##################################################
    # Setup the iterative imputation loop using scan #
    ##################################################
    self.ones_mask = T.ones_like(self.x_mask)

    def imp_step_func(zi_zmuv, si):
        # One imputation step: zi_zmuv is this step's noise slice and si
        # is the state returned by the previous step.
        si_as_x = self._si_as_x(si)
        xi_unmasked = self.x_out
        xi_masked = (self.x_mask * xi_unmasked) + \
                    ((1.0 - self.x_mask) * si_as_x)
        # get samples of next zi, according to the global policy
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
        # get samples of next zi, according to the guide policy
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
        # make zi samples that can be switched between zi_p and zi_q
        # (train_switch == 1 selects the guide sample, 0 the policy sample)
        zi = ((self.train_switch[0] * zi_q) +
              ((1.0 - self.train_switch[0]) * zi_p))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                zi_p_mean, zi_p_logvar)  # KL(q || p)
        kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                zi_q_mean, zi_q_logvar)  # KL(p || q)
        kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                0.0, 0.0)  # KL(p || global prior)

        # compute the next si, given the sampled zi
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if self.step_type == 'jump':
            # jump steps always completely overwrite the current guesses
            sip1 = si_step
        elif self.step_type == 'add':
            # add steps just update the guesses additively
            sip1 = si + si_step
        elif self.step_type == 'lstm':
            # LSTM-style updates with write and erase gates
            write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
            erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        elif self.step_type == 'layer':
            # convex mix of the old guess and the proposed step
            alpha_gate = T.nnet.sigmoid(hydra_out[1])
            sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
        else:
            # Raise explicitly instead of `assert False`: asserts are
            # stripped under `python -O`, which would leave sip1 unbound
            # and fail later with a misleading UnboundLocalError.
            raise AssertionError("Unknown step type!")
        # compute NLL for the current imputation
        nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
        return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

    # apply scan op for the sequential imputation loop; only the state is
    # given an initial value, the other four outputs are declared with None
    self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
    init_vals = [self.s0_full, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(
        imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]
    self.kldi_p2g = self.scan_results[4]

    # get the initial imputation state
    self.x0 = (self.x_mask * self.x_in) + \
              ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1,)))
    self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
    self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1"
    self.joint_params = [self.s0, self.obs_logvar]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
    # weighted sum of the three KLd terms, then averaged into a scalar
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                     (self.lam_kld_q[0] * self.kld_q) + \
                     (self.lam_kld_g[0] * self.kld_g)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL of the final imputation step enters the cost
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # fold the updates returned by scan into the training updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None,
             p_zi_given_xi=None,
             p_sip1_given_zi=None,
             q_zi_given_xi=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the iterative-imputation graph, its costs and updates, and
    compile the helper functions for this model.

    Parameters (as used by the code below):
      rng: object exposing randint(); only used to seed this model's own
           RandStream.
      x_in: symbolic input; provides the batch size (x_in.shape[0]) and
            the values mixed into the initial imputation state self.x0.
      x_mask: symbolic mask; each state is formed as
              x_mask * data + (1 - x_mask) * current guess.
      x_out: symbolic target; used for the NLL cost and as the "unmasked"
             input seen by the guide policy.
      p_zi_given_xi: network whose apply(x) is unpacked as (mean, logvar);
             the author's comments call it the "global policy".
      p_sip1_given_zi: network whose apply(z) returns an indexable result;
             item 0 is the proposed step, items 1 and 2 are gate inputs
             read for the 'lstm' / 'layer' step types.
      q_zi_given_xi: network applied to the axis-1 concatenation of the
             masked and unmasked inputs (the "guide policy").
      params: mapping with keys 'x_dim', 'z_dim', 'imp_steps',
              'step_type' ('jump', 'add', 'lstm' or 'layer') and 'x_type'
              ('bernoulli' or 'gaussian').
      shared_param_dicts: None to create fresh shared variables, otherwise
              a mapping holding 'x_null', 'grad_null', 's0' and
              'obs_logvar'.

    Raises:
      AssertionError: for an unsupported x_type (only while asserts are
          enabled) or an unknown step_type (always).
    """
    # setup a rng for this model
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.imp_steps = self.params['imp_steps']
    self.step_type = self.params['step_type']
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    self.shared_param_dicts = shared_param_dicts

    # grab handles to the relevant InfNets
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.q_zi_given_xi = q_zi_given_xi

    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this model
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    # noise for the latent samples; scan consumes one slice per step
    self.zi_zmuv = T.tensor3()

    # setup switching variable for changing between sampling/training
    zero_ary = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=zero_ary,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # initialize parameters "owned" by this model
        s0_init = to_fX(np.zeros((self.x_dim,)))
        init_ary = to_fX(np.zeros((self.x_dim,)))
        self.x_null = theano.shared(value=init_ary, name='gpis_xn')
        self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
        self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
        self.obs_logvar = theano.shared(value=zero_ary,
                                        name='gpsi_obs_logvar')
        # 8 * tanh(v / 8) keeps the effective log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['x_null'] = self.x_null
        self.shared_param_dicts['grad_null'] = self.grad_null
        self.shared_param_dicts['s0'] = self.s0
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.x_null = self.shared_param_dicts['x_null']
        self.grad_null = self.shared_param_dicts['grad_null']
        self.s0 = self.shared_param_dicts['s0']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

    ##################################################
    # Setup the iterative imputation loop using scan #
    ##################################################
    self.ones_mask = T.ones_like(self.x_mask)

    def imp_step_func(zi_zmuv, si):
        # One imputation step: zi_zmuv is this step's noise slice and si
        # is the state returned by the previous step.
        si_as_x = self._si_as_x(si)
        xi_unmasked = self.x_out
        xi_masked = (self.x_mask * xi_unmasked) + \
                    ((1.0 - self.x_mask) * si_as_x)
        # get samples of next zi, according to the global policy
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
        # get samples of next zi, according to the guide policy
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
        # make zi samples that can be switched between zi_p and zi_q
        # (train_switch == 1 selects the guide sample, 0 the policy sample)
        zi = ((self.train_switch[0] * zi_q) +
              ((1.0 - self.train_switch[0]) * zi_p))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                zi_p_mean, zi_p_logvar)  # KL(q || p)
        kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                zi_q_mean, zi_q_logvar)  # KL(p || q)
        kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                0.0, 0.0)  # KL(p || global prior)

        # compute the next si, given the sampled zi
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if self.step_type == 'jump':
            # jump steps always completely overwrite the current guesses
            sip1 = si_step
        elif self.step_type == 'add':
            # add steps just update the guesses additively
            sip1 = si + si_step
        elif self.step_type == 'lstm':
            # LSTM-style updates with write and erase gates
            write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
            erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        elif self.step_type == 'layer':
            # convex mix of the old guess and the proposed step
            alpha_gate = T.nnet.sigmoid(hydra_out[1])
            sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
        else:
            # Raise explicitly instead of `assert False`: asserts are
            # stripped under `python -O`, which would leave sip1 unbound
            # and fail later with a misleading UnboundLocalError.
            raise AssertionError("Unknown step type!")
        # compute NLL for the current imputation
        nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
        return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

    # apply scan op for the sequential imputation loop; only the state is
    # given an initial value, the other four outputs are declared with None
    self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
    init_vals = [self.s0_full, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(
        imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]
    self.kldi_p2g = self.scan_results[4]

    # get the initial imputation state
    self.x0 = (self.x_mask * self.x_in) + \
              ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1,)))
    self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
    self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1"
    self.joint_params = [self.s0, self.obs_logvar]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
    # weighted sum of the three KLd terms, then averaged into a scalar
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                     (self.lam_kld_q[0] * self.kld_q) + \
                     (self.lam_kld_g[0] * self.kld_g)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL of the final imputation step enters the cost
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # fold the updates returned by scan into the training updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None, x_in=None,
             p_x_given_z=None, q_z_given_x=None,
             x_dim=None, z_dim=None,
             params=None):
    """
    Wire an inferencer and a generator into a single-stage model, build
    its costs and updates, and compile the helper functions.

    Parameters (as used by the code below):
      rng: object exposing randint(); only used to seed this model's own
           RandStream.
      x_in: symbolic input fed to the inferencer.
      p_x_given_z: generator; apply(z) is unpacked as a pair whose first
           item is the raw output. Also read: mlp_params, output_layers.
      q_z_given_x: inferencer; apply(x) is unpacked as (mean, logvar).
           Also read: mlp_params, shared_layers.
      x_dim, z_dim: recorded as-is on the instance.
      params: optional mapping. 'x_type' ('bernoulli' or 'gaussian') is
           always read; 'xt_transform' ('sigmoid' or 'none', default
           sigmoid) and 'logvar_bound' (default 10.0) are optional.
    """
    # private random stream, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # user options; an empty dict stands in when none were given
    self.params = {} if params is None else params

    # decide how raw generator output is squashed (sigmoid by default)
    squash_with_sigmoid = True
    if 'xt_transform' in self.params:
        requested = self.params['xt_transform']
        assert requested in ('sigmoid', 'none')
        squash_with_sigmoid = (requested == 'sigmoid')
    if squash_with_sigmoid:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x

    # magnitude limit applied to the learned output log-variance
    self.logvar_bound = self.params.get('logvar_bound', 10.0)

    # x_type selects a bernoulli or gaussian model for the observations
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # dimensions of the observation and latent spaces
    self.x_dim = x_dim
    self.z_dim = z_dim

    # isotropic Gaussian prior over z: zero mean, zero log-variance
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # symbolic input to the whole computation graph
    self.x_in = x_in

    #####################################################################
    # Setup the computation graph that provides values in our objective #
    #####################################################################
    # posterior over latents given the observations
    self.q_z_given_x = q_z_given_x
    posterior = self.q_z_given_x.apply(self.x_in)
    self.z_mean, self.z_logvar = posterior
    # latent samples drawn through the reparametrization helper
    self.z = reparametrize(self.z_mean, self.z_logvar, rng=self.rng)
    # generator output given those samples (second item is discarded)
    self.p_x_given_z = p_x_given_z
    generated = self.p_x_given_z.apply(self.z)
    self.xt, _ = generated
    # final generator output: bernoulli models always use a sigmoid,
    # otherwise the configured transform is applied
    self.xg = (T.nnet.sigmoid(self.xt) if self.x_type == 'bernoulli'
               else self.xt_transform(self.xt))

    # learned scalar that modifies the output distribution; it is passed
    # through a scaled tanh so it stays inside +/- logvar_bound
    logvar_init = to_fX( np.zeros((1,)) )
    self.output_logvar = theano.shared(value=logvar_init,
                                       name='osm_output_logvar')
    self.bounded_logvar = self.logvar_bound * \
        T.tanh(self.output_logvar[0] / self.logvar_bound)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    zero_vec = to_fX( np.zeros((1,)) )

    def make_shared(var_name):
        # every hyper-parameter starts as a one-element zero vector and
        # is given its real value by the matching set_* method
        return theano.shared(value=zero_vec, name=var_name)

    # learning rate and momentum terms, then their initial values
    self.lr = make_shared('osm_lr')
    self.mom_1 = make_shared('osm_mom_1')
    self.mom_2 = make_shared('osm_mom_2')
    self.set_sgd_params()
    # weight on the NLL of the data given a posterior sample
    self.lam_nll = make_shared('osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weight on KL(q(z|x) || p(z))
    self.lam_kld = make_shared('osm_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # weight on the l2 parameter regularizer
    self.lam_l2w = make_shared('osm_lam_l2w')
    self.set_lam_l2w(1e-4)

    # everything the optimizer is allowed to change
    self.joint_params = [self.output_logvar]
    self.joint_params.extend(self.q_z_given_x.mlp_params)
    self.joint_params.extend(self.p_x_given_z.mlp_params)

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    # reconstruction term
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    # posterior KL term
    self.kld_costs = self.lam_kld[0] * self._construct_kld_costs()
    self.kld_cost = T.mean(self.kld_costs)
    # parameter regularization term
    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    # the objective is the sum of the three
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # gradient of the objective for each optimizable parameter
    print("Computing gradients of self.joint_cost...")
    param_grads = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, param_grads))

    # parameter updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # compiled training and diagnostic functions
    print("Compiling self.train_joint...")
    self.train_joint = self._construct_train_joint()
    print("Compiling self.compute_fe_terms...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling self.compute_post_klds...")
    self.compute_post_klds = self._construct_compute_post_klds()
    print("Compiling self.sample_from_prior...")
    self.sample_from_prior = self._construct_sample_from_prior()
    # direct maps: observation -> posterior mean, latent -> generator output
    self.transform_x_to_z = theano.function(inputs=[self.x_in],
                                            outputs=self.z_mean)
    self.transform_z_to_x = theano.function(inputs=[self.z],
                                            outputs=self.xg)
    # handles on the first inferencer and last generator weight matrices
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_weights = self.p_x_given_z.output_layers[-1].W
def __init__(self, rng=None, Xd=None,
             g_net=None, i_net=None, pn_seq=None,
             data_dim=None, prior_dim=None,
             params=None):
    """Wire an inferencer/generator pair to a PeaNetSeq and build training ops.

    Clones the given inferencer, generator and PeaNetSeq (via their
    ``shared_param_clone`` methods), chains the clones into one symbolic
    graph that produces scaled perturbations of ``Xd``, and then builds the
    joint cost, its clipped gradients, the Adam updates and the helper
    functions exposed on the instance.

    Args:
        rng: random source. Must provide ``randint`` (used to seed this
            object's RandStream); it is also forwarded to every clone.
        Xd: symbolic input batch. ``Xd.shape[0]`` is recorded as the
            observation count.
        g_net: generator network to clone. The clone must report
            ``out_type == 'gaussian'``.
        i_net: inferencer network to clone.
        pn_seq: PeaNetSeq to clone with ``seq_len=2``; the clone receives
            ``Xd`` and ``Xd + Xg`` as its two inputs.
        data_dim: expected output dim of the generator's last layer and
            input dim of the inferencer's first shared layer.
        prior_dim: expected input dim of the generator's first layer and
            output dim of the inferencer's last mu/sigma layers.
        params: optional dict of settings. The only key read here is
            ``'mean_transform'``, a callable applied to the generator's
            output mean before it is rescaled by ``lam_mnb``.

    Raises:
        AssertionError: if the generator clone is not 'gaussian' or if any
            of the dimensionality checks against ``data_dim`` /
            ``prior_dim`` fails.
    """
    # setup a rng for this ADPair
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params
    if 'mean_transform' in self.params:
        # apply a user-defined transform to the GenNet output prior to
        # rescaling by self.lam_mnb...
        self.mean_transform = self.params['mean_transform']
    else:
        # default transform is sigmoid -> shift -> scale so that
        # perturbations (for each dimension) are in range -1 --> 1.
        self.mean_transform = lambda x: 2.0 * (apply_sigmoid(x) - 0.5)

    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this ADPair
    self.Xd = Xd
    self.Yd = T.icol('adp_Yd')  # labels to pass to the PeaNetSeq
    # control/mask inputs for the inferencer are all-zero, shaped like Xd
    self.Xc = 0.0 * self.Xd
    self.Xm = 0.0 * self.Xd
    self.obs_count = T.cast(Xd.shape[0], 'floatX')

    # create a "shared-parameter" clone of the inferencer, set up to
    # receive input from the appropriate symbolic variables.
    self.IN = i_net.shared_param_clone(
        rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
    # capture a handle for samples from the variational posterior
    self.Xp = self.IN.output
    # create a "shared-parameter" clone of the generator, set up to
    # receive input from samples from the variational posterior
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    assert(self.GN.out_type == 'gaussian')  # check for right output

    # set up a var for controlling the max-norm bound on perturbations
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    self.lam_mnb = theano.shared(value=zero_ary, name='adp_lam_mnb')
    self.set_lam_mnb(lam_mnb=0.1)
    # rescale the perturbations, to make them adjustably norm-bounded
    self.Xg = self.lam_mnb[0] * self.mean_transform(self.GN.output_mean)

    # record and validate the data dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    # output of the generator and input to the inferencer should both be
    # equal to self.data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # input of the generator and mu/sigma outputs of the inferencer should
    # both be equal to self.prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

    # make a clone of the target PeaNetSeq that takes perturbed inputs
    self.PNS = pn_seq.shared_param_clone(
        rng=rng, seq_len=2, seq_Xd=[self.Xd, (self.Xd + self.Xg)])
    # get the symbolic vars for passing inputs to self.PNS
    self.Xd_seq = self.PNS.Xd_seq
    self.Yd_seq = self.PNS.Yd_seq
    self.seq_inputs = self.Xd_seq + self.Yd_seq

    # shared var learning rate for generator and inferencer
    self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    # init shared var for weighting the adversarial cost term
    self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
    self.set_lam_adv(lam_adv=1.0)
    # init shared var for weighting Gaussian prior over the policy
    self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
    self.set_lam_l2w(1e-4)

    # Grab the full set of "optimizable" parameters from the generator
    # and inferencer networks that we'll be working with.
    self.in_params = list(self.IN.mlp_params)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
    self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = (self.adv_cost + self.kld_cost +
                       self.other_reg_cost)

    # Get the gradient of the joint cost for all optimizable parameters.
    # A single T.grad call covers every parameter (one traversal of the
    # cost graph); each gradient is then clipped element-wise.
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for p, g in zip(self.joint_params, grad_list):
        self.joint_grads[p] = g.clip(-0.05, 0.05)

    # Construct the updates for the generator and inferencer networks
    self.gn_updates = get_adam_updates(
        params=self.gn_params, grads=self.joint_grads,
        alpha=self.lr_gn, beta1=self.mom_1, beta2=self.mom_2,
        it_count=self.it_count, mom2_init=1e-3, smoothing=1e-8)
    self.in_updates = get_adam_updates(
        params=self.in_params, grads=self.joint_grads,
        alpha=self.lr_in, beta1=self.mom_1, beta2=self.mom_2,
        it_count=self.it_count, mom2_init=1e-3, smoothing=1e-8)
    # merge both update sets; on a shared key the inferencer entry wins
    self.joint_updates = OrderedDict()
    self.joint_updates.update(self.gn_updates)
    self.joint_updates.update(self.in_updates)

    # Construct a function for jointly training the generator/inferencer
    self.train_joint = self._construct_train_joint()

    # Construct a function for computing the outputs of the generator
    # network for a batch of noise. Presumably, the noise will be drawn
    # from the same distribution that was used in training....
    self.sample_from_gn = self.GN.sample_from_model
    self.sample_from_Xd = self._construct_sample_from_Xd()
def __init__(
    self,
    rng=None,
    x_out=None,
    p_zi_given_xi=None,
    p_sip1_given_zi=None,
    p_x_given_si=None,
    q_zi_given_xi=None,
    params=None,
    shared_param_dicts=None,
):
    """Assemble the scan-based revelation/refinement graph and its training ops.

    Reads the model configuration from ``params``, creates (or re-uses) the
    shared variables owned by the model, unrolls the per-step update with
    ``theano.scan``, and finally builds the costs, gradients, Adam updates
    and the compiled helper functions stored on the instance.

    Args:
        rng: random source; must provide ``randint`` (seeds the RandStream).
        x_out: symbolic target output for generation.
        p_zi_given_xi: primary-policy network; ``apply`` is called on the
            stacked (masked state, masked gradient) input.
        p_sip1_given_zi: network mapping a sampled ``zi`` to the belief
            state update (first output) and, for additive steps, the write
            and erase gate inputs (second and third outputs).
        p_x_given_si: network handle stored on the instance; its
            ``mlp_params`` join the optimized parameters.
        q_zi_given_xi: guide-policy network, applied like the primary one.
        params: mapping with keys ``x_dim``, ``z_dim``, ``s_dim``,
            ``use_p_x_given_si``, ``step_type`` ("add" or "jump") and
            ``x_type`` ("bernoulli" or "gaussian"); optionally
            ``obs_transform`` ("sigmoid" or "none") and ``rev_masks``.
            ``rev_sched`` is read when ``rev_masks`` is absent or None.
        shared_param_dicts: optional dict holding ``s0``, ``obs_logvar``
            and ``step_scales``; when None, fresh shared variables are
            created and collected into a new dict.

    Raises:
        AssertionError: if any of the configuration checks fails.
    """
    # setup a rng for this SRRModel
    self.rng = RandStream(rng.randint(100000))

    # pull the user-provided configuration onto the instance
    self.params = params
    self.x_dim = self.params["x_dim"]
    self.z_dim = self.params["z_dim"]
    self.s_dim = self.params["s_dim"]
    self.use_p_x_given_si = self.params["use_p_x_given_si"]
    self.step_type = self.params["step_type"]
    self.x_type = self.params["x_type"]

    if self.use_p_x_given_si:
        print("Constructing hypotheses indirectly in s-space...")
    else:
        print("Constructing hypotheses directly in x-space...")
        assert self.s_dim == self.x_dim

    # choose the observation transform: sigmoid unless "none" was requested,
    # and always sigmoid for bernoulli observations
    def _squash(x):
        return T.nnet.sigmoid(x)

    def _passthrough(x):
        return x

    use_sigmoid = True
    if "obs_transform" in self.params:
        requested = self.params["obs_transform"]
        assert requested in ("sigmoid", "none")
        use_sigmoid = requested == "sigmoid"
    if self.x_type == "bernoulli":
        use_sigmoid = True
    self.obs_transform = _squash if use_sigmoid else _passthrough

    self.shared_param_dicts = shared_param_dicts

    # Deal with revelation scheduling: either explicit masks or a schedule
    explicit_masks = self.params["rev_masks"] if "rev_masks" in self.params else None
    if explicit_masks is not None:
        rmp = explicit_masks[0].astype(theano.config.floatX)
        rmq = explicit_masks[1].astype(theano.config.floatX)
        self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p")
        self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q")
        self.rev_sched = None
        self.use_rev_masks = True
    else:
        self.rev_sched = self.params["rev_sched"]
        self.rev_masks_p = None
        self.rev_masks_q = None
        self.use_rev_masks = False
        # "validate" the set of revelation block descriptions
        nice_nums = list(range(1, 17))
        for rev_block in self.rev_sched:
            assert rev_block[0] in nice_nums
            assert 0.0 <= rev_block[1] <= 1.01
    assert self.x_type in ("bernoulli", "gaussian")
    assert self.step_type in ("add", "jump")

    # grab handles to the relevant networks
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.p_x_given_si = p_x_given_si
    self.q_zi_given_xi = q_zi_given_xi

    # record the symbolic variables that will provide inputs to the
    # computation graph created for this SRRModel
    self.x_out = x_out  # target output for generation
    self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for policy wobble
    self.p_masks = T.tensor3()  # revelation masks for primary policy
    self.q_masks = T.tensor3()  # revelation masks for guide policy
    if self.use_rev_masks:
        self.total_steps = self.params["rev_masks"][0].shape[0]
    else:
        self.total_steps = sum(block[0] for block in self.rev_sched)

    # setup switching variable for changing between sampling/training
    zero_ary = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch")
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        s0_init = to_fX(np.zeros((self.s_dim,)))
        ss_init = to_fX(0.5 * np.ones((self.total_steps,)))
        self.s0 = theano.shared(value=s0_init, name="srrm_s0")
        self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar")
        self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales")
        self.shared_param_dicts = {
            "s0": self.s0,
            "obs_logvar": self.obs_logvar,
            "step_scales": self.step_scales,
        }
    else:
        # grab the parameters required by this model from a given dict
        self.s0 = self.shared_param_dicts["s0"]
        self.obs_logvar = self.shared_param_dicts["obs_logvar"]
        self.step_scales = self.shared_param_dicts["step_scales"]
    # squash the raw log-variance through a scaled tanh (range -8 .. 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

    ##################################################################
    # Setup the sequential revelation and refinement loop using scan #
    ##################################################################
    # ss: This is a sequence of scalars that will be used to rescale the
    #     "gradient" input to the primary and guide policies.
    #
    # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be
    #          reparametrized to sample actions from the policies.
    #
    # p_masks: This is a sequence of "unmasking" masks. When one of these
    #          masking variables is 1, the corresponding value in self.x_out
    #          will be "revealed" to the primary policy. Prediction error
    #          is measured for a value only the first time it is revealed.
    #          Once revealed, a value remains "visible" to the policy.
    #          The final step should reveal all values.
    #
    # q_masks: This is a sequence of "unmasking" masks. These are similar
    #          to p_masks, but control which values are revealed to the
    #          guide policy. The guide policy masking sequence should be
    #          constructed to stay "ahead of" the primary policy's masking
    #          sequence. The guide policy needs to know which values will
    #          be revealed to the primary policy so that it can focus its
    #          reconstruction efforts on those values. Otherwise, the guide
    #          policy will immediately reconstruct the entire target.
    #
    # si: This is the current "belief state" for each trial in the training
    #     batch. The belief state is updated in each iteration, and passed
    #     forward through the recurrence.
    #
    # mi_p: This is the current revelation mask for the primary policy.
    #
    # mi_q: This is the current revelation mask for the guide policy.
    #
    def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
        # transform the current belief state into an observation and form
        # the rescaled residual against the target
        si_as_x = self._from_si_to_x(si)
        step_gain = T.log(1.0 + T.exp(ss))
        full_grad = step_gain * (self.x_out - si_as_x)

        # masked belief state and gradient seen by the primary policy
        xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
        grad_for_p = mi_p * full_grad

        # advance the guide policy's revelation mask
        mip1_q = mi_q + ((1.0 - mi_q) * q_masks)
        # NOTE: the guide shares the primary policy's masked belief state;
        # only its gradient input uses the guide's own (updated) mask.
        xi_for_q = xi_for_p
        grad_for_q = mip1_q * full_grad

        # get samples of next zi, according to the primary policy
        p_input = T.horizontal_stack(xi_for_p, grad_for_p)
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
            p_input, do_samples=False
        )
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)

        # get samples of next zi, according to the guide policy
        q_input = T.horizontal_stack(xi_for_q, grad_for_q)
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            q_input, do_samples=False
        )
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

        # make zi samples that can be switched between zi_p and zi_q
        zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(
            zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar
        )  # KL(q || p)
        kldi_p2q = gaussian_kld(
            zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar
        )  # KL(p || q)
        kldi_p2g = gaussian_kld(
            zi_p_mean, zi_p_logvar, 0.0, 0.0
        )  # KL(p || N(0, I))

        # compute next si, given sampled zi (i.e. update the belief state)
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if self.step_type != "jump":
            # additive steps adjust the belief state like an LSTM
            write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
            erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        else:
            # jump steps always do a full swap of belief state
            sip1 = si_step

        # update the primary policy's revelation mask
        new_to_p = (1.0 - mi_p) * p_masks
        mip1_p = mi_p + new_to_p
        # compute NLL only for the newly revealed values
        nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)

        # each loop iteration produces the following values:
        #   sip1: belief state at end of current step
        #   mip1_p: revealed values mask to use in next step (primary)
        #   mip1_q: revealed values mask to use in next step (guide)
        #   nlli: NLL for values revealed at end of current step
        #   kldi_q2p: KL(q || p) for the current step
        #   kldi_p2q: KL(p || q) for the current step
        #   kldi_p2g: KL(p || N(0,I)) for the current step
        return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g

    # initialize belief state to self.s0
    self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0
    # initialize revelation masks to 0 for all values in all trials
    self.m0_full = T.zeros_like(self.x_out)
    # recurrent outputs get initial values; per-step costs carry no state
    outputs_init = [self.s0_full, self.m0_full, self.m0_full] + [None] * 4
    sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(
        srr_step_func,
        outputs_info=outputs_init,
        sequences=sequences_init,
    )

    # grab results of the scan op. all values are computed for each step:
    # belief states, primary/guide revelation masks, NLL on newly revealed
    # values, KL(q || p), KL(p || q) and KL(p || N(0,I))
    (
        self.si,
        self.mi_p,
        self.mi_q,
        self.nlli,
        self.kldi_q2p,
        self.kldi_p2q,
        self.kldi_p2g,
    ) = self.scan_results

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1,)))
    self.lr = theano.shared(value=zero_ary, name="srr_lr")
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1")
    self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2")
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p")
    self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q")
    self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g")
    self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s")
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w")
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    for net in (
        self.p_zi_given_xi,
        self.p_sip1_given_zi,
        self.p_x_given_si,
        self.q_zi_given_xi,
    ):
        self.joint_params.extend(net.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
    self.kld_costs = (
        (self.lam_kld_p[0] * self.kld_p)
        + (self.lam_kld_q[0] * self.kld_q)
        + (self.lam_kld_g[0] * self.kld_g)
        + (self.lam_kld_s[0] * self.kld_s)
    )
    self.kld_cost = T.mean(self.kld_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params,
        grads=self.joint_grads,
        alpha=self.lr,
        beta1=self.mom_1,
        beta2=self.mom_2,
        mom2_init=1e-3,
        smoothing=1e-5,
        max_grad_norm=10.0,
    )
    # fold in the updates produced by the scan op (item assignment, so an
    # existing key is overwritten)
    for shared_var, new_value in self.scan_updates.items():
        self.joint_updates[shared_var] = new_value

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()