def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
    """
    Perform one step of the reveal-and-refine loop.

    ss is a pre-activation whose softplus scales the residual between
    self.x_out and the current guess, zi_zmuv is the noise used to
    reparametrize both policies' latent samples, p_masks/q_masks are the
    masks proposed for this step, si is the current belief state, and
    mi_p/mi_q are the masks already revealed to the primary/guide policy.

    Returns (sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g):
    the updated belief state, the updated primary and guide masks, the
    NLL cost on the values newly revealed to the primary policy, and the
    KLds KL(q || p), KL(p || q) and KL(p || N(0, I)) for this step.
    """
    def draw(mean, logvar):
        # reparametrized sample which reuses this step's shared noise
        return mean + (T.exp(0.5 * logvar) * zi_zmuv)

    # express the belief state in observation space, and get the residual
    # w.r.t. the target scaled by softplus(ss)
    x_guess = self._from_si_to_x(si)
    scaled_resid = T.log(1.0 + T.exp(ss)) * (self.x_out - x_guess)

    # primary policy: true values where revealed, current guess elsewhere
    p_obs = (mi_p * self.x_out) + ((1.0 - mi_p) * x_guess)
    p_grad = mi_p * scaled_resid

    # guide policy: grow its mask first, then mask the residual with the
    # grown mask. the guide reuses the primary policy's masked observation.
    q_fresh = (1.0 - mi_q) * q_masks
    mip1_q = mi_q + q_fresh
    q_obs = p_obs
    q_grad = mip1_q * scaled_resid

    # conditional Gaussians over zi from the primary and guide policies
    p_mean, p_logvar = self.p_zi_given_xi.apply(
        T.horizontal_stack(p_obs, p_grad), do_samples=False)
    q_mean, q_logvar = self.q_zi_given_xi.apply(
        T.horizontal_stack(q_obs, q_grad), do_samples=False)
    zi_from_p = draw(p_mean, p_logvar)
    zi_from_q = draw(q_mean, q_logvar)

    # self.train_switch blends between guide samples and primary samples
    zi = (self.train_switch[0] * zi_from_q) + \
        ((1.0 - self.train_switch[0]) * zi_from_p)

    # KL(q || p), KL(p || q), and KL(p || N(0, I)) for this step
    kldi_q2p = gaussian_kld(q_mean, q_logvar, p_mean, p_logvar)
    kldi_p2q = gaussian_kld(p_mean, p_logvar, q_mean, q_logvar)
    kldi_p2g = gaussian_kld(p_mean, p_logvar, 0.0, 0.0)

    # update the belief state using the sampled zi
    heads = self.p_sip1_given_zi.apply(zi)
    proposal = heads[0]
    if self.step_type != "jump":
        # gated (LSTM-like) mix of the old state and the proposal
        write_gate = T.nnet.sigmoid(2.0 + heads[1])
        erase_gate = T.nnet.sigmoid(2.0 + heads[2])
        sip1 = (erase_gate * si) + (write_gate * proposal)
    else:
        # jump steps replace the belief state outright
        sip1 = proposal

    # grow the primary policy's mask, and compute NLL only on the values
    # which were newly revealed to it in this step
    p_fresh = (1.0 - mi_p) * p_masks
    mip1_p = mi_p + p_fresh
    nlli = self._construct_nll_costs(sip1, self.x_out, p_fresh)
    return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g
def imp_step_func(zi_zmuv, si):
    """
    Perform one imputation step, updating the current guesses si.

    zi_zmuv is the noise used to reparametrize the latent samples of both
    the global policy (self.p_zi_given_xi) and the guide policy
    (self.q_zi_given_xi), and si is the current state of the guesses.

    Returns (sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g): the updated state,
    the NLL cost computed by self._construct_nll_costs for that state, and
    the KLds KL(q || p), KL(p || q) and KL(p || global prior).

    Raises AssertionError when self.step_type is not one of 'jump', 'add',
    'lstm' or 'layer'.
    """
    si_as_x = self._si_as_x(si)
    xi_unmasked = self.x_out
    # keep values of x_out where x_mask is set, and use guesses elsewhere
    xi_masked = (self.x_mask * xi_unmasked) + \
                ((1.0 - self.x_mask) * si_as_x)
    # get samples of next zi, according to the global policy
    zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
    zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
    # get samples of next zi, according to the guide policy, which also
    # gets to see the unmasked values
    zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
    zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
    # make zi samples that can be switched between zi_p and zi_q
    zi = ((self.train_switch[0] * zi_q) + \
          ((1.0 - self.train_switch[0]) * zi_p))
    # compute relevant KLds for this step
    kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                            zi_p_mean, zi_p_logvar) # KL(q || p)
    kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                            zi_q_mean, zi_q_logvar) # KL(p || q)
    kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                            0.0, 0.0) # KL(p || global prior)
    # compute the next si, given the sampled zi
    hydra_out = self.p_sip1_given_zi.apply(zi)
    si_step = hydra_out[0]
    if (self.step_type == 'jump'):
        # jump steps always completely overwrite the current guesses
        sip1 = si_step
    elif (self.step_type == 'add'):
        # add steps just update the guesses additively
        sip1 = si + si_step
    elif (self.step_type == 'lstm'):
        # LSTM-style updates with write and erase gates
        write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
        erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
        sip1 = (erase_gate * si) + (write_gate * si_step)
    elif (self.step_type == 'layer'):
        # convex combination of the old guesses and the proposed ones
        alpha_gate = T.nnet.sigmoid(hydra_out[1])
        sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
    else:
        # raise explicitly (rather than "assert False") so that the check
        # is not stripped when running python with -O
        raise AssertionError("Unknown step type!")
    # compute NLL for the current imputation
    nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
    return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
def _construct_compute_fe_terms(self):
    """
    Build an estimator for the log-likelihood and posterior KL-divergence
    terms of the variational free-energy.

    The returned function takes (X, sample_count), evaluates the compiled
    one-sample terms sample_count times, and returns [mean_nll, mean_kld].
    """
    # symbolic data input. the model's Xc and Xm inputs get replaced by
    # all-zero matrices shaped like the data input.
    data_in = T.matrix()
    zeros_for_xc = T.zeros_like(data_in)
    zeros_for_xm = T.zeros_like(data_in)
    # log-likelihood term, using the likelihood selected by self.x_type
    if self.x_type == 'bernoulli':
        ll_term = log_prob_bernoulli(self.x, self.xg)
    else:
        ll_term = log_prob_gaussian2(self.x, self.xg,
                                     log_vars=self.bounded_logvar)
    # KL-divergence term, summed over axis 1
    kld_term = T.sum(
        gaussian_kld(self.q_z_given_x.output_mean,
                     self.q_z_given_x.output_logvar,
                     self.prior_mean, self.prior_logvar),
        axis=1)
    # compile the function for a one-sample estimate of both terms
    replacements = {self.Xd: data_in,
                    self.Xc: zeros_for_xc,
                    self.Xm: zeros_for_xm}
    fe_term_sample = theano.function(inputs=[data_in],
                                     outputs=[ll_term, kld_term],
                                     givens=replacements)

    def fe_term_estimator(X, sample_count):
        # running sums with one entry per row of X
        row_count = X.shape[0]
        ll_total = np.zeros((row_count,))
        kld_total = np.zeros((row_count,))
        for _ in range(sample_count):
            ll_vals, kld_vals = fe_term_sample(X)
            ll_total = ll_total + ll_vals.ravel()
            kld_total = kld_total + kld_vals.ravel()
        # average over samples, and negate log-likelihood to get NLL
        denom = float(sample_count)
        return [-ll_total / denom, kld_total / denom]

    return fe_term_estimator
def _construct_chain_kld_cost(self, cost_decay=0.1):
    """
    Construct the posterior KL-d from prior part of cost to minimize.

    This is for operation in "free chain" mode, where a seed point is used
    to initialize a long(ish) running markov chain.

    cost_decay must lie in [0, 1]. Step i of the chain gets weight
    cost_decay**i, and the returned symbolic cost is the weighted mean of
    the per-step costs. Each per-step cost is the batch mean of the KLd
    (summed over axis 1) between that step's posterior and the prior.

    Raises ZeroDivisionError when self.chain_len is 0, since there are
    then no step weights to normalize by.
    """
    assert((cost_decay >= 0.0) and (cost_decay <= 1.0))
    kld_costs = []
    step_weights = []
    step_weight = 1.0
    for i in range(self.chain_len):
        IN_i = self.IN_chain[i]
        # basic variational term on KL divergence between post and prior
        kld_i = gaussian_kld(IN_i.output_mean, IN_i.output_logvar,
                             self.prior_mean, self.prior_logvar)
        kld_i_costs = T.sum(kld_i, axis=1)
        # sum and reweight the KLd cost for this step in the chain
        c = T.mean(kld_i_costs)
        kld_costs.append(step_weight * c)
        step_weights.append(step_weight)
        step_weight = step_weight * cost_decay
    # normalize by the total weight to get a weighted mean over steps
    kld_cost = sum(kld_costs) / sum(step_weights)
    return kld_cost
def _construct_chain_kld_cost(self, cost_decay=0.1):
    """
    Construct the posterior KL-d from prior part of cost to minimize.

    This is for operation in "free chain" mode, where a seed point is used
    to initialize a long(ish) running markov chain.

    The cost for each chain step is the batch mean of its KLd from the
    prior (summed over axis 1). Steps are weighted by 1.0, cost_decay,
    cost_decay**2, ..., and the weighted sum of step costs is divided by
    the sum of the weights. cost_decay must lie in [0, 1].

    Raises ZeroDivisionError when self.chain_len is 0, since the sum of
    the step weights is then zero.
    """
    assert ((cost_decay >= 0.0) and (cost_decay <= 1.0))
    step_weight = 1.0
    step_weights = []
    kld_costs = []
    for i in range(self.chain_len):
        IN_i = self.IN_chain[i]
        # basic variational term on KL divergence between post and prior
        kld_i = gaussian_kld(IN_i.output_mean, IN_i.output_logvar,
                             self.prior_mean, self.prior_logvar)
        # per-input KLd for this step, averaged over the batch
        c = T.mean(T.sum(kld_i, axis=1))
        # reweight the KLd cost for this step in the chain
        kld_costs.append(step_weight * c)
        step_weights.append(step_weight)
        step_weight = step_weight * cost_decay
    kld_cost = sum(kld_costs) / sum(step_weights)
    return kld_cost
def _construct_compute_fe_terms(self):
    """
    Build an estimator for the log-likelihood and posterior KL-divergence
    terms of the variational free-energy.

    The returned function takes (X, sample_count), converts X with to_fX,
    evaluates the compiled one-sample terms sample_count times, and
    returns [mean_nll, mean_kld].
    """
    # pick the log-likelihood matching the observation type
    if self.x_type != 'bernoulli':
        ll_term = log_prob_gaussian2(self.x_in, self.xg,
                                     log_vars=self.bounded_logvar)
    else:
        ll_term = log_prob_bernoulli(self.x_in, self.xg)
    # KLd between posterior and prior, summed over axis 1
    per_dim_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                self.prior_mean, self.prior_logvar)
    kld_term = T.sum(per_dim_klds, axis=1)
    # compile theano function for a one-sample free-energy estimate
    fe_term_sample = theano.function(inputs=[self.x_in],
                                     outputs=[ll_term, kld_term])

    def fe_term_estimator(X, sample_count):
        X = to_fX(X)
        sum_shape = (X.shape[0],)
        ll_sum = np.zeros(sum_shape)
        kld_sum = np.zeros(sum_shape)
        for _ in range(sample_count):
            sample = fe_term_sample(X)
            ll_sum = ll_sum + sample[0].ravel()
            kld_sum = kld_sum + sample[1].ravel()
        # turn the sums into means, flipping the sign of the LL to get NLL
        n = float(sample_count)
        mean_nll = -ll_sum / n
        mean_kld = kld_sum / n
        return [mean_nll, mean_kld]

    return fe_term_estimator
def _construct_compute_fe_terms(self):
    """
    Build a multi-sample estimator of the variational free-energy terms.

    A theano function mapping self.x_in to [ll_term, kld_term] is compiled
    here. The returned wrapper takes (X, sample_count), calls the compiled
    function sample_count times on to_fX(X), and returns the list
    [mean_nll, mean_kld] holding the sample-averaged values.
    """
    def add_flat(total, values):
        # add a flattened batch of values onto a running total
        return total + values.ravel()

    # symbolic log-likelihood for the configured observation type
    if self.x_type == 'bernoulli':
        ll_term = log_prob_bernoulli(self.x_in, self.xg)
    else:
        ll_term = log_prob_gaussian2(self.x_in, self.xg,
                                     log_vars=self.bounded_logvar)
    # symbolic KLd of the posterior from the prior, summed over axis 1
    kld_term = T.sum(
        gaussian_kld(self.z_mean, self.z_logvar,
                     self.prior_mean, self.prior_logvar),
        axis=1)
    # one call of this function gives a one-sample estimate
    fe_term_sample = theano.function(inputs=[self.x_in],
                                     outputs=[ll_term, kld_term])

    def fe_term_estimator(X, sample_count):
        X = to_fX(X)
        ll_sum = np.zeros((X.shape[0],))
        kld_sum = np.zeros((X.shape[0],))
        for _ in range(sample_count):
            ll_vals, kld_vals = fe_term_sample(X)
            ll_sum = add_flat(ll_sum, ll_vals)
            kld_sum = add_flat(kld_sum, kld_vals)
        return [-ll_sum / float(sample_count),
                kld_sum / float(sample_count)]

    return fe_term_estimator
def imp_step_func(zi_zmuv, si):
    """
    Perform one imputation step, updating the current state si.

    zi_zmuv is the noise used to reparametrize samples from both the
    global policy and the guide policy. Returns the tuple
    (sip1, nlli, kldi_q2p, kldi_p2q).

    When self.use_osm_mode is set, the step is driven by the global policy
    only, and both returned KLds are gaussian_kld of the global policy
    against mean 0.0 and log-variance 0.0.
    """
    def draw(mean, logvar):
        # reparametrized sample which reuses this step's shared noise
        return mean + (T.exp(0.5 * logvar) * zi_zmuv)

    # true values where x_mask is set, current guesses everywhere else
    x_guess = self.obs_transform(si)
    x_mixed = (self.x_mask * self.x_out) + \
              ((1.0 - self.x_mask) * x_guess)

    # global policy conditions only on the mixed observation
    p_mean, p_logvar = self.p_zi_given_xi.apply(
        x_mixed, do_samples=False)
    zi_from_p = draw(p_mean, p_logvar)
    # guide policy additionally conditions on the full target
    q_mean, q_logvar = self.q_zi_given_x_xi.apply(
        T.horizontal_stack(x_mixed, self.x_out),
        do_samples=False)
    zi_from_q = draw(q_mean, q_logvar)

    if not self.use_osm_mode:
        # make zi samples that can be switched between guide and global
        zi = ((self.train_switch[0] * zi_from_q) +
              ((1.0 - self.train_switch[0]) * zi_from_p))
        kldi_q2p = gaussian_kld(q_mean, q_logvar, p_mean, p_logvar)
        kldi_p2q = gaussian_kld(p_mean, p_logvar, q_mean, q_logvar)
    else:
        # always follow the global policy, and measure its KLd against
        # mean 0.0 / log-variance 0.0 in both KLd slots
        zi = zi_from_p
        kldi_q2p = gaussian_kld(p_mean, p_logvar, 0.0, 0.0)
        kldi_p2q = gaussian_kld(p_mean, p_logvar, 0.0, 0.0)

    # compute the next state, given the sampled zi
    heads = self.p_xip1_given_zi.apply(zi)
    proposal = heads[0]
    if (self.step_type != 'jump'):
        # LSTM-style update with write and erase gates
        write_gate = T.nnet.sigmoid(2.0 + heads[1])
        erase_gate = T.nnet.sigmoid(2.0 + heads[2])
        sip1 = (erase_gate * si) + (write_gate * proposal)
    else:
        # jump steps always do a full swap (like standard VAE)
        sip1 = proposal

    # compute NLL for the current imputation. note that the mask passed
    # here is x_mask multiplied by 0.0.
    nlli = self._construct_nll_costs(sip1, self.x_out, 0.0*self.x_mask)
    return sip1, nlli, kldi_q2p, kldi_p2q
def imp_step_func(zi_zmuv, si):
    """
    Perform one imputation step, updating the current state si.

    zi_zmuv is the noise used to reparametrize samples from both the
    global policy and the guide policy. Returns the tuple
    (sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g), where the KLds are
    KL(q || p), KL(p || q) and KL(p || global prior).
    """
    def draw(mean, logvar):
        # reparametrized sample which reuses this step's shared noise
        return mean + (T.exp(0.5 * logvar) * zi_zmuv)

    x_guess = self._from_si_to_x(si)
    x_true = self.x_out
    # true values where x_mask is set, current guesses everywhere else
    x_mixed = (self.x_mask * x_true) + \
              ((1.0 - self.x_mask) * x_guess)
    # residual of the guess. the masked version substitutes
    # self.grad_null wherever x_mask is not set.
    resid_full = self.x_out - x_guess
    resid_mixed = (self.x_mask * resid_full) + \
                  ((1.0 - self.x_mask) * self.grad_null)

    # global policy gets the masked observation and masked residual
    p_mean, p_logvar = self.p_zi_given_xi.apply(
        T.horizontal_stack(x_mixed, resid_mixed),
        do_samples=False)
    zi_from_p = draw(p_mean, p_logvar)
    # guide policy gets the masked observation and the full residual
    q_mean, q_logvar = self.q_zi_given_xi.apply(
        T.horizontal_stack(x_mixed, resid_full),
        do_samples=False)
    zi_from_q = draw(q_mean, q_logvar)

    # make zi samples that can be switched between guide and global
    zi = ((self.train_switch[0] * zi_from_q) +
          ((1.0 - self.train_switch[0]) * zi_from_p))

    # KL(q || p), KL(p || q), and KL(p || global prior)
    kldi_q2p = gaussian_kld(q_mean, q_logvar, p_mean, p_logvar)
    kldi_p2q = gaussian_kld(p_mean, p_logvar, q_mean, q_logvar)
    kldi_p2g = gaussian_kld(p_mean, p_logvar, 0.0, 0.0)

    # compute the next state, given the sampled zi
    heads = self.p_sip1_given_zi.apply(zi)
    proposal = heads[0]
    if (self.step_type != 'jump'):
        # gated update of the current guesses, like an LSTM
        write_gate = T.nnet.sigmoid(3.0 + heads[1])
        erase_gate = T.nnet.sigmoid(3.0 + heads[2])
        sip1 = (erase_gate * si) + (write_gate * proposal)
    else:
        # jump steps always completely overwrite the current guesses
        sip1 = proposal

    # compute NLL for the current imputation
    nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
    return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
def _construct_kld_costs(self):
    """
    Construct the KLd-from-prior part of the cost to minimize.

    The KLd between the posterior (self.z_mean, self.z_logvar) and the
    prior (self.prior_mean, self.prior_logvar) is summed over axis 1,
    keeping that axis in the result.
    """
    per_dim_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                self.prior_mean, self.prior_logvar)
    return T.sum(per_dim_klds, axis=1, keepdims=True)
def _construct_kld_costs(self):
    """
    Construct the posterior KL-divergence part of cost to minimize.

    Returns the list [kld_z, kld_hi_cond, kld_hi_glob]. Each entry is a
    symbolic cost summed over axis 1 (with that axis kept):
      kld_z: mix of the KLd and squared KLd for the distribution over z.
      kld_hi_cond: mix of the KLd and squared KLd between each
          self.q_hi_given_x_si[i] and self.p_hi_given_si[i], summed over
          all self.ir_steps steps.
      kld_hi_glob: squared KLd between each self.p_hi_given_si[i] and a
          Gaussian with zero mean and unit variance, summed over steps.
    The mixes weight the plain KLd by self.l1l2_weight[0] and the squared
    KLd by (1.0 - self.l1l2_weight[0]).
    """
    # construct KLd cost for the distributions over hi. the prior over
    # hi is given by a distribution conditioned on si, which we estimate
    # using self.p_hi_given_si[i]. the conditionals produced by each
    # self.p_hi_given_si[i] will also be regularized towards a shared
    # prior, e.g. a Gaussian with zero mean and unit variance.
    kld_hi_conds = []
    kld_hi_globs = []
    for i in range(self.ir_steps):
        kld_hi_cond = gaussian_kld( \
                self.q_hi_given_x_si[i].output_mean, \
                self.q_hi_given_x_si[i].output_logvar, \
                self.p_hi_given_si[i].output_mean, \
                self.p_hi_given_si[i].output_logvar)
        kld_hi_glob = gaussian_kld( \
                self.p_hi_given_si[i].output_mean, \
                self.p_hi_given_si[i].output_logvar, \
                0.0, 0.0)
        kld_hi_cond_l1l2 = (self.l1l2_weight[0] * kld_hi_cond) + \
                ((1.0 - self.l1l2_weight[0]) * kld_hi_cond**2.0)
        kld_hi_conds.append(T.sum(kld_hi_cond_l1l2, \
                axis=1, keepdims=True))
        kld_hi_globs.append(T.sum(kld_hi_glob**2.0, \
                axis=1, keepdims=True))
    # compute the batch-wise costs
    kld_hi_cond = sum(kld_hi_conds)
    kld_hi_glob = sum(kld_hi_globs)
    # construct KLd cost for the distributions over z
    kld_z_all = gaussian_kld(self.q_z_given_x.output_mean, \
            self.q_z_given_x.output_logvar, \
            0.0, 0.0)
    kld_z_l1l2 = (self.l1l2_weight[0] * kld_z_all) + \
            ((1.0 - self.l1l2_weight[0]) * kld_z_all**2.0)
    kld_z = T.sum(kld_z_l1l2, \
            axis=1, keepdims=True)
    return [kld_z, kld_hi_cond, kld_hi_glob]
def _construct_compute_post_klds(self):
    """
    Compile a theano function for inspecting the approximate posteriors.

    The compiled function takes a value for self.x_in and returns the KLd
    between the posterior (self.z_mean, self.z_logvar) and the prior
    (self.prior_mean, self.prior_logvar), without any summing.
    """
    # symbolic KLds between the posterior and the prior
    posterior_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                  self.prior_mean, self.prior_logvar)
    # compile and return the function which evaluates them
    return theano.function(inputs=[self.x_in], outputs=posterior_klds)
def _construct_compute_post_klds(self):
    """
    Build a function for inspecting the variational approximate
    posteriors inferred for some inputs.

    The returned function takes a matrix X and returns the list
    [f_kld_z, f_kld_hi_cond, f_kld_hi_glob]: the KLd for the
    initialization step, and the conditional and global KLds for the
    refinement steps, each summed over all self.ir_steps steps.
    """
    def step_klds(q_net, p_net):
        # KLd of q from p, and KLd of p from mean 0.0 / log-variance 0.0
        cond = gaussian_kld(q_net.output_mean, q_net.output_logvar,
                            p_net.output_mean, p_net.output_logvar)
        glob = gaussian_kld(p_net.output_mean, p_net.output_logvar,
                            0.0, 0.0)
        return cond, glob

    # symbolic input which stands in for self.x
    x_sym = T.matrix()
    # collect symbolic expressions for the KLds of every refinement step
    cond_klds, glob_klds = [], []
    for i in range(self.ir_steps):
        cond_i, glob_i = step_klds(self.q_hi_given_x_si[i],
                                   self.p_hi_given_si[i])
        cond_klds.append(cond_i)
        glob_klds.append(glob_i)
    # output layout: all conditional KLds, then all global KLds, then
    # the KLd for the initialization step in the final position
    kld_z_all = gaussian_kld(self.q_z_given_x.output_mean,
                             self.q_z_given_x.output_logvar,
                             0.0, 0.0)
    all_klds = cond_klds + glob_klds
    all_klds.append(kld_z_all)
    # compile the function which evaluates all of the KLds at once
    kld_func = theano.function(inputs=[x_sym], outputs=all_klds,
                               givens={ self.x: x_sym })

    def post_kld_computer(X):
        kld_vals = kld_func(X)
        # accumulators shaped like the first output of kld_func
        cond_total = np.zeros(kld_vals[0].shape)
        glob_total = np.zeros(kld_vals[0].shape)
        for j in range(self.ir_steps):
            cond_total += kld_vals[j]
            glob_total += kld_vals[j + self.ir_steps]
        return [kld_vals[-1], cond_total, glob_total]

    return post_kld_computer
def _construct_kld_cost(self):
    """
    Compute (analytically) the KL divergence between each approximate
    posterior encoded by self.output_mean/self.output_logvar and a
    Gaussian prior with mean 0 and standard deviation self.prior_sigma.

    The KLds are summed over axis 1, keeping that axis in the result.
    """
    # the prior is specified by its mean and its log-variance
    klds = gaussian_kld(self.output_mean, self.output_logvar,
                        0.0, np.log(self.prior_sigma**2.0))
    return T.sum(klds, axis=1, keepdims=True)
def _construct_kld_costs(self):
    """
    Construct the KLd-from-prior parts of the cost to minimize.

    Returns the list [kld_l1_costs, kld_l2_costs]. The first entry is the
    KLd between posterior and prior summed over axis 1 (axis kept), and
    the second is the squared difference between that sum and
    self.kld_z_mean[0].
    """
    # KLds are computed independently for each input and each latent
    # variable dimension
    per_dim_klds = gaussian_kld(self.z_mean, self.z_logvar,
                                self.prior_mean, self.prior_logvar)
    # sum over latent dimensions to get the "l1" cost for each input
    l1_part = T.sum(per_dim_klds, axis=1, keepdims=True)
    # the "l2" cost penalizes deviation from the stored mean KLd
    l2_part = (l1_part - self.kld_z_mean[0])**2.0
    return [l1_part, l2_part]
def _construct_kld_costs(self, p=1.0):
    """
    Construct the posterior KL-divergence part of cost to minimize.

    Every KLd is raised to the power p before being summed over axis 1
    (with that axis kept). Returns the list
    [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q], where the "hi" costs
    are also summed over the self.ir_steps refinement steps.
    """
    def powered_sum(klds):
        # raise KLds to the power p, then sum over axis 1
        return T.sum(klds**p, axis=1, keepdims=True)

    # KLd costs for the refinement steps, in both directions
    step_ids = range(self.ir_steps)
    kld_hi_q2p = sum([powered_sum(self.kldi_q2p[i]) for i in step_ids])
    kld_hi_p2q = sum([powered_sum(self.kldi_p2q[i]) for i in step_ids])
    # KLd costs for the distributions over z, in both directions
    kld_z_q2p = powered_sum(
        gaussian_kld(self.q_z_mean, self.q_z_logvar,
                     self.p_z_mean, self.p_z_logvar))
    kld_z_p2q = powered_sum(
        gaussian_kld(self.p_z_mean, self.p_z_logvar,
                     self.q_z_mean, self.q_z_logvar))
    return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q]
def ir_step_func(hi_zmuv, sim1):
    """
    Perform one iterative refinement step, taking state sim1 to state si.

    hi_zmuv is the noise used to reparametrize the samples of hi from both
    the generator conditional and the variational conditional. Returns
    the tuple (si, nlli, kldi_q2p, kldi_p2q).
    """
    def draw(mean, logvar):
        # reparametrized sample which reuses this step's shared noise
        return (T.exp(0.5 * logvar) * hi_zmuv) + mean

    def gate(pre_act):
        # sigmoid gate with its pre-activation shifted by 2.0
        return 1.0 * T.nnet.sigmoid(pre_act + 2.0)

    # transform state -> obs, and get its difference from the target
    prev_obs = self.obs_transform(sim1)
    resid = self.x_out - prev_obs

    # the generator's conditional over hi sees only the current obs
    p_mean, p_logvar = self.p_hi_given_si.apply(
        prev_obs, do_samples=False)
    # the variational conditional over hi also sees the residual
    q_mean, q_logvar = self.q_hi_given_x_si.apply(
        T.horizontal_stack(resid, prev_obs),
        do_samples=False)
    hi_from_q = draw(q_mean, q_logvar)
    hi_from_p = draw(p_mean, p_logvar)
    # make hi samples that can be switched between the two sources
    hi = ((self.train_switch[0] * hi_from_q) +
          ((1.0 - self.train_switch[0]) * hi_from_p))

    # perform an LSTM-like update of the state sim1 -> si
    ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
    in_gate = gate(ig_vals)
    forget_gate = gate(fg_vals)
    si = (in_vals * in_gate) + (sim1 * forget_gate)

    # compute generator NLL for this step
    nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
    # compute the KLds for this step, in both directions
    kldi_q2p = gaussian_kld(q_mean, q_logvar, p_mean, p_logvar)
    kldi_p2q = gaussian_kld(p_mean, p_logvar, q_mean, q_logvar)
    return si, nlli, kldi_q2p, kldi_p2q
def _construct_compute_post_klds(self):
    """
    Compile a theano function for inspecting the approximate posteriors.

    The compiled function takes a single matrix, which replaces self.Xd,
    and returns the KLd between self.q_z_given_x's output distribution
    and the prior (self.prior_mean, self.prior_logvar), without summing.
    """
    # symbolic data input. the model's Xc and Xm inputs get replaced by
    # all-zero matrices shaped like the data input.
    data_in = T.matrix()
    zeros_for_xc = T.zeros_like(data_in)
    zeros_for_xm = T.zeros_like(data_in)
    posterior = self.q_z_given_x
    posterior_klds = gaussian_kld(posterior.output_mean,
                                  posterior.output_logvar,
                                  self.prior_mean, self.prior_logvar)
    # compile and return the function which evaluates the KLds
    replacements = {self.Xd: data_in,
                    self.Xc: zeros_for_xc,
                    self.Xm: zeros_for_xm}
    return theano.function(inputs=[data_in], outputs=posterior_klds,
                           givens=replacements)
def _construct_kld_costs(self):
    """
    Construct the KLd-from-prior parts of the cost to minimize.

    Returns the list [kld_l1_costs, kld_l2_costs]. The first entry is the
    KLd between posterior and prior summed over axis 1 (axis kept). The
    second is the squared deviation of that sum from its batch mean,
    kept only for inputs whose summed KLd exceeds the batch mean. Both
    the batch mean and the mask are wrapped in disconnected_grad.
    """
    # KLds are computed independently for each input and each latent
    # variable dimension
    posterior = self.q_z_given_x
    per_dim_klds = gaussian_kld(posterior.output_mean,
                                posterior.output_logvar,
                                self.prior_mean, self.prior_logvar)
    # sum over latent dimensions to get the "l1" cost for each input
    l1_part = T.sum(per_dim_klds, axis=1, keepdims=True)
    # batch mean of the summed KLds, and which inputs are above it
    raw_mean = T.mean(l1_part)
    raw_above = l1_part > raw_mean
    batch_mean = theano.gradient.disconnected_grad(raw_mean)
    above_mean = theano.gradient.disconnected_grad(raw_above)
    # squared deviation from the batch mean, for the above-mean inputs
    l2_part = T.sum(((l1_part-batch_mean)**2.0 * above_mean),
                    axis=1, keepdims=True)
    return [l1_part, l2_part]
def _construct_compute_post_stats(self):
    """
    Compile a theano function computing stats which describe the latent
    posteriors inferred by this model.

    The compiled function takes values for (self.Xd, self.Xc, self.Xm)
    and returns [all_klds, obs_klds, dim_klds, dim_vars]:
      all_klds: KLd of self.IN's output distribution from a Gaussian
          with mean 0.0 and log-variance 0.0.
      obs_klds: all_klds summed over axis 1.
      dim_klds: all_klds summed over axis 0, divided by the row count.
      dim_vars: squared self.IN.output_mean summed over axis 0, divided
          by the row count.
    """
    # number of rows in the data input, as a floatX scalar
    row_count = T.cast(self.Xd.shape[0], 'floatX')

    def row_average(values):
        # sum over axis 0 and divide by the number of rows
        return T.sum(values, axis=0) / row_count

    all_klds = gaussian_kld(self.IN.output_mean, self.IN.output_logvar,
                            0.0, 0.0)
    obs_klds = T.sum(all_klds, axis=1)
    dim_klds = row_average(all_klds)
    dim_vars = row_average(self.IN.output_mean**2.0)
    # make a theano function to compute the stats
    return theano.function(
        inputs=[self.Xd, self.Xc, self.Xm],
        outputs=[all_klds, obs_klds, dim_klds, dim_vars])
def chain_step_func(zi_zmuv, xim1):
    """
    Perform one step of the chain, going from xim1 to the next xgi.

    zi_zmuv is the noise used to reparametrize the sample of zi. Returns
    the tuple (xgi, nlli, kldi): the transformed generator output, the
    flattened value of self._log_prob for this step, and the KLd of the
    inferred distribution over zi from (self.p_z_mean, self.p_z_logvar),
    summed over axis 1.
    """
    # infer a Gaussian over zi from the previous point, and sample from
    # it by shifting and scaling the provided noise
    z_mean, z_logvar = self.IN.apply(xim1, do_samples=False)
    z_sample = (T.exp(0.5 * z_logvar) * zi_zmuv) + z_mean
    # generate the next point, using the last output of the generator,
    # and apply the observation "mean" transform
    gen_outputs = self.GN.apply(z_sample)
    xgi = self.xt_transform(gen_outputs[-1])
    # 'walkout' chains are scored against self.x_d, while other chains
    # are scored against the previous point in the chain
    target = self.x_d if self.chain_type == 'walkout' else xim1
    nlli = self._log_prob(target, xgi).flatten()
    kldi = T.sum(
        gaussian_kld(z_mean, z_logvar, self.p_z_mean, self.p_z_logvar),
        axis=1)
    return xgi, nlli, kldi
def _construct_compute_fe_terms(self):
    """
    Build a multi-sample estimator of the variational free-energy terms.

    A theano function mapping a data matrix to [ll_term, kld_term] is
    compiled here. The returned wrapper takes (X, sample_count), calls
    the compiled function sample_count times on X, and returns the list
    [mean_nll, mean_kld] holding the sample-averaged values.
    """
    # symbolic data input, with zero matrices standing in for the
    # model's Xc and Xm inputs
    x_data = T.matrix()
    x_zeros_c = T.zeros_like(x_data)
    x_zeros_m = T.zeros_like(x_data)
    # symbolic log-likelihood for the configured observation type
    if self.x_type != 'bernoulli':
        ll_term = log_prob_gaussian2(self.x, self.xg,
                                     log_vars=self.bounded_logvar)
    else:
        ll_term = log_prob_bernoulli(self.x, self.xg)
    # symbolic KLd of the posterior from the prior, summed over axis 1
    per_dim_klds = gaussian_kld(self.q_z_given_x.output_mean,
                                self.q_z_given_x.output_logvar,
                                self.prior_mean, self.prior_logvar)
    kld_term = T.sum(per_dim_klds, axis=1)
    # one call of this function gives a one-sample estimate
    fe_term_sample = theano.function(
        inputs=[x_data],
        outputs=[ll_term, kld_term],
        givens={self.Xd: x_data, self.Xc: x_zeros_c, self.Xm: x_zeros_m})

    def fe_term_estimator(X, sample_count):
        sum_shape = (X.shape[0], )
        ll_sum = np.zeros(sum_shape)
        kld_sum = np.zeros(sum_shape)
        for _ in range(sample_count):
            sample = fe_term_sample(X)
            ll_sum = ll_sum + sample[0].ravel()
            kld_sum = kld_sum + sample[1].ravel()
        # average over samples, and negate log-likelihood to get NLL
        mean_nll = -ll_sum / float(sample_count)
        mean_kld = kld_sum / float(sample_count)
        return [mean_nll, mean_kld]

    return fe_term_estimator
def __init__(self, rng=None, \
        x_in=None, x_out=None, \
        p_s0_given_z=None, \
        p_hi_given_si=None, \
        p_sip1_given_si_hi=None, \
        q_z_given_x=None, \
        q_hi_given_x_si=None, \
        obs_dim=None, \
        z_dim=None, h_dim=None, \
        ir_steps=4, params=None, \
        shared_param_dicts=None):
    """
    Build the symbolic graph, costs, parameter updates and compiled
    functions for this multi-stage model.

    Parameters:
        rng: object providing randint(); used to seed this model's
            random stream. NOTE(review): the default of None fails at
            rng.randint(...), so a rng must always be passed.
        x_in: symbolic input fed (after dropout masking) to q_z_given_x.
        x_out: symbolic target used for the NLL terms and for the
            residual fed to q_hi_given_x_si.
        p_s0_given_z: net producing the initial state s0 from z.
        p_hi_given_si: net producing mean/logvar of hi from the current
            state (after applying obs_transform).
        p_sip1_given_si_hi: net producing the three outputs (ig_vals,
            fg_vals, in_vals) used for the gated state update.
        q_z_given_x: net producing mean/logvar/sample of z from x_in.
        q_hi_given_x_si: net producing mean/logvar of hi from the
            residual and the current state (after obs_transform).
        obs_dim, z_dim, h_dim, ir_steps: recorded on self. z_dim also
            sets the size of p_z_mean and p_z_logvar.
        params: dict which must contain 'x_type' ('bernoulli' or
            'gaussian') and may contain 'obs_transform' ('sigmoid' or
            'none').
        shared_param_dicts: None to create fresh shared parameters, or
            a dict holding 'p_z_mean', 'p_z_logvar' and 'obs_logvar' to
            reuse existing ones.
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # select the transform from state space to observation space. the
    # "sigmoid" option is a sigmoid applied to 20.0 * tanh(0.05 * x).
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or \
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
    # bernoulli observations always use the sigmoid transform, which
    # overrides any 'obs_transform' setting in params
    if self.x_type == 'bernoulli':
        self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
    self.shared_param_dicts = shared_param_dicts
    # record the dimensions of various spaces relevant to this model
    self.obs_dim = obs_dim
    self.z_dim = z_dim
    self.h_dim = h_dim
    self.ir_steps = ir_steps
    # grab handles to the relevant InfNets
    self.q_z_given_x = q_z_given_x
    self.q_hi_given_x_si = q_hi_given_x_si
    self.p_s0_given_z = p_s0_given_z
    self.p_hi_given_si = p_hi_given_si
    self.p_sip1_given_si_hi = p_sip1_given_si_hi
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan
    # setup switching variable for changing between sampling/training
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
    self.set_train_switch(1.0)
    # setup a variable for controlling dropout noise
    self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
    self.set_drop_rate(0.0)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
    self.set_lam_kld_l1l2(1.0)
    if self.shared_param_dicts is None:
        # initialize "optimizable" parameters specific to this MSM
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
        # NOTE(review): this obs_dim-sized init_vec is never used, since
        # obs_logvar below is initialized from zero_ary (one element) --
        # confirm whether a per-dimension log-variance was intended.
        init_vec = to_fX( np.zeros((self.obs_dim,)) )
        self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
        # tanh squashing keeps the log-variance within (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
        self.shared_param_dicts = {}
        self.shared_param_dicts['p_z_mean'] = self.p_z_mean
        self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # reuse the shared parameters which were passed in
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
    # setup a function for computing reconstruction log likelihood.
    # note that both versions return the log-probability times -1.0.
    if self.x_type == 'bernoulli':
        self.log_prob_func = lambda xo, xh: \
                (-1.0 * log_prob_bernoulli(xo, xh))
    else:
        self.log_prob_func = lambda xo, xh: \
                (-1.0 * log_prob_gaussian2(xo, xh, \
                 log_vars=self.bounded_logvar))
    # get a drop mask that drops things with probability p. the values
    # which are kept get rescaled by 1 / (1 - p).
    drop_scale = 1. / (1. - self.drop_rate[0])
    drop_rnd = self.rng.uniform(size=self.x_out.shape, \
            low=0.0, high=1.0, dtype=theano.config.floatX)
    drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])
    #############################
    # Setup self.z and self.s0. #
    #############################
    print("Building MSM step 0...")
    drop_x = drop_mask * self.x_in
    self.q_z_mean, self.q_z_logvar, self.z = \
            self.q_z_given_x.apply(drop_x, do_samples=True)
    # get initial observation state
    self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)
    # gather KLd and NLL for the initialization step
    self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                  self.p_z_mean, self.p_z_logvar)
    # NOTE(review): log_prob_func already multiplies the log-probability
    # by -1.0, so the extra -1.0 here makes init_nlls a log-likelihood
    # rather than an NLL -- confirm this is intended.
    self.init_nlls = -1.0 * \
            self.log_prob_func(self.x_out, self.obs_transform(self.s0))
    ##################################################
    # Setup the iterative generation loop using scan #
    ##################################################
    def ir_step_func(hi_zmuv, sim1):
        # one refinement step: takes this step's ZMUV noise and the
        # previous state, and returns (si, nlli, kldi_q2p, kldi_p2q)
        # get variables used throughout this refinement step
        sim1_obs = self.obs_transform(sim1) # transform state -> obs
        grad_ll = self.x_out - sim1_obs
        # get samples of next hi, conditioned on current si
        hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                sim1_obs, do_samples=False)
        # now we build the model for variational hi given si
        hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                T.horizontal_stack(grad_ll, sim1_obs), \
                do_samples=False)
        # both samples are reparametrized using the same noise
        hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
        hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean
        # make hi samples that can be switched between hi_p and hi_q
        hi = ( ((self.train_switch[0] * hi_q) + \
                ((1.0 - self.train_switch[0]) * hi_p)) )
        # p_sip1_given_si_hi is conditioned on si and hi.
        # NOTE(review): only hi is passed to apply() here -- confirm
        # whether the net was meant to receive si as well.
        ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
        # get the transformed values (for an LSTM style update)
        i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
        f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
        # perform an LSTM-like update of the state sim1 -> si
        si = (in_vals * i_gate) + (sim1 * f_gate)
        # compute generator NLL for this step
        nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                hi_p_mean, hi_p_logvar)
        kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                hi_q_mean, hi_q_logvar)
        return si, nlli, kldi_q2p, kldi_p2q

    # only the state is recurrent (initialized with s0). the scan runs
    # once for each slice along the first axis of self.hi_zmuv.
    init_values = [self.s0, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(ir_step_func, \
            outputs_info=init_values, sequences=self.hi_zmuv)
    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
    self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1"
    self.q_params = []
    self.q_params.extend(self.q_z_given_x.mlp_params)
    self.q_params.extend(self.q_hi_given_x_si.mlp_params)
    # Grab all of the "optimizable" parameters in "group 2"
    # NOTE(review): self.obs_logvar is not in either parameter group,
    # so it gets no updates from the code below -- confirm intended.
    self.p_params = [self.p_z_mean, self.p_z_logvar]
    self.p_params.extend(self.p_hi_given_si.mlp_params)
    self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
    self.p_params.extend(self.p_s0_given_z.mlp_params)

    # Make a joint list of parameters group 1/2
    self.joint_params = self.q_params + self.p_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
            self._construct_kld_costs(p=1.0)
    # mix the two KLd directions using lam_kld_q2p and lam_kld_p2q.
    # lam_kld_z only reweights the term for z.
    self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                 (self.lam_kld_p2q[0] * self.kld_z_p2q)
    self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                  (self.lam_kld_p2q[0] * self.kld_hi_p2q)
    self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
    # now do l2 KLd costs
    self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
            self._construct_kld_costs(p=2.0)
    self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                 (self.lam_kld_p2q[0] * self.kl2_z_p2q)
    self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                  (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
    self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
    # compute joint l1/l2 KLd cost
    self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
            ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    self.kl2_cost = T.mean(self.kl2_costs)
    self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL from the final refinement step enters the cost
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                      self.reg_cost
    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_l1l2_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks.
    # the two groups differ only in their learning rate (lr_1 vs. lr_2).
    self.q_updates = get_adam_updates(params=self.q_params, \
            grads=self.joint_grads, alpha=self.lr_1, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.p_updates = get_adam_updates(params=self.p_params, \
            grads=self.joint_grads, alpha=self.lr_2, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.joint_updates = OrderedDict()
    for k in self.q_updates:
        self.joint_updates[k] = self.q_updates[k]
    for k in self.p_updates:
        self.joint_updates[k] = self.p_updates[k]
    # add scan updates, which seem to be required
    for k in self.scan_updates:
        self.joint_updates[k] = self.scan_updates[k]

    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_klds = self._construct_raw_klds()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    print("Compiling data-guided model sampler...")
    self.sample_from_input = self._construct_sample_from_input()
    return
def __init__(self, rng=None, x_in=None, x_out=None,
             p_h_given_z=None, p_x_given_h=None,
             q_z_given_x=None, q_h_given_z_x=None,
             x_dim=None, z_dim=None, h_dim=None, h_det_dim=None,
             params=None, shared_param_dicts=None):
    """Build the symbolic graph, the costs and the compiled functions.

    Args:
        rng: randomness source; only ``rng.randint`` is called, to seed
            this model's own ``RandStream``.
        x_in: symbolic variable passed to ``q_z_given_x``.
        x_out: symbolic variable concatenated with the mean produced by
            ``p_h_given_z`` as input to ``q_h_given_z_x``, and handed to
            ``_construct_nll_costs``.
        p_h_given_z, p_x_given_h, q_z_given_x, q_h_given_z_x: networks
            used through their ``apply`` method and ``mlp_params``.
        x_dim, z_dim, h_dim: sizes recorded on the instance; ``z_dim``
            sets the width of the freshly created prior parameters.
        h_det_dim: when not None, the first ``h_det_dim`` columns of
            ``self.h`` are taken from the mean of ``p_h_given_z`` and
            the remaining columns from the sampled state.
        params: mapping with key ``'x_type'`` (``'bernoulli'`` or
            ``'gaussian'``) and optional key ``'obs_transform'``
            (``'sigmoid'`` or ``'none'``).
        shared_param_dicts: None to create new shared parameters, or a
            dict holding ``'p_z_mean'``, ``'p_z_logvar'`` and
            ``'obs_logvar'`` to reuse existing ones.
    """
    # private random stream, seeded from the caller's generator
    self.rng = RandStream(rng.randint(100000))

    # user options
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # Decide whether observations are squashed through a sigmoid: this
    # is the default, an explicit 'none' switches it off, and bernoulli
    # observations force it back on.
    squash_obs = True
    if 'obs_transform' in self.params:
        requested = self.params['obs_transform']
        assert requested in ('sigmoid', 'none')
        squash_obs = (requested == 'sigmoid')
    if self.x_type == 'bernoulli':
        squash_obs = True
    if squash_obs:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.obs_transform = lambda x: x
    self.shared_param_dicts = shared_param_dicts

    # sizes of the spaces this model works with
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.h_dim = h_dim
    self.h_det_dim = h_det_dim

    # handles to the HydraNets
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h

    # symbolic inputs of the computation graph
    self.x_in = x_in
    self.x_out = x_out

    # switch that blends between guide (q) samples and prior (p) samples
    scalar_zero = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=scalar_zero,
                                      name='tsm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # create the parameters owned by this model and publish them
        prior_zeros = to_fX(np.zeros((1, self.z_dim)))
        self.p_z_mean = theano.shared(value=prior_zeros,
                                      name='tsm_p_z_mean')
        self.p_z_logvar = theano.shared(value=prior_zeros,
                                        name='tsm_p_z_logvar')
        self.obs_logvar = theano.shared(value=scalar_zero,
                                        name='tsm_obs_logvar')
        self.shared_param_dicts = {
            'p_z_mean': self.p_z_mean,
            'p_z_logvar': self.p_z_logvar,
            'obs_logvar': self.obs_logvar,
        }
    else:
        # reuse the parameters supplied by the caller
        given = self.shared_param_dicts
        self.p_z_mean = given['p_z_mean']
        self.p_z_logvar = given['p_z_logvar']
        self.obs_logvar = given['obs_logvar']
    # tanh keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

    #############################################
    # Main computation of the two-stage model.  #
    #############################################
    print("Building TSM...")
    mix_w = self.train_switch[0]

    # first latent stage: z from the guide and z from the prior
    q_mu_z, q_lv_z = self.q_z_given_x.apply(self.x_in)
    z_from_q = reparametrize(q_mu_z, q_lv_z, rng=self.rng)
    batch_rows = z_from_q.shape[0]
    p_mu_z = self.p_z_mean.repeat(batch_rows, axis=0)
    p_lv_z = self.p_z_logvar.repeat(batch_rows, axis=0)
    z_from_p = reparametrize(p_mu_z, p_lv_z, rng=self.rng)
    self.z = (mix_w * z_from_q) + ((1.0 - mix_w) * z_from_p)
    # KL(q || p) and KL(p || q) for z
    self.kld_z_q2p = gaussian_kld(q_mu_z, q_lv_z, p_mu_z, p_lv_z)
    self.kld_z_p2q = gaussian_kld(p_mu_z, p_lv_z, q_mu_z, q_lv_z)

    # second latent stage: h from the prior, then h from the guide,
    # which is conditioned on the prior mean and x_out
    p_mu_h, p_lv_h = self.p_h_given_z.apply(self.z)
    h_from_p = reparametrize(p_mu_h, p_lv_h, rng=self.rng)
    guide_input = T.concatenate([p_mu_h, self.x_out], axis=1)
    q_mu_h, q_lv_h = self.q_h_given_z_x.apply(guide_input)
    h_from_q = reparametrize(q_mu_h, q_lv_h, rng=self.rng)

    # split the state into "stochastic" and "deterministic" parts
    h_sampled = (mix_w * h_from_q) + ((1.0 - mix_w) * h_from_p)
    h_fixed = p_mu_h
    if self.h_det_dim is not None:
        # forward some deterministic state in the leading columns
        self.h = T.concatenate(
            [h_fixed[:, :self.h_det_dim], h_sampled[:, self.h_det_dim:]],
            axis=1)
    else:
        # purely stochastic state
        self.h = h_sampled
    # KL(q || p) and KL(p || q) for h
    self.kld_h_q2p = gaussian_kld(q_mu_h, q_lv_h, p_mu_h, p_lv_h)
    self.kld_h_p2q = gaussian_kld(p_mu_h, p_lv_h, q_mu_h, q_lv_h)

    # generate an observation from the latent state h
    self.x_gen, _ = self.p_x_given_h.apply(self.h)

    ##################################################
    # Shared variables steering the training process #
    ##################################################
    scalar_zero = to_fX(np.zeros((1,)))
    # learning rate and momentum terms, initialised by set_sgd_params
    self.lr = theano.shared(value=scalar_zero, name='tsm_lr')
    self.mom_1 = theano.shared(value=scalar_zero, name='tsm_mom_1')
    self.mom_2 = theano.shared(value=scalar_zero, name='tsm_mom_2')
    self.set_sgd_params()
    # weight on the NLL term
    self.lam_nll = theano.shared(value=scalar_zero, name='tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = theano.shared(value=scalar_zero,
                                     name='tsm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=scalar_zero,
                                     name='tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # weight on the parameter regularizer
    self.lam_l2w = theano.shared(value=scalar_zero, name='tsm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Optimizable parameters: obs_logvar plus every network's
    # parameters; p_z_mean / p_z_logvar are not included.
    own_params = [self.obs_logvar]
    net_params = []
    for net in (self.q_z_given_x, self.q_h_given_z_x,
                self.p_h_given_z, self.p_x_given_h):
        net_params.extend(net.mlp_params)
    self.joint_params = own_params + net_params

    ###################
    # KLD-based costs #
    ###################
    self.kld_z = ((self.lam_kld_q2p[0] * self.kld_z_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_z_p2q))
    self.kld_h = ((self.lam_kld_q2p[0] * self.kld_h_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_h_p2q))
    self.kld_costs = T.sum(self.kld_z, axis=1) + T.sum(self.kld_h, axis=1)
    # mean (rather than per-input) cost
    self.kld_cost = T.mean(self.kld_costs)

    ###################
    # NLL-based costs #
    ###################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ##############
    # Joint cost #
    ##############
    reg_term = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * reg_term
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # per-input cost
    self.obs_costs = self.nll_costs + self.kld_costs

    # gradient of the joint cost for every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # parameter updates, copied into a plain OrderedDict
    adam_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
    self.joint_updates = OrderedDict(
        (target, adam_updates[target]) for target in adam_updates)

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None,
             p_h_given_x=None,
             p_s0_given_h=None,
             p_zi_given_xi=None,
             p_sip1_given_zi=None,
             p_x_given_si=None,
             q_h_given_x=None,
             q_zi_given_xi=None,
             params=None,
             shared_param_dicts=None):
    """Build the imputation graph, the costs and the compiled functions.

    Args:
        rng: randomness source; only ``rng.randint`` is called, to seed
            this model's own ``RandStream``.
        x_in: symbolic input given to ``q_h_given_x`` and, after
            masking, to ``p_h_given_x``.
        x_mask: symbolic mask; where it is 1 the value of ``x_in`` /
            ``x_out`` is kept, where it is 0 the model's own guess (or
            a learned null value) is substituted.
        x_out: symbolic target used for gradients and NLL costs.
        p_h_given_x, p_s0_given_h, p_zi_given_xi, p_sip1_given_zi,
        p_x_given_si, q_h_given_x, q_zi_given_xi: networks used through
            their ``apply`` method (and ``mlp_params`` for some).
        params: mapping with keys ``'x_dim'``, ``'h_dim'``, ``'z_dim'``,
            ``'s_dim'``, ``'use_p_x_given_si'``, ``'imp_steps'``,
            ``'step_type'`` (``'add'`` or ``'jump'``), ``'x_type'``
            (``'bernoulli'`` or ``'gaussian'``) and optional
            ``'obs_transform'`` (``'sigmoid'`` or ``'none'``).
        shared_param_dicts: None to create new shared parameters, or a
            dict holding ``'s_null'``, ``'grad_null'`` and
            ``'obs_logvar'`` to reuse existing ones.
    """
    # private random stream, seeded from the caller's generator
    self.rng = RandStream(rng.randint(100000))

    # unpack the user options
    self.params = params
    opts = self.params
    self.x_dim = opts['x_dim']
    self.h_dim = opts['h_dim']
    self.z_dim = opts['z_dim']
    self.s_dim = opts['s_dim']
    self.use_p_x_given_si = opts['use_p_x_given_si']
    self.imp_steps = opts['imp_steps']
    self.step_type = opts['step_type']
    self.x_type = opts['x_type']
    if not self.use_p_x_given_si:
        print("Constructing hypotheses directly in x-space...")
        # without a decoder the belief state must live in x-space
        assert self.s_dim == self.x_dim
    else:
        print("Constructing hypotheses via p_x_given_si...")
    assert self.x_type in ('bernoulli', 'gaussian')

    # Decide whether observations are squashed through a sigmoid: this
    # is the default, an explicit 'none' switches it off, and bernoulli
    # observations force it back on.
    squash_obs = True
    if 'obs_transform' in self.params:
        requested = self.params['obs_transform']
        assert requested in ('sigmoid', 'none')
        squash_obs = (requested == 'sigmoid')
    if self.x_type == 'bernoulli':
        squash_obs = True
    if squash_obs:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.obs_transform = lambda x: x
    self.shared_param_dicts = shared_param_dicts
    assert self.step_type in ('add', 'jump')

    # handles to the InfNets
    self.p_h_given_x = p_h_given_x
    self.p_s0_given_h = p_s0_given_h
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.p_x_given_si = p_x_given_si
    self.q_h_given_x = q_h_given_x
    self.q_zi_given_xi = q_zi_given_xi

    # symbolic inputs of the computation graph
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    # noise fed to the scan loop as its sequence
    self.zi_zmuv = T.tensor3()

    # switch that blends between guide (q) samples and primary (p) samples
    scalar_zero = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=scalar_zero,
                                      name='gpsi_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # create the parameters owned by this model and publish them
        x_zeros = to_fX(np.zeros((self.x_dim,)))
        self.s_null = theano.shared(value=x_zeros, name='gpis_sn')
        self.grad_null = theano.shared(value=x_zeros, name='gpsi_gn')
        self.obs_logvar = theano.shared(value=scalar_zero,
                                        name='gpsi_obs_logvar')
        self.shared_param_dicts = {
            's_null': self.s_null,
            'grad_null': self.grad_null,
            'obs_logvar': self.obs_logvar,
        }
    else:
        # reuse the parameters supplied by the caller
        given = self.shared_param_dicts
        self.s_null = given['s_null']
        self.grad_null = given['grad_null']
        self.obs_logvar = given['obs_logvar']
    # tanh keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
    self.x_null = self._from_si_to_x(self.s_null)

    ##########################
    # Initialization step    #
    ##########################
    # masked-out entries of the input are replaced by x_null
    self.x_init = ((self.x_mask * self.x_in) +
                   ((1.0 - self.x_mask) * self.x_null))
    # primary sees the masked input, guide sees the full input
    p_mu_h, p_lv_h, h_from_p = self.p_h_given_x.apply(
        self.x_init, do_samples=True)
    q_mu_h, q_lv_h, h_from_q = self.q_h_given_x.apply(
        self.x_in, do_samples=True)
    # h can be switched between the guide and the primary sample
    self.h = ((self.train_switch[0] * h_from_q) +
              ((1.0 - self.train_switch[0]) * h_from_p))
    # initial belief state s0 emitted from h
    s0_heads = self.p_s0_given_h.apply(self.h)
    self.s0 = s0_heads[0]
    # NLL of the initial state
    self.nll0 = self._construct_nll_costs(self.s0, self.x_out, self.x_mask)
    # KLds of the initialization step
    self.kldh_q2p = gaussian_kld(q_mu_h, q_lv_h, p_mu_h, p_lv_h)  # KL(q || p)
    self.kldh_p2q = gaussian_kld(p_mu_h, p_lv_h, q_mu_h, q_lv_h)  # KL(p || q)
    self.kldh_p2g = gaussian_kld(p_mu_h, p_lv_h, 0.0, 0.0)  # KL(p || global prior)

    ###########################################
    # Iterative imputation loop (theano.scan) #
    ###########################################
    self.ones_mask = T.ones_like(self.x_mask)

    def imp_step_func(noise_t, s_prev):
        """One imputation step: sample zi and update the belief state."""
        guess_x = self._from_si_to_x(s_prev)
        target_x = self.x_out
        x_visible = ((self.x_mask * target_x) +
                     ((1.0 - self.x_mask) * guess_x))
        full_grad = self.x_out - guess_x
        visible_grad = ((self.x_mask * full_grad) +
                        ((1.0 - self.x_mask) * self.grad_null))
        # primary policy: sees only the masked gradient
        p_mu, p_lv = self.p_zi_given_xi.apply(
            T.horizontal_stack(x_visible, visible_grad),
            do_samples=False)
        z_from_p = p_mu + (T.exp(0.5 * p_lv) * noise_t)
        # guide policy: sees the unmasked gradient
        q_mu, q_lv = self.q_zi_given_xi.apply(
            T.horizontal_stack(x_visible, full_grad),
            do_samples=False)
        z_from_q = q_mu + (T.exp(0.5 * q_lv) * noise_t)
        # switchable mixture of the two samples
        z_step = ((self.train_switch[0] * z_from_q) +
                  ((1.0 - self.train_switch[0]) * z_from_p))
        # per-step KLds
        kl_q2p = gaussian_kld(q_mu, q_lv, p_mu, p_lv)  # KL(q || p)
        kl_p2q = gaussian_kld(p_mu, p_lv, q_mu, q_lv)  # KL(p || q)
        kl_p2g = gaussian_kld(p_mu, p_lv, 0.0, 0.0)  # KL(p || global prior)
        # next belief state from the sampled z
        heads = self.p_sip1_given_zi.apply(z_step)
        proposal = heads[0]
        if self.step_type != 'jump':
            # additive steps gate the update like an LSTM
            write_gate = T.nnet.sigmoid(3.0 + heads[1])
            erase_gate = T.nnet.sigmoid(3.0 + heads[2])
            s_next = (erase_gate * s_prev) + (write_gate * proposal)
        else:
            # jump steps completely overwrite the current guess
            s_next = proposal
        # NLL of the current imputation
        nll_t = self._construct_nll_costs(s_next, self.x_out, self.x_mask)
        return s_next, nll_t, kl_q2p, kl_p2q, kl_p2g

    # only the belief state is recurrent; the other outputs are collected
    scan_init = [self.s0, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(
        imp_step_func, outputs_info=scan_init, sequences=self.zi_zmuv)
    (self.si, self.nlli, self.kldi_q2p,
     self.kldi_p2q, self.kldi_p2g) = self.scan_results

    ##################################################
    # Shared variables steering the training process #
    ##################################################
    scalar_zero = to_fX(np.zeros((1,)))
    # learning rate and momentum terms, initialised by set_sgd_params
    self.lr = theano.shared(value=scalar_zero, name='gpsi_lr')
    self.mom_1 = theano.shared(value=scalar_zero, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=scalar_zero, name='gpsi_mom_2')
    self.set_sgd_params()
    # weight on the NLL term
    self.lam_nll = theano.shared(value=scalar_zero, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the four KLd terms
    self.lam_kld_p = theano.shared(value=scalar_zero, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=scalar_zero, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=scalar_zero, name='gpsi_lam_kld_g')
    self.lam_kld_s = theano.shared(value=scalar_zero, name='gpsi_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0,
                     lam_kld_g=0.0, lam_kld_s=0.0)
    # weight on the parameter regularizer
    self.lam_l2w = theano.shared(value=scalar_zero, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Optimizable parameters.
    # NOTE(review): the parameters of p_h_given_x, p_s0_given_h and
    # q_h_given_x are not added here, so joint_updates never changes
    # them -- confirm that this is intended.
    self.joint_params = [self.s_null, self.grad_null, self.obs_logvar]
    for net in (self.p_zi_given_xi, self.p_sip1_given_zi,
                self.p_x_given_si, self.q_zi_given_xi):
        self.joint_params.extend(net.mlp_params)

    ###################
    # KLD-based costs #
    ###################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = \
        self._construct_kld_costs(p=1.0)
    self.kld_costs = ((self.lam_kld_p[0] * self.kld_p) +
                      (self.lam_kld_q[0] * self.kld_q) +
                      (self.lam_kld_g[0] * self.kld_g) +
                      (self.lam_kld_s[0] * self.kld_s))
    self.kld_cost = T.mean(self.kld_costs)

    ###################
    # NLL-based costs #
    ###################
    # only the NLL of the final scan step enters the cost
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)

    ##############
    # Joint cost #
    ##############
    reg_term = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * reg_term
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # per-trial cost
    self.obs_costs = self.nll_costs + self.kld_costs

    # gradient of the joint cost for every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # parameter updates, extended with the updates returned by scan
    self.joint_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    for target in self.scan_updates:
        self.joint_updates[target] = self.scan_updates[target]

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_h_given_z=None,
             p_x_given_h=None,
             q_z_given_x=None,
             q_h_given_z_x=None,
             x_dim=None,
             z_dim=None,
             h_dim=None,
             params=None,
             shared_param_dicts=None):
    """Build the symbolic graph, the costs and the compiled functions.

    Args:
        rng: randomness source; only ``rng.randint`` is called, to seed
            this model's own ``RandStream``.
        x_in: symbolic variable passed to ``q_z_given_x``.
        x_out: symbolic variable stacked with the mean and log-variance
            produced by ``p_h_given_z`` as input to ``q_h_given_z_x``,
            and handed to ``_construct_nll_costs``.
        p_h_given_z, p_x_given_h, q_z_given_x, q_h_given_z_x: networks
            used through their ``apply`` method and ``mlp_params``.
        x_dim, z_dim, h_dim: sizes recorded on the instance; ``z_dim``
            sets the width of the freshly created prior parameters.
        params: mapping with key ``'x_type'`` (``'bernoulli'`` or
            ``'gaussian'``) and optional key ``'obs_transform'``
            (``'sigmoid'`` or ``'none'``).
        shared_param_dicts: None to create new shared parameters, or a
            dict holding ``'p_z_mean'``, ``'p_z_logvar'`` and
            ``'obs_logvar'`` to reuse existing ones.
    """
    # private random stream, seeded from the caller's generator
    self.rng = RandStream(rng.randint(100000))

    # user options
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # Decide whether observations are squashed through a sigmoid: this
    # is the default, an explicit 'none' switches it off, and bernoulli
    # observations force it back on.
    squash_obs = True
    if 'obs_transform' in self.params:
        requested = self.params['obs_transform']
        assert requested in ('sigmoid', 'none')
        squash_obs = (requested == 'sigmoid')
    if self.x_type == 'bernoulli':
        squash_obs = True
    if squash_obs:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.obs_transform = lambda x: x
    self.shared_param_dicts = shared_param_dicts

    # sizes of the spaces this model works with
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.h_dim = h_dim

    # handles to the InfNets
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h

    # symbolic inputs of the computation graph
    self.x_in = x_in
    self.x_out = x_out

    # switch that blends between guide (q) samples and prior (p) samples
    scalar_zero = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=scalar_zero,
                                      name='tsm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # create the parameters owned by this model and publish them
        prior_zeros = to_fX(np.zeros((1, self.z_dim)))
        self.p_z_mean = theano.shared(value=prior_zeros,
                                      name='tsm_p_z_mean')
        self.p_z_logvar = theano.shared(value=prior_zeros,
                                        name='tsm_p_z_logvar')
        self.obs_logvar = theano.shared(value=scalar_zero,
                                        name='tsm_obs_logvar')
        self.shared_param_dicts = {
            'p_z_mean': self.p_z_mean,
            'p_z_logvar': self.p_z_logvar,
            'obs_logvar': self.obs_logvar,
        }
    else:
        # reuse the parameters supplied by the caller
        given = self.shared_param_dicts
        self.p_z_mean = given['p_z_mean']
        self.p_z_logvar = given['p_z_logvar']
        self.obs_logvar = given['obs_logvar']
    # tanh keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

    #############################################
    # Main computation of the two-stage model.  #
    #############################################
    print("Building TSM...")
    mix_w = self.train_switch[0]

    # first latent stage: the guide net returns its own sample, the
    # prior sample is built from zero-mean unit-variance noise
    q_mu_z, q_lv_z, z_from_q = self.q_z_given_x.apply(
        self.x_in, do_samples=True)
    batch_rows = z_from_q.shape[0]
    p_mu_z = self.p_z_mean.repeat(batch_rows, axis=0)
    p_lv_z = self.p_z_logvar.repeat(batch_rows, axis=0)
    unit_noise = self.rng.normal(size=z_from_q.shape, avg=0.0, std=1.0,
                                 dtype=theano.config.floatX)
    z_from_p = (T.exp(0.5 * p_lv_z) * unit_noise) + p_mu_z
    self.z = (mix_w * z_from_q) + ((1.0 - mix_w) * z_from_p)
    # KL(q || p) and KL(p || q) for z
    self.kld_z_q2p = gaussian_kld(q_mu_z, q_lv_z, p_mu_z, p_lv_z)
    self.kld_z_p2q = gaussian_kld(p_mu_z, p_lv_z, q_mu_z, q_lv_z)

    # second latent stage: the guide is conditioned on the prior's
    # mean and log-variance together with x_out
    p_mu_h, p_lv_h, h_from_p = self.p_h_given_z.apply(self.z)
    guide_input = T.horizontal_stack(p_mu_h, p_lv_h, self.x_out)
    q_mu_h, q_lv_h, h_from_q = self.q_h_given_z_x.apply(guide_input)
    self.h = (mix_w * h_from_q) + ((1.0 - mix_w) * h_from_p)
    # KL(q || p) and KL(p || q) for h
    self.kld_h_q2p = gaussian_kld(q_mu_h, q_lv_h, p_mu_h, p_lv_h)
    self.kld_h_p2q = gaussian_kld(p_mu_h, p_lv_h, q_mu_h, q_lv_h)

    # generate an observation from the latent state h
    self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

    ##################################################
    # Shared variables steering the training process #
    ##################################################
    scalar_zero = to_fX(np.zeros((1,)))
    # learning rate and momentum terms, initialised by set_sgd_params
    self.lr = theano.shared(value=scalar_zero, name='tsm_lr')
    self.mom_1 = theano.shared(value=scalar_zero, name='tsm_mom_1')
    self.mom_2 = theano.shared(value=scalar_zero, name='tsm_mom_2')
    self.set_sgd_params()
    # weight on the NLL term
    self.lam_nll = theano.shared(value=scalar_zero, name='tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = theano.shared(value=scalar_zero,
                                     name='tsm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=scalar_zero,
                                     name='tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # weight on the parameter regularizer
    self.lam_l2w = theano.shared(value=scalar_zero, name='tsm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Optimizable parameters: obs_logvar plus every network's
    # parameters; p_z_mean / p_z_logvar are not included.
    own_params = [self.obs_logvar]
    net_params = []
    for net in (self.q_z_given_x, self.q_h_given_z_x,
                self.p_h_given_z, self.p_x_given_h):
        net_params.extend(net.mlp_params)
    self.joint_params = own_params + net_params

    ###################
    # KLD-based costs #
    ###################
    self.kld_z = ((self.lam_kld_q2p[0] * self.kld_z_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_z_p2q))
    self.kld_h = ((self.lam_kld_q2p[0] * self.kld_h_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_h_p2q))
    self.kld_costs = T.sum(self.kld_z, axis=1) + T.sum(self.kld_h, axis=1)
    # mean (rather than per-input) cost
    self.kld_cost = T.mean(self.kld_costs)

    ###################
    # NLL-based costs #
    ###################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ##############
    # Joint cost #
    ##############
    reg_term = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * reg_term
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # per-input cost
    self.obs_costs = self.nll_costs + self.kld_costs

    # gradient of the joint cost for every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # parameter updates, copied into a plain OrderedDict
    adam_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
    self.joint_updates = OrderedDict(
        (target, adam_updates[target]) for target in adam_updates)

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
def __init__(self, rng=None,
             x_in=None, y_in=None,
             q_z_given_x=None,
             class_count=None,
             z_dim=None,
             use_samples=None):
    """Build the classifier graph, the costs and the compiled functions.

    Args:
        rng: randomness source; only ``rng.randint`` is called, to seed
            this model's own ``RandStream``.
        x_in: symbolic input passed to ``q_z_given_x``.
        y_in: symbolic labels handed to ``_construct_nll_costs``.
        q_z_given_x: network used through ``apply``, ``apply_shared``,
            ``shared_layers`` and ``mlp_params``.
        class_count: number of columns of the classification layer.
        z_dim: length of the prior mean / log-variance vectors.
        use_samples: stored on the instance; it is not consulted while
            the graph is built here.
    """
    # private random stream, seeded from the caller's generator
    self.rng = RandStream(rng.randint(100000))

    # sizes; the classifier reads the output of the last shared layer
    self.class_count = class_count
    self.z_dim = z_dim
    self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
    self.use_samples = use_samples

    # handle to the InfNet
    self.q_z_given_x = q_z_given_x

    # symbolic inputs of the computation graph
    self.x_in = x_in
    self.y_in = y_in

    # shared variable controlling the dropout noise
    scalar_zero = to_fX(np.zeros((1,)))
    self.drop_rate = theano.shared(value=scalar_zero, name='cm_drop_rate')
    self.set_drop_rate(0.0)

    # classification layer: small random weights, zero biases
    class_weights = to_fX(
        0.01 * npr.randn(self.shared_dim, self.class_count))
    class_biases = to_fX(np.zeros((self.class_count,)))
    self.W_class = theano.shared(value=class_weights, name='cm_W_class')
    self.b_class = theano.shared(value=class_biases, name='cm_b_class')

    # prior parameters owned by this model
    prior_zeros = to_fX(np.zeros((self.z_dim,)))
    self.p_z_mean = theano.shared(value=prior_zeros, name='cm_p_z_mean')
    self.p_z_logvar = theano.shared(value=prior_zeros,
                                    name='cm_p_z_logvar')

    #################
    # Setup self.z. #
    #################
    # The sample returned by apply() is discarded: q_z_samples is set
    # to the output of apply_shared() instead.
    self.q_z_mean, self.q_z_logvar, _ = \
        self.q_z_given_x.apply(self.x_in, do_samples=True)
    self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)

    # dropout mask: entries survive when their uniform draw exceeds
    # the drop rate, and survivors are rescaled by 1 / (1 - rate)
    rate = self.drop_rate[0]
    keep_scale = 1. / (1. - rate)
    uniform_draw = self.rng.uniform(size=self.q_z_samples.shape,
                                    low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
    keep_mask = keep_scale * (uniform_draw > rate)
    self.z = self.q_z_samples * keep_mask

    # class predictions
    self.y_out = T.dot(self.z, self.W_class) + self.b_class

    # KLds used for training via the variational free-energy
    self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                   self.p_z_mean, self.p_z_logvar)
    self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                   self.q_z_mean, self.q_z_logvar)

    ##################################################
    # Shared variables steering the training process #
    ##################################################
    scalar_zero = to_fX(np.zeros((1,)))
    # learning rates and momentum terms, initialised by set_sgd_params
    self.lr_1 = theano.shared(value=scalar_zero, name='cm_lr_1')
    self.lr_2 = theano.shared(value=scalar_zero, name='cm_lr_2')
    self.mom_1 = theano.shared(value=scalar_zero, name='cm_mom_1')
    self.mom_2 = theano.shared(value=scalar_zero, name='cm_mom_2')
    self.set_sgd_params()
    # weight on the NLL term
    self.lam_nll = theano.shared(value=scalar_zero, name='cm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL directions
    self.lam_kld_q2p = theano.shared(value=scalar_zero,
                                     name='cm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=scalar_zero,
                                     name='cm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
    # weight on the parameter regularizer
    self.lam_l2w = theano.shared(value=scalar_zero, name='cm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # optimizable parameters: prior, classifier layer, then the network
    own_params = [self.p_z_mean, self.p_z_logvar,
                  self.W_class, self.b_class]
    self.joint_params = own_params + list(self.q_z_given_x.mlp_params)

    ###################
    # NLL-based costs #
    ###################
    # here the NLL weight is applied to the per-input costs directly
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
    self.nll_cost = T.mean(self.nll_costs)

    ###################
    # KLD-based costs #
    ###################
    self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
    self.kld_costs = ((self.lam_kld_q2p[0] * self.kld_z_q2p) +
                      (self.lam_kld_p2q[0] * self.kld_z_p2q))
    self.kld_cost = T.mean(self.kld_costs)

    ##############
    # Joint cost #
    ##############
    reg_term = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * reg_term
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # per-input cost
    self.obs_costs = self.nll_costs + self.kld_costs

    # gradient of the joint cost for every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # parameter updates (lr_1 is the step size used)
    self.joint_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr_1, beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling class error estimator...")
    self.class_error = self._construct_class_error()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()

    # easy access point to the first shared layer's weights
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
def __init__(self, rng=None,
        x_in=None, x_out=None,
        p_s_given_z=None,
        p_h_given_s=None,
        p_x_given_s_h=None,
        q_z_given_x=None,
        q_h_given_x_s=None,
        x_dim=None,
        z_dim=None,
        s_dim=None,
        h_dim=None,
        params=None,
        shared_param_dicts=None):
    """Build the symbolic graph, costs, updates and compiled functions.

    The constructor wires five networks into a two-stage latent model
    (x_in -> z -> s -> h -> x_gen), builds KL / NLL / regularization
    costs from it, derives gradients and Adam-style updates via
    get_adam_updates, and finally compiles the training and sampling
    functions through the class's _construct_* helpers.

    Parameters:
        rng: required; object providing randint() (presumably a numpy
            RandomState -- confirm with callers). Used only to seed the
            symbolic random stream stored in self.rng.
        x_in: symbolic input; it is multiplied by the dropout mask and
            fed to q_z_given_x.
        x_out: symbolic target; it is stacked with s as input to
            q_h_given_x_s, passed to _construct_nll_costs, and its shape
            sizes the dropout mask.
        p_s_given_z, p_h_given_s, p_x_given_s_h: generative networks;
            each must expose apply(...) and mlp_params. p_x_given_s_h
            must also expose mu_layers.
        q_z_given_x, q_h_given_x_s: inference networks; each must expose
            apply(...) and mlp_params.
        x_dim, z_dim, s_dim, h_dim: sizes recorded on the instance.
            Within this constructor only z_dim is used (length of the
            prior mean / log-variance vectors when no shared parameters
            are supplied).
        params: required dict. Must contain 'x_type' ('bernoulli' or
            'gaussian'); may contain 'obs_transform' ('sigmoid' or
            'none').
        shared_param_dicts: None to create fresh shared variables, or a
            dict with keys 'p_z_mean', 'p_z_logvar' and 'obs_logvar'
            whose values are reused as this model's parameters.

    Raises:
        TypeError: if rng or params is None.
        KeyError: if params lacks 'x_type'.
        AssertionError: if 'x_type' or 'obs_transform' holds an
            unsupported value (only while assertions are enabled).
    """
    # Both arguments default to None in the signature but are dereferenced
    # immediately below, so fail early with a descriptive message.
    if rng is None:
        raise TypeError("rng is required: pass an object providing "
                        "randint() (e.g. a numpy RandomState)")
    if params is None:
        raise TypeError("params is required: pass a dict containing "
                        "'x_type'")
    # setup a symbolic random stream for this model
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        # no explicit choice: default to a sigmoid transform
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    if self.x_type == 'bernoulli':
        # bernoulli observations always use the sigmoid transform,
        # overriding any 'obs_transform' setting
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    self.shared_param_dicts = shared_param_dicts
    # record the dimensions of various spaces relevant to this model
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.s_dim = s_dim
    self.h_dim = h_dim
    # grab handles to the relevant InfNets
    self.q_z_given_x = q_z_given_x
    self.q_h_given_x_s = q_h_given_x_s
    self.p_s_given_z = p_s_given_z
    self.p_h_given_s = p_h_given_s
    self.p_x_given_s_h = p_x_given_s_h
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this model
    self.x_in = x_in
    self.x_out = x_out
    self.batch_reps = T.lscalar()
    # setup switching variable for changing between sampling/training
    # (1.0 selects the q samples of h, 0.0 selects the p samples; see the
    # construction of self.h below)
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
    self.set_train_switch(1.0)
    # setup a variable for controlling dropout noise
    self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
    self.set_drop_rate(0.0)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
    self.set_lam_kld_l1l2(1.0)

    if self.shared_param_dicts is None:
        # initialize "optimizable" parameters specific to this model
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
        # obs_logvar is a single shared value (initialised from zero_ary)
        self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
        # tanh squashing keeps the effective log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
        # expose the fresh parameters so they can be handed to another
        # instance through its shared_param_dicts argument
        self.shared_param_dicts = {}
        self.shared_param_dicts['p_z_mean'] = self.p_z_mean
        self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # reuse the parameters supplied by the caller
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

    # get a drop mask that drops things with probability drop_rate and
    # rescales the kept entries by 1 / (1 - drop_rate)
    # NOTE(review): the mask is sized from x_out but applied to x_in below;
    # this presumably relies on both having the same shape -- confirm.
    drop_scale = 1. / (1. - self.drop_rate[0])
    drop_rnd = self.rng.uniform(size=self.x_out.shape,
            low=0.0, high=1.0, dtype=theano.config.floatX)
    drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

    #############################################
    # Setup the two-stage model's main computation. #
    #############################################
    print("Building TSM...")
    # samples of "first" latent state
    drop_x = drop_mask * self.x_in
    z_q_mean, z_q_logvar, self.z = \
            self.q_z_given_x.apply(drop_x, do_samples=True)
    # compute relevant KLds for this step: KL(q || p) and KL(p || q)
    self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar,
                                   self.p_z_mean, self.p_z_logvar)
    self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                   z_q_mean, z_q_logvar)
    # transform "first" latent state into "second" latent state
    self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)
    # get samples of h, conditioned on current s
    h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply(
            self.s, do_samples=True)
    # get variational samples of h, given s and x_out
    h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply(
            T.horizontal_stack(self.x_out, self.s),
            do_samples=True)
    # make h samples that can be switched between h_p and h_q
    self.h = ((self.train_switch[0] * h_q) +
              ((1.0 - self.train_switch[0]) * h_p))
    # compute relevant KLds for this step: KL(q || p) and KL(p || q)
    self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar,
                                   h_p_mean, h_p_logvar)
    self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar,
                                   h_q_mean, h_q_logvar)
    # p_x_given_s_h is conditioned on s and h.
    self.x_gen, _ = self.p_x_given_s_h.apply(
            T.horizontal_stack(self.s, self.h),
            do_samples=False)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
    self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1"
    # (the two inference networks)
    self.group_1_params = []
    self.group_1_params.extend(self.q_z_given_x.mlp_params)
    self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
    # Grab all of the "optimizable" parameters in "group 2"
    # (the prior over z plus the three generative networks)
    self.group_2_params = [self.p_z_mean, self.p_z_logvar]
    self.group_2_params.extend(self.p_s_given_z.mlp_params)
    self.group_2_params.extend(self.p_h_given_s.mlp_params)
    self.group_2_params.extend(self.p_x_given_s_h.mlp_params)
    # Make a joint list of parameters group 1/2
    self.joint_params = self.group_1_params + self.group_2_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
            self._construct_kld_costs(p=1.0)
    self.kld_z = ((self.lam_kld_q2p[0] * self.kld_z_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_z_p2q))
    self.kld_h = ((self.lam_kld_q2p[0] * self.kld_h_q2p) +
                  (self.lam_kld_p2q[0] * self.kld_h_p2q))
    # lam_kld_z rescales only the z term; the h term enters unscaled
    self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
    # now do l2 KLd costs
    self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
            self._construct_kld_costs(p=2.0)
    self.kl2_z = ((self.lam_kld_q2p[0] * self.kl2_z_q2p) +
                  (self.lam_kld_p2q[0] * self.kl2_z_p2q))
    self.kl2_h = ((self.lam_kld_q2p[0] * self.kl2_h_q2p) +
                  (self.lam_kld_p2q[0] * self.kl2_h_p2q))
    self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
    # compute joint l1/l2 KLd cost (lam_kld_l1l2 = 1.0 means pure l1)
    self.kld_l1l2_costs = ((self.lam_kld_l1l2[0] * self.kld_costs) +
                           ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs))
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    self.kl2_cost = T.mean(self.kl2_costs)
    self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    # lam_nll weights the mean cost only; nll_costs itself stays unweighted
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_l1l2_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks;
    # the two groups differ only in learning rate (lr_1 vs. lr_2)
    self.group_1_updates = get_adam_updates(params=self.group_1_params,
            grads=self.joint_grads, alpha=self.lr_1,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.group_2_updates = get_adam_updates(params=self.group_2_params,
            grads=self.joint_grads, alpha=self.lr_2,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # merge both update sets; on a shared key the group 2 entry wins
    self.joint_updates = OrderedDict()
    for k in self.group_1_updates:
        self.joint_updates[k] = self.group_1_updates[k]
    for k in self.group_2_updates:
        self.joint_updates[k] = self.group_2_updates[k]

    # Construct a function for jointly training the generator/inferencer
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    print("Compiling data-guided model sampler...")
    self.sample_from_input = self._construct_sample_from_input()
    # make easy access points for some interesting parameters
    self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W
    return