def compare_speed():
    """Profile MRG against CURAND sampling of uniform and normal variates.

    Four Theano functions are compiled, each overwriting one shared buffer
    with fresh samples, and each is given its own ``profile`` label.  Their
    graphs are printed, then every function is called 100 times.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    n_values = 1000 * 100
    dest = theano.shared(numpy.zeros(n_values, dtype=theano.config.floatX))

    # (sampler, profile label) pairs, compiled in this exact order.
    samplers = (
        (mrg.uniform, 'mrg uniform'),
        (crn.uniform, 'crn uniform'),
        (mrg.normal, 'mrg normal'),
        (crn.normal, 'crn normal'),
    )
    compiled = []
    for draw, label in samplers:
        compiled.append(
            theano.function([], [],
                            updates={dest: draw((n_values,))},
                            profile=label))

    # Dump the compiled graph of every function before running anything.
    for fn in compiled:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for call_idx in range(100):
        # don't time the first call, it has some startup cost
        timing_enabled = (call_idx > 0)
        for fn in compiled:
            fn.fn.time_thunks = timing_enabled
            fn()
def compare_speed():
    """Profile MRG against CURAND sampling of uniform and normal variates.

    Four Theano functions are compiled, each overwriting one shared buffer
    with fresh samples, and each is given its own ``profile`` label.  Their
    graphs are printed, then every function is called 100 times.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    n_values = 1000 * 100
    dest = theano.shared(numpy.zeros(n_values, dtype=theano.config.floatX))

    # (sampler, profile label) pairs, compiled in this exact order.
    samplers = (
        (mrg.uniform, 'mrg uniform'),
        (crn.uniform, 'crn uniform'),
        (mrg.normal, 'mrg normal'),
        (crn.normal, 'crn normal'),
    )
    compiled = []
    for draw, label in samplers:
        compiled.append(
            theano.function([], [],
                            updates={dest: draw((n_values,))},
                            profile=label))

    # Dump the compiled graph of every function before running anything.
    for fn in compiled:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for call_idx in range(100):
        # don't time the first call, it has some startup cost
        timing_enabled = (call_idx > 0)
        for fn in compiled:
            fn.fn.time_thunks = timing_enabled
            fn()
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    streams = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # the whole shape is a TensorConstant holding (10, 10)
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # only one dimension is symbolic, the other is a plain int
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    first_draw = streams.uniform(shape)
    second_draw = streams.uniform(shape)
    sample_first = theano.function([], first_draw, mode=mode_with_gpu)
    sample_second = theano.function([], second_draw, mode=mode_with_gpu)

    first_vals = [sample_first() for _ in range(3)]
    second_vals = [sample_second() for _ in range(3)]

    # successive calls, and the two separate streams, must differ everywhere
    assert numpy.all(first_vals[0] != first_vals[1])
    assert numpy.all(second_vals[0] != second_vals[1])
    assert numpy.all(first_vals[0] != second_vals[0])

    # shape, range and rough-mean checks on the first stream's samples
    for sample in first_vals:
        assert sample.shape == (10, 10)
        assert sample.min() >= 0
        assert sample.max() <= 1
        assert sample.min() < sample.max()
        assert .25 <= sample.mean() <= .75
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    streams = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # the whole shape is a TensorConstant holding (10, 10)
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # only one dimension is symbolic, the other is a plain int
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    first_draw = streams.uniform(shape)
    second_draw = streams.uniform(shape)
    sample_first = theano.function([], first_draw, mode=mode_with_gpu)
    sample_second = theano.function([], second_draw, mode=mode_with_gpu)

    first_vals = [sample_first() for _ in range(3)]
    second_vals = [sample_second() for _ in range(3)]

    # successive calls, and the two separate streams, must differ everywhere
    assert numpy.all(first_vals[0] != first_vals[1])
    assert numpy.all(second_vals[0] != second_vals[1])
    assert numpy.all(first_vals[0] != second_vals[0])

    # shape, range and rough-mean checks on the first stream's samples
    for sample in first_vals:
        assert sample.shape == (10, 10)
        assert sample.min() >= 0
        assert sample.max() <= 1
        assert sample.min() < sample.max()
        assert .25 <= sample.mean() <= .75
class Training(Layer):
    """Embedding layer scored by a dot product and trained with a margin loss.

    The layer owns one shared weight matrix ``W`` (named ``'Hashtag_emb'``).
    ``output_func`` must be called first: it builds ``self.f``, the score
    tensor that every cost method below indexes.
    """

    def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
                 batch_size=1000):
        """Create the shared weights and the sampling helper.

        Parameters
        ----------
        rng : object exposing ``uniform(low, high, size)``
            Only used when ``W`` is None, to draw the initial weights.
        W : numpy array or None
            Initial weights.  When None, ``shape`` must be given.
        m : float
            Margin used by the ranking costs.
        n_samples : int
            Number of rows of the random mask that ``self.rfun`` argsorts.
        shape : pair of ints or None
            Shape of ``W``; only read when ``W`` is None.
        batch_size : int
            Stored on the instance; not otherwise used in this class.
        """
        if W is None:
            # uniform in +/- sqrt(6 / (shape[0] + shape[1]))
            W = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (shape[0] + shape[1])),
                high=numpy.sqrt(6. / (shape[0] + shape[1])),
                size=(shape[0], shape[1])), dtype=theano.config.floatX)
        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.batch_size = batch_size
        self.n_ht = W.shape[0]
        self.m = m
        self.n_samples = n_samples
        self.csrng = CURAND_RandomStreams(123)
        mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0,
                                  high=1.0, dtype=theano.config.floatX)
        # rfun() returns the argsort of a fresh uniform column on every call
        self.rfun = theano.function([], mask.argsort(axis=0))
        # alpha[k] = 1 / (k + 1) for k in [0, n_ht)
        self.alpha = T.constant(
            1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))
        self.weights = [self.W]
        self.biases = []

    def __repr__(self):
        return "{}: W_shape: {}, m={}, n_samples={}, n_ht={}".format(
            self.__class__.__name__, self.W.shape.eval(), self.m,
            self.n_samples, self.n_ht)

    def output_func(self, input):
        """Build the score tensor ``self.f`` and return its argmax."""
        self.f = T.tensordot(input.dimshuffle(0, 'x', 1),
                             self.W.dimshuffle('x', 0, 1),
                             axes=[[1, 2], [0, 2]])  # cosine sim
        # NOTE(review): the cost methods index self.f as f[row, label], which
        # suggests the label axis is 1, yet the argmax is taken over axis 0.
        # Left unchanged -- confirm the intended axis.
        self.y_pred = T.argmax(self.f, axis=0)
        return self.y_pred

    def get_tag_neg(self, f, f_y):
        """Return ``sum(alpha[:rnk]) / rnk``, or 0 when ``rnk`` is 0.

        ``rnk`` is the number of entries of ``f`` greater than
        ``f_y - self.m``, minus one.
        """
        cand = f[(f > f_y - self.m).nonzero()]
        rnk = cand.shape[0] - 1  # due to i != y
        # This Python-level guard only works when ``rnk`` is a concrete
        # number; it is kept so that case still returns a plain 0.
        if rnk == 0:
            return 0
        l = T.sum(self.alpha[T.arange(rnk)])
        # For a symbolic ``rnk`` the ``==`` above is not a graph operation
        # and never fires, so the zero case is handled symbolically here.
        # The denominator is patched too, so the unselected branch of the
        # switch never divides by zero.
        is_zero = T.eq(rnk, 0)
        safe_rnk = T.switch(is_zero, 1, rnk)
        return T.switch(is_zero, 0, l / safe_rnk)

    def _warp_loss_cost(self, y, i):
        """Hinge cost ``max(0, m - f[y] + f[i])`` for given negatives ``i``."""
        f_y = self.f[T.arange(y.shape[0]), y]
        s = self.m - f_y + self.f[T.arange(i.shape[0]), i]
        return T.maximum(0.0, s)

    def warp_loss_cost(self, y, idx):
        """Hinge cost against a negative chosen from the scores themselves.

        NOTE(review): the ``idx`` argument is overwritten before it is read,
        so the caller's value is ignored.
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        f_yy = T.repeat(f_y.dimshuffle(0, 'x'), self.f.shape[1], axis=1)
        f_idx = T.maximum(0.0, f_yy - self.f + self.m)
        # column index of the smallest clipped value in each row
        idx = f_idx.argsort(axis=1)[:, 0]
        s = self.m - f_y + self.f[T.arange(idx.shape[0]), idx]
        return T.maximum(0.0, s)

    def training_cost(self, y, i):
        """Mean of ``warp_loss_cost`` over its output."""
        return T.mean(self.warp_loss_cost(y, i))
class Training(Layer):
    """Embedding layer scored by a dot product and trained with a margin loss.

    The layer owns one shared weight matrix ``W`` (named ``'Hashtag_emb'``).
    ``output_func`` must be called first: it builds ``self.f``, the score
    tensor that every cost method below indexes.
    """

    def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
                 batch_size=1000):
        """Create the shared weights and the sampling helper.

        Parameters
        ----------
        rng : object exposing ``uniform(low, high, size)``
            Only used when ``W`` is None, to draw the initial weights.
        W : numpy array or None
            Initial weights.  When None, ``shape`` must be given.
        m : float
            Margin used by the ranking costs.
        n_samples : int
            Number of rows of the random mask that ``self.rfun`` argsorts.
        shape : pair of ints or None
            Shape of ``W``; only read when ``W`` is None.
        batch_size : int
            Stored on the instance; not otherwise used in this class.
        """
        if W is None:
            # uniform in +/- sqrt(6 / (shape[0] + shape[1]))
            W = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (shape[0] + shape[1])),
                high=numpy.sqrt(6. / (shape[0] + shape[1])),
                size=(shape[0], shape[1])), dtype=theano.config.floatX)
        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.batch_size = batch_size
        self.n_ht = W.shape[0]
        self.m = m
        self.n_samples = n_samples
        self.csrng = CURAND_RandomStreams(123)
        mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0,
                                  high=1.0, dtype=theano.config.floatX)
        # rfun() returns the argsort of a fresh uniform column on every call
        self.rfun = theano.function([], mask.argsort(axis=0))
        # alpha[k] = 1 / (k + 1) for k in [0, n_ht)
        self.alpha = T.constant(
            1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))
        self.weights = [self.W]
        self.biases = []

    def __repr__(self):
        return "{}: W_shape: {}, m={}, n_samples={}, n_ht={}".format(
            self.__class__.__name__, self.W.shape.eval(), self.m,
            self.n_samples, self.n_ht)

    def output_func(self, input):
        """Build the score tensor ``self.f`` and return its argmax."""
        self.f = T.tensordot(input.dimshuffle(0, 'x', 1),
                             self.W.dimshuffle('x', 0, 1),
                             axes=[[1, 2], [0, 2]])  # cosine sim
        # NOTE(review): the cost methods index self.f as f[row, label], which
        # suggests the label axis is 1, yet the argmax is taken over axis 0.
        # Left unchanged -- confirm the intended axis.
        self.y_pred = T.argmax(self.f, axis=0)
        return self.y_pred

    def get_tag_neg(self, f, f_y):
        """Return ``sum(alpha[:rnk]) / rnk``, or 0 when ``rnk`` is 0.

        ``rnk`` is the number of entries of ``f`` greater than
        ``f_y - self.m``, minus one.
        """
        cand = f[(f > f_y - self.m).nonzero()]
        rnk = cand.shape[0] - 1  # due to i != y
        # This Python-level guard only works when ``rnk`` is a concrete
        # number; it is kept so that case still returns a plain 0.
        if rnk == 0:
            return 0
        l = T.sum(self.alpha[T.arange(rnk)])
        # For a symbolic ``rnk`` the ``==`` above is not a graph operation
        # and never fires, so the zero case is handled symbolically here.
        # The denominator is patched too, so the unselected branch of the
        # switch never divides by zero.
        is_zero = T.eq(rnk, 0)
        safe_rnk = T.switch(is_zero, 1, rnk)
        return T.switch(is_zero, 0, l / safe_rnk)

    def _warp_loss_cost(self, y, i):
        """Hinge cost ``max(0, m - f[y] + f[i])`` for given negatives ``i``."""
        f_y = self.f[T.arange(y.shape[0]), y]
        s = self.m - f_y + self.f[T.arange(i.shape[0]), i]
        return T.maximum(0.0, s)

    def warp_loss_cost(self, y, idx):
        """Hinge cost against a negative chosen from the scores themselves.

        NOTE(review): the ``idx`` argument is overwritten before it is read,
        so the caller's value is ignored.
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        f_yy = T.repeat(f_y.dimshuffle(0, 'x'), self.f.shape[1], axis=1)
        f_idx = T.maximum(0.0, f_yy - self.f + self.m)
        # column index of the smallest clipped value in each row
        idx = f_idx.argsort(axis=1)[:, 0]
        s = self.m - f_y + self.f[T.arange(idx.shape[0]), idx]
        return T.maximum(0.0, s)

    def training_cost(self, y, i):
        """Mean of ``warp_loss_cost`` over its output."""
        return T.mean(self.warp_loss_cost(y, i))
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'

    NOTE(review): this class reads many attributes that it never assigns
    anywhere in the code shown here, e.g. ``step_type``, ``xi_zmuv``, ``s0``,
    ``step_scales``, ``p_zi_given_xi``, ``p_sip1_given_zi``, ``p_x_given_si``,
    ``q_zi_given_xi``, ``nlli``, ``si``, ``kldi_p2q``/``kldi_q2p``/
    ``kldi_p2g``, ``total_steps``, ``train_switch``, ``use_rev_masks``,
    ``rev_masks_p``/``rev_masks_q``, ``rev_sched``, ``p_masks``/``q_masks``,
    ``s0_full``, ``m0_full``, ``mi_p`` and the method ``_from_si_to_x``.
    Unless something outside this class provides them, ``__init__`` raises
    AttributeError at the first such read.  The intended definitions cannot
    be determined from this file, so the reads are flagged, not changed.
    """
    def __init__(self, rng=None, x_out=None,
                 p_z_given_x=None,
                 p_x_given_z=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))
        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or
                   (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid, overriding the above
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is never assigned in this class.
        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out           # target output for generation
        self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX( np.zeros((1,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # NOTE(review): the two nll_* values below are the raw outputs of
            # log_prob_gaussian2; they are not negated in this function,
            # whereas _construct_nll_costs negates the same helper's output.
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean,
                                           log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean,
                                           log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()
            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, \
                zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op; only the first two
        # outputs are fed back into the next step
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is never assigned in this class (only
        # self.zi_zmuv is created above).
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            forwards_loop,
            outputs_info=outputs_init,
            sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0,
                         lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): s0, step_scales and the four networks below are not
        # assigned in this class (it stores p_z_given_x / p_x_given_z).
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = \
            self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this class.
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # the scan's own updates must be applied alongside the Adam updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1,))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0,
                    lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_s
        self.lam_kld_s.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (below 0.5) or 1.0 (otherwise) first.
        """
        # NOTE(review): self.train_switch is not assigned in this class.
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1,))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Construct the necessary ZMUV gaussian samples for generating
        trajectories from this WalkoutModel, for input matrix xo.
        """
        # NOTE(review): self.total_steps is not assigned in this class
        # (__init__ stores self.walkout_steps).
        zi_zmuv = self.rng.normal(
            size=(self.total_steps, xo.shape[0], self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_rev_masks(self, xo):
        """
        Compute the sequential revelation masks for the input batch in xo.
        -- We need to construct mask sequences for both p and q.
        """
        # NOTE(review): use_rev_masks, rev_masks_p/q and rev_sched are not
        # assigned in this class.
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            pmasks = self.rev_masks_p.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
        else:
            pm_list = []
            qm_list = []
            # make a zero mask that does nothing
            zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
            # generate independently sampled masks for each revelation block
            for rb in self.rev_sched:
                # make a random binary mask with ones at rate rb[1]
                rand_vals = self.rng.uniform(
                    size=(1, xo.shape[0], xo.shape[1]),
                    low=0.0, high=1.0, dtype=theano.config.floatX)
                rand_mask = rand_vals < rb[1]
                # append the masks for this revelation block to the mask lists
                #
                # the guide policy (in q) gets to peek at the values that will be
                # revealed to the primary policy (in p) for the entire block. The
                # primary policy only gets to see these values at end of the final
                # step of the block. Within a given step, values are revealed to q
                # at the beginning of the step, and to p at the end.
                #
                # e.g. in a revelation block with only a single step, the guide
                # policy sees the values at the beginning of the step, which allows
                # it to guide the step. the primary policy only gets to see the
                # values at the end of the step.
                #
                # i.e. a standard variational auto-encoder is equivalent to a
                # sequential revelation and refinement model with only one
                # revelation block, which has one step and a reveal rate of 1.0.
                #
                for refine_step in range(rb[0]-1):
                    pm_list.append(zero_mask)
                    qm_list.append(rand_mask)
                pm_list.append(rand_mask)
                qm_list.append(rand_mask)
            # concatenate each mask list into a 3-tensor
            pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX')
            qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Construct the negative log-likelihood part of free energy.
        -- only check NLL where nll_mask == 1
        """
        # NOTE(review): self._from_si_to_x is not defined in this class.
        xh = self._from_si_to_x( si )
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar,
                                          mask=nll_mask)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) -- assuming bernoullish outputs
        """
        x_i = self._from_si_to_x( s_i )
        x_j = self._from_si_to_x( s_j )
        kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + \
                ((1.0 - x_i) * (T.log(1.0-x_i) - T.log(1.0-x_j)))
        sum_kld = T.sum(kld_s, axis=1)
        return sum_kld

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Returns [kld_p, kld_q, kld_g, kld_s], each summed over all steps.
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        kld_sis = []
        # adding 0.0*si[0] gives self.s0 the same shape as one step's state
        s0 = 0.0*self.si[0] + self.s0
        for i in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
            if i == 0:
                kld_sis.append(self._construct_kld_s(self.si[i], s0))
            else:
                kld_sis.append(self._construct_kld_s(self.si[i], self.si[i-1]))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        kld_si = sum(kld_sis)
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[ xo ],
            outputs=[nll, kld],
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XO.shape[0],))
            kld_sum = np.zeros((XO.shape[0],))
            for i in range(sample_count):
                result = fe_term_sample(XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[ xo ],
            outputs=all_step_costs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # make a function for computing batch-based estimates of costs.
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            _all_costs = cost_func(to_fX(XO))
            _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX( np.asarray([k for k in _step_klds]) )
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX( np.asarray([k for k in _step_nlls]) )
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function; joint_updates applies the Adam step
        func = theano.function(
            inputs=[ xo ],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sequence_sampler(self):
        """
        Construct a function that samples state/mask trajectories for a batch.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function.  Only the scan's own updates are
        # applied, as in the other non-training functions: using
        # joint_updates here would run an Adam step on every sampling call.
        func = theano.function(
            inputs=[ xo ],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # get belief states and masks generated by the scan loop
            scan_vals = func(to_fX(XO))
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            for i in range(step_count):
                # outputs are laid out as [states..., masks...]
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # masked entries come from XO, the rest from the model
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i,:,:] = _xm
                xi_seq[i,:,:] = _xi
                mi_seq[i,:,:] = _mi
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled in order: self.params, a numpy copy of
        self.shared_param_dicts, and a dict of the child models' dicts.
        """
        assert(not (f_name is None))
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        # NOTE(review): these four networks are not assigned in this class.
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        # Everything is gathered before the file is opened, so a failure
        # above cannot leave a truncated file; the with-block closes the
        # handle even if a dump raises.
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" python values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
class GenConvModule(object):
    """
    Module of one "fractionally strided" convolution layer followed by one
    regular convolution layer. Inputs to the fractionally strided convolution
    can optionally be augmented with some random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """
    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal'):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        # only the first filter dimension is kept; filters are square
        self.filt_dim = filt_shape[0]
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        # default initializer is used only when the caller supplies none
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()
        return

    def _new_bn_pair(self, gain_name_fmt, bias_name_fmt):
        """Create one (gain, bias) pair for a batch-normalized transform."""
        gain_ifn = inits.Normal(loc=1., scale=0.02)
        bias_ifn = inits.Constant(c=0.)
        gain = gain_ifn(self.out_chans, gain_name_fmt.format(self.mod_name))
        bias = bias_ifn(self.out_chans, bias_name_fmt.format(self.mod_name))
        return gain, bias

    def _init_params(self):
        """
        Create the two filter banks and any batch-norm gains and biases.
        """
        side = self.filt_dim
        # the first conv also sees the random channels when they are stacked
        # on the exogenous input
        if self.use_rand:
            w1_in_chans = self.in_chans + self.rand_chans
        else:
            w1_in_chans = self.in_chans
        self.w1 = self.init_func((self.out_chans, w1_in_chans, side, side),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.out_chans, self.out_chans, side, side),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # gains and biases exist only for the transforms that get batch normed
        if self.apply_bn_1:
            self.g1, self.b1 = self._new_bn_pair("{}_g1", "{}_b1")
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            self.g2, self.b2 = self._new_bn_pair("{}_g2", "{}_b2")
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, input, rand_vals=None):
        """
        Run the module on ``input`` and return the second conv's activation.

        ``rand_vals``, when given and ``use_rand`` is set, replaces the
        internally drawn random channels.
        """
        n_batch = input.shape[0]
        pad = int((self.filt_dim - 1) / 2)  # use "same" mode convolutions
        stride = self.us_stride             # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" the input by repeating along both spatial axes
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # shape of the random channels that may be stacked on the input
        rand_shape = (n_batch, self.rand_chans,
                      input.shape[2], input.shape[3])

        if not self.use_rand:
            # don't augment input with random channels
            full_input = input
        else:
            if rand_vals is None:
                # draw the random channels here
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(
                        size=rand_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(
                        size=rand_shape, low=-1.0, high=1.0,
                        dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # stack random values on top of input
            full_input = T.concatenate([rand_vals, input], axis=1)

        # first convolution: plain after unpooling, otherwise fractionally
        # strided so that it does the upsampling itself
        if self.use_pooling:
            h1 = dnn_conv(full_input, self.w1, subsample=(1, 1),
                          border_mode=(pad, pad))
        else:
            h1 = deconv(full_input, self.w1, subsample=(stride, stride),
                        border_mode=(pad, pad))
        if self.apply_bn_1:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        h1 = relu(h1)

        # second convolution
        h2 = dnn_conv(h1, self.w2, subsample=(1, 1), border_mode=(pad, pad))
        if self.apply_bn_2:
            h2 = batchnorm(h2, g=self.g2, b=self.b2)
        return relu(h2)
class DAELayer(object):
    """
    Denoising-autoencoder layer with tied encoder/decoder weights.

    The encoder computes ``activation(dot(X, W) + b_h)`` and the decoder
    computes ``dot(A_h, W.T) + b_v`` (see `_compute_activations`).
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None, W_scale=1.0):
        """
        Build the noisy input and the shared parameters for this layer.

        rng: object providing `randint` and `standard_normal`
             (presumably a numpy.random.RandomState -- confirm with callers)
        clean_input: reconstruction target used by `compute_costs`
        fuzzy_input: symbolic input that gets masking noise applied
        in_dim: dimension of the visible layer
        out_dim: dimension of the hidden layer
        activation: callable applied to the hidden pre-activations
        input_noise: probability of dropping each element of fuzzy_input
        W, b_h, b_v: optional pre-built shared parameters
        W_scale: multiplier applied to randomly initialized weights
        """
        # Per-layer random stream, seeded from the caller's generator
        self.rng = RandStream(rng.randint(1000000))

        # Keep the clean target and build the perturbed input -- this is
        # what makes the autoencoder a _denoising_ one.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # Basic layer properties
        self.activation = activation
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Create whichever parameters the caller did not supply
        if W is None:
            # NOTE(review): `dtype` is handed to DCG here rather than to
            # np.asarray -- confirm that this is what DCG expects.
            gauss = rng.standard_normal(size=(in_dim, out_dim))
            W_init = np.asarray(
                1.0 * DCG(gauss, dtype=theano.config.floatX))
            W = theano.shared(value=(W_scale*W_init), name='W')
        if b_h is None:
            hid_bias = np.zeros((out_dim,), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_bias, name='b_h')
        if b_v is None:
            vis_bias = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_bias, name='b_v')

        # Keep handles to the (now initialized) parameters
        self.W = W
        self.b_h = b_h
        self.b_v = b_v
        # List of the learnable/optimizable parameters
        self.params = [self.W, self.b_h, self.b_v]

    def compute_costs(self, lam_l1=None):
        """
        Compute reconstruction and activation sparsity costs.

        lam_l1: indexable whose first element weights the sparsity penalty
                (the default of None is not usable: `lam_l1[0]` is read).

        Returns [recon_cost, sparse_cost].
        """
        # Only the weights are perturbed; the hidden bias is used as-is
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        # Visible (reconstruction) and hidden activations
        A_v, A_h = self._compute_activations(
            self.noisy_input, noisy_W, hid_bias, self.b_v)
        # Squared reconstruction error, divided by the number of rows
        sq_err = T.sum((self.clean_input - A_v)**2.0)
        recon_cost = sq_err / self.clean_input.shape[0]
        # Sparsity penalty (over both population and lifetime)
        row_l1_sum = T.sum(abs(row_normalize(A_h))) / A_h.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(A_h))) / A_h.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Compute activations of encoder (at hidden layer)."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """
        Compute decoder (visible) and encoder (hidden) activations.

        Returns [A_v, A_h]; the decoder reuses W transposed.
        """
        A_h = self._compute_hidden_acts(X, W, b_h)
        A_v = T.dot(A_h, W.T) + b_v
        return [A_v, A_h]

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Noisy weights, like convolving energy surface with a gaussian.

        Returns P itself unless noise_lvl exceeds 1e-3.
        """
        if not noise_lvl > 1e-3:
            return P
        jitter = DCG(self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                     dtype=theano.config.floatX))
        return P + jitter

    def _get_noisy_input(self, input, p):
        """p is the probability of dropping elements of input."""
        uniform = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                   dtype=theano.config.floatX)
        # Elements whose draw exceeds p are kept; the rest are zeroed.
        keep_mask = uniform > p
        # DCG converts the mask before the multiply (original note: cast
        # mask from int to float32, to keep things on GPU)
        return input * DCG(keep_mask)
class HiddenLayer(object):
    """
    Fully-connected layer with optional input noise, input dropout and
    pre-activation noise.

    The activation defaults to `relu_actfun`; when pool_size > 1 it is
    `maxout_actfun` over `filt_count = out_dim * pool_size` linear filters.
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        """
        Create the layer parameters and build its feedforward graph.

        rng: object providing `randint`, used to seed the layer's stream
        input: symbolic input to the layer
        in_dim: input dimension
        out_dim: output dimension (after any maxout pooling)
        activation: activation callable; None selects `relu_actfun`
        pool_size: maxout pool size; values <= 1 disable pooling
        drop_rate: probability of dropping each input element
        input_noise: scale of gaussian noise added to the input
        bias_noise: scale of gaussian noise added to the pre-activations
        W, b, b_in, s_in: optional pre-built shared parameters
        name: prefix used when naming the shared variables
        W_scale: gain passed to `ortho_matrix` when W is created here
        """
        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # Noise/dropout levels live in 1-element shared arrays
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.input_noise = theano.shared(
            value=(zero_ary+input_noise),
            name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(
            value=(zero_ary+bias_noise),
            name="{0:s}_bias_noise".format(name))
        self.drop_rate = theano.shared(
            value=(zero_ary+drop_rate),
            name="{0:s}_drop_rate".format(name))

        # Scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # input scales start at 0.541325 (~ log(e - 1)), i.e. the raw
            # value whose softplus is approximately one
            ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division keeps the pool count integral under Python 3
        # (true division would silently turn it into a float).
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count),
                                  gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        # Set layer weights and biases
        self.W = W
        self.b = b

        # Feedforward through the layer; each kind of noise is only wired
        # into the graph when its construction-time level exceeds 0.001
        use_in = input_noise > 0.001
        use_bn = bias_noise > 0.001
        use_drop = drop_rate > 0.001
        self.linear_output, self.noisy_linear, self.output = \
            self.apply(input, use_in=use_in, use_bn=use_bn,
                       use_drop=use_drop)

        # Mean squared pre-activation, probably to regularize
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = {
            'W': self.W,
            'b': self.b,
            'b_in': self.b_in,
            's_in': self.s_in}

    def apply(self, input, use_in=False, use_bn=False, use_drop=False):
        """
        Apply feedforward to this input, returning several partial results.

        input: symbolic input
        use_in: add gaussian noise (scaled by self.input_noise) to the input
        use_bn: add gaussian noise (scaled by self.bias_noise) to the
                pre-activations
        use_drop: apply dropout (rate self.drop_rate) to the input

        Returns [linear_output, noisy_linear, final_output]. Also stores the
        perturbed input on self.noisy_input.
        """
        # The learned input shift/scale is currently disabled:
        #fancy_input = T.nnet.softplus(self.s_in) * (input + self.b_in)
        fancy_input = input
        # Add gaussian noise to the input (if desired)
        if use_in:
            fuzzy_input = fancy_input + self.input_noise[0] * \
                self.rng.normal(size=fancy_input.shape, avg=0.0, std=1.0,
                                dtype=theano.config.floatX)
        else:
            fuzzy_input = fancy_input
        # Apply masking noise to the input (if desired)
        if use_drop:
            noisy_input = self._drop_from_input(fuzzy_input,
                                                self.drop_rate[0])
        else:
            noisy_input = fuzzy_input
        self.noisy_input = noisy_input
        # Compute linear "pre-activation" for this layer
        linear_output = T.dot(noisy_input, self.W) + self.b
        # Add noise to the pre-activation features (if desired)
        if use_bn:
            noisy_linear = linear_output + self.bias_noise[0] * \
                self.rng.normal(size=linear_output.shape, avg=0.0,
                                std=1.0, dtype=theano.config.floatX)
        else:
            noisy_linear = linear_output
        # Apply activation function
        final_output = self.activation(noisy_linear)
        # package partial results for easy return
        results = [linear_output, noisy_linear, final_output]
        return results

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class ClassModel(object):
    """
    Controller for training a fancy pseudo-bayesian classifier.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        y_in: int labels >= 1 for x_in when available, otherwise 0.
        q_z_given_x: InfNet for z given x
        class_count: number of classes to classify into
        z_dim: dimension of the "initial" latent space
        use_samples: whether to use z samples or just z mean
    """

    def __init__(self, rng=None,
                 x_in=None, y_in=None,
                 q_z_given_x=None,
                 class_count=None,
                 z_dim=None,
                 use_samples=None):
        # random stream for this model, seeded from the caller's rng
        self.rng = RandStream(rng.randint(100000))

        # dimensions of the spaces relevant to this model
        self.class_count = class_count
        self.z_dim = z_dim
        self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
        # stored only; the graph below always uses the shared features
        self.use_samples = use_samples

        # handle to the inference network
        self.q_z_given_x = q_z_given_x

        # symbolic variables that feed this model's computation graph
        self.x_in = x_in
        self.y_in = y_in

        # 1-element array used to initialize scalar-like shared variables
        zero_ary = to_fX( np.zeros((1,)) )
        # shared variable controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='cm_drop_rate')
        self.set_drop_rate(0.0)

        # classification layer parameters
        # NOTE(review): the weights come from the module-level `npr`, not
        # from the `rng` argument.
        init_mat = to_fX(0.01 * npr.randn(self.shared_dim, self.class_count))
        init_vec = to_fX( np.zeros((self.class_count,)) )
        self.W_class = theano.shared(value=init_mat, name='cm_W_class')
        self.b_class = theano.shared(value=init_vec, name='cm_b_class')
        # "optimizable" prior parameters specific to this model
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='cm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='cm_p_z_logvar')

        #################
        # Setup self.z. #
        #################
        self.q_z_mean, self.q_z_logvar, self.q_z_samples = \
            self.q_z_given_x.apply(self.x_in, do_samples=True)
        # q_z_samples is then replaced by the output of apply_shared
        self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)
        # dropout mask, pre-multiplied by the rescaling factor
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.q_z_samples.shape,
                                    low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])
        self.z = self.q_z_samples * drop_mask
        # class predictions (pre-softmax)
        self.y_out = T.dot(self.z, self.W_class) + self.b_class
        # KLds for training via variational free-energy
        self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                       self.q_z_mean, self.q_z_logvar)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared vars for learning rates
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='cm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='cm_lr_2')
        # shared vars for momentum parameters
        self.mom_1 = theano.shared(value=zero_ary, name='cm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='cm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # shared var weighting the classification nll
        self.lam_nll = theano.shared(value=zero_ary, name='cm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # shared vars weighting the two KLd directions
        self.lam_kld_q2p = theano.shared(value=zero_ary,
                                         name='cm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary,
                                         name='cm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
        # shared var controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='cm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # all of the "optimizable" parameters
        self.joint_params = [self.p_z_mean, self.p_z_logvar,
                             self.W_class, self.b_class]
        self.joint_params.extend(self.q_z_given_x.mlp_params)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
        self.nll_cost = T.mean(self.nll_costs)
        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
        q2p_term = self.lam_kld_q2p[0] * self.kld_z_q2p
        p2q_term = self.lam_kld_p2q[0] * self.kld_z_p2q
        self.kld_costs = q2p_term + p2q_term
        self.kld_cost = T.mean(self.kld_costs)
        ##################################
        # CONSTRUCT THE FINAL JOINT COST #
        ##################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # gradient of the joint cost for every optimizable parameter
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, grad_list):
            self.joint_grads[param] = grad

        # updates for the model parameters
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr_1,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # compile the functions exposed by this model
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling class error estimator...")
        self.class_error = self._construct_class_error()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        # easy access point for the first shared layer's weights
        self.inf_weights = self.q_z_given_x.shared_layers[0].W

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01,
                       mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        base = np.zeros((1,))
        # learning rates
        self.lr_1.set_value(to_fX(base + lr_1))
        self.lr_2.set_value(to_fX(base + lr_2))
        # momentums
        self.mom_1.set_value(to_fX(base + mom_1))
        self.mom_2.set_value(to_fX(base + mom_2))

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self.lam_nll.set_value(to_fX(np.zeros((1,)) + lam_nll))

    def set_lam_kld(self, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.
        """
        base = np.zeros((1,))
        self.lam_kld_q2p.set_value(to_fX(base + lam_kld_q2p))
        self.lam_kld_p2q.set_value(to_fX(base + lam_kld_p2q))

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self.lam_l2w.set_value(to_fX(np.zeros((1,)) + lam_l2w))

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the dropout rate used when masking the features in self.z.
        """
        self.drop_rate.set_value(to_fX(np.zeros((1,)) + drop_rate))

    def _construct_nll_costs(self, yi):
        """
        Construct the categorical log-likelihood part of cost.

        Rows of yi equal to 0 are treated as unlabeled: their probabilities
        are replaced by 1, so the looked-up entry has zero log cost.
        """
        n_rows = yi.shape[0]
        y_prob = safe_softmax(self.y_out)
        row_idx = T.arange(n_rows)
        # labels start at 1, so shift them to get column indices
        col_idx = yi.flatten() - 1
        labeled = T.neq(yi, 0).reshape((n_rows, 1))
        masked_prob = (y_prob * labeled) + (1. - labeled)
        flat_nlls = -T.log(masked_prob[row_idx,col_idx])
        return flat_nlls.reshape((n_rows, 1))

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the z KLd part of cost.

        Element-wise KLds are raised to the power p and summed over axis 1.
        """
        kld_z_q2p = T.sum(self.kld_z_q2ps**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(self.kld_z_p2qs**p, axis=1, keepdims=True)
        return kld_z_q2p, kld_z_p2q

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization: the sum of
        squared entries over every parameter in self.joint_params.
        """
        squared_norms = [T.sum(p**2.0) for p in self.joint_params]
        return sum(squared_norms)

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.
        """
        # values reported by each training call
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        return theano.function(inputs=[self.x_in, self.y_in],
                               outputs=outputs,
                               updates=self.joint_updates)

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned callable averages the nll and KLd terms over
        `sample_count` evaluations and returns [mean_nll, mean_kld].
        """
        # values to output (this nll is not scaled by lam_nll)
        nll = self._construct_nll_costs(self.y_in)
        kld = self.kld_z_q2p
        # theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[self.x_in, self.y_in],
                                         outputs=[nll, kld])

        # wrapper for a multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            obs_count = XI.shape[0]
            nll_sum = np.zeros((obs_count,))
            kld_sum = np.zeros((obs_count,))
            for _ in range(sample_count):
                nll_i, kld_i = fe_term_sample(XI, XO)[:2]
                nll_sum += nll_i.ravel()
                kld_sum += kld_i.ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_class_error(self):
        """
        Compute classification error for a set of observations xi with known
        labels yi, based on multiple passes through noisy initial model.

        The returned callable gives (err_rate, yp), where yp is the averaged
        self.y_out and rows with label 0 are excluded from the error rate.
        """
        # function for computing self.y_out
        y_func = theano.function([self.x_in], outputs=self.y_out)

        def multi_sample_error(xi, yi, samples=20):
            # average self.y_out over `samples` passes
            xi = to_fX(xi)
            yp = y_func(xi)
            for _ in range(samples-1):
                yp += y_func(xi)
            yp = yp / float(samples)
            # implied class labels (0-based)
            yc = np.argmax(yp, axis=1).flatten()
            yi = yi.flatten()
            # only points with a non-zero label count
            mask = 1.0 * (yi != 0)
            yi = yi - 1
            # classification error for points with valid labels
            wrong = (yi != yc) * mask
            err_rate = np.sum(wrong) / np.sum(mask)
            return err_rate, yp
        return multi_sample_error
class GenFCModule(object):
    """
    Module that transforms random values through a single fully connected
    layer, and then a linear transform (with another relu, optionally).
    """

    def __init__(self,
                 rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc'):
        """
        rand_dim: dimension of the random input
        out_dim: dimension of the module output
        fc_dim: dimension of the fully connected layer
        apply_bn_1, apply_bn_2: batch-normalize the first / second transform
        init_func: weight initializer; None selects inits.Normal(scale=0.02)
        rand_type: 'normal' for gaussian inputs, anything else for uniform
        final_relu: apply a relu to the output
        mod_name: prefix used when naming the parameters
        """
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream with a fixed seed
        self.rng = RandStream(123)
        self.init_func = (inits.Normal(scale=0.02) if init_func is None
                          else init_func)
        # create the trainable parameters
        self._init_params()

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # gains and biases for the transforms that will get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.fc_dim), "{}_g1".format(prefix))
            self.b1 = make_bias((self.fc_dim), "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain((self.out_dim), "{}_g2".format(prefix))
            self.b2 = make_bias((self.out_dim), "{}_b2".format(prefix))
            self.params.extend([self.g2, self.b2])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is None, random inputs of shape
        (batch_size, rand_dim) are drawn from self.rng.
        """
        assert not ((batch_size is None) and (rand_vals is None)), "need either batch_size or rand_vals"
        if rand_vals is not None:
            # caller-supplied values are reshaped to (rows, rand_dim)
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape,
                                            avg=0.0, std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape,
                                             low=-1.0, high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # random values -> fc layer
        hid = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hid = batchnorm(hid, g=self.g1, b=self.b1)
        hid = relu(hid)
        # fc layer -> output
        out = T.dot(hid, self.w2)
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        if self.final_relu:
            out = relu(out)
        return out
class GenUniModule(object):
    """
    Module that applies a linear transform followed by an non-linearity.
    """

    def __init__(self,
                 rand_dim, out_dim,
                 apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True,
                 mod_name='dm_uni'):
        """
        rand_dim: dimension of the random input
        out_dim: dimension of the module output
        apply_bn: batch-normalize the linear transform
        init_func: weight initializer; None selects inits.Normal(scale=0.02)
        rand_type: 'normal' for gaussian inputs, anything else for uniform
        final_relu: apply a relu to the output
        mod_name: prefix used when naming the parameters
        """
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream with a fixed seed
        self.rng = RandStream(123)
        self.init_func = (inits.Normal(scale=0.02) if init_func is None
                          else init_func)
        # create the trainable parameters
        self._init_params()

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(prefix))
        self.params = [self.w1]
        # gain and bias for the transform that will get batch normed
        if self.apply_bn:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.out_dim), "{}_g1".format(prefix))
            self.b1 = make_bias((self.out_dim), "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is None, random inputs of shape
        (batch_size, rand_dim) are drawn from self.rng.
        """
        assert not ((batch_size is None) and (rand_vals is None)), "need either batch_size or rand_vals"
        if rand_vals is not None:
            # caller-supplied values are reshaped to (rows, rand_dim)
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape,
                                            avg=0.0, std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape,
                                             low=-1.0, high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # transform random values linearly
        out = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        if self.final_relu:
            out = relu(out)
        return out

##############
# EYE BUFFER #
##############
class TwoStageModel(object): """ Controller for training a multi-step iterative refinement model. Parameters: rng: numpy.random.RandomState (for reproducibility) x_in: the input data to encode x_out: the target output to decode p_s_given_z: InfNet for initializing "canvas" state p_h_given_s: InfNet for h given s p_x_given_s_h: InfNet for x given s and h q_z_given_x: InfNet for z given x q_h_given_x_s: InfNet for h given x and s x_dim: dimension of the observations to generate z_dim: dimension of the "first" latent space s_dim: dimension of the "second" latent space h_dim: dimension of the "third" latent space params: REQUIRED PARAMS SHOWN BELOW x_type: can be "bernoulli" or "gaussian" obs_transform: can be 'none' or 'sigmoid' """ def __init__(self, rng=None, \ x_in=None, x_out=None, \ p_s_given_z=None, \ p_h_given_s=None, \ p_x_given_s_h=None, \ q_z_given_x=None, \ q_h_given_x_s=None, \ x_dim=None, \ z_dim=None, \ s_dim=None, \ h_dim=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(x) self.shared_param_dicts = shared_param_dicts # record the dimensions of various spaces relevant to this model self.x_dim = x_dim self.z_dim = z_dim self.s_dim = s_dim self.h_dim = h_dim # grab handles to the relevant InfNets self.q_z_given_x = q_z_given_x self.q_h_given_x_s = q_h_given_x_s self.p_s_given_z = p_s_given_z self.p_h_given_s = p_h_given_s 
self.p_x_given_s_h = p_x_given_s_h # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x_in = x_in self.x_out = x_out self.batch_reps = T.lscalar() # setup switching variable for changing between sampling/training zero_ary = to_fX( np.zeros((1,)) ) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) # setup a variable for controlling dropout noise self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate') self.set_drop_rate(0.0) # this weight balances l1 vs. l2 penalty on posterior KLds self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2') self.set_lam_kld_l1l2(1.0) if self.shared_param_dicts is None: # initialize "optimizable" parameters specific to this MSM init_vec = to_fX( np.zeros((self.z_dim,)) ) self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean') self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar') init_vec = to_fX( np.zeros((self.x_dim,)) ) self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar') self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) self.shared_param_dicts = {} self.shared_param_dicts['p_z_mean'] = self.p_z_mean self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: self.p_z_mean = self.shared_param_dicts['p_z_mean'] self.p_z_logvar = self.shared_param_dicts['p_z_logvar'] self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) # get a drop mask that drops things with probability p drop_scale = 1. / (1. - self.drop_rate[0]) drop_rnd = self.rng.uniform(size=self.x_out.shape, \ low=0.0, high=1.0, dtype=theano.config.floatX) drop_mask = drop_scale * (drop_rnd > self.drop_rate[0]) ############################################## # Setup the TwoStageModels main computation. 
# ############################################## print("Building TSM...") # samples of "first" latent state drop_x = drop_mask * self.x_in z_q_mean, z_q_logvar, self.z = \ self.q_z_given_x.apply(drop_x, do_samples=True) # compute relevant KLds for this step self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar, \ self.p_z_mean, self.p_z_logvar) self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \ z_q_mean, z_q_logvar) # transform "first" latent state into "second" latent state self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False) # get samples of h, conditioned on current s h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply( \ self.s, do_samples=True) # get variational samples of h, given s and x_out h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply( \ T.horizontal_stack(self.x_out, self.s), \ do_samples=True) # make h samples that can be switched between h_p and h_q self.h = (self.train_switch[0] * h_q) + \ ((1.0 - self.train_switch[0]) * h_p) # compute relevant KLds for this step self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar, \ h_p_mean, h_p_logvar) self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar, \ h_q_mean, h_q_logvar) # p_x_given_s_h is conditioned on s and h. 
self.x_gen, _ = self.p_x_given_s_h.apply( \ T.horizontal_stack(self.s, self.h), \ do_samples=False) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX( np.zeros((1,)) ) self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1') self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z') self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p') self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q') self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.group_1_params = [] self.group_1_params.extend(self.q_z_given_x.mlp_params) self.group_1_params.extend(self.q_h_given_x_s.mlp_params) # Grab all of the "optimizable" parameters in "group 2" self.group_2_params = [self.p_z_mean, self.p_z_logvar] self.group_2_params.extend(self.p_s_given_z.mlp_params) self.group_2_params.extend(self.p_h_given_s.mlp_params) self.group_2_params.extend(self.p_x_given_s_h.mlp_params) # Make a joint list of parameters group 1/2 self.joint_params = self.group_1_params + 
self.group_2_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \ self._construct_kld_costs(p=1.0) self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \ (self.lam_kld_p2q[0] * self.kld_z_p2q) self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \ (self.lam_kld_p2q[0] * self.kld_h_p2q) self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h # now do l2 KLd costs self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \ self._construct_kld_costs(p=2.0) self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_z_p2q) self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_h_p2q) self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h # compute joint l1/l2 KLd cost self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \ ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs) # compute "mean" (rather than per-input) costs self.kld_cost = T.mean(self.kld_costs) self.kl2_cost = T.mean(self.kl2_costs) self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self._construct_nll_costs(self.x_out) self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost ############################## # CONSTRUCT A PER-INPUT COST # ############################## self.obs_costs = self.nll_costs + self.kld_l1l2_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = 
T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.group_1_updates = get_adam_updates(params=self.group_1_params, \ grads=self.joint_grads, alpha=self.lr_1, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.group_2_updates = get_adam_updates(params=self.group_2_params, \ grads=self.joint_grads, alpha=self.lr_2, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.joint_updates = OrderedDict() for k in self.group_1_updates: self.joint_updates[k] = self.group_1_updates[k] for k in self.group_2_updates: self.joint_updates[k] = self.group_2_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling open-loop model sampler...") self.sample_from_prior = self._construct_sample_from_prior() print("Compiling data-guided model sampler...") self.sample_from_input = self._construct_sample_from_input() # make easy access points for some interesting parameters self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W return def set_sgd_params(self, lr_1=0.01, lr_2=0.01, \ mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. """ zero_ary = np.zeros((1,)) # set learning rates new_lr_1 = zero_ary + lr_1 self.lr_1.set_value(to_fX(new_lr_1)) new_lr_2 = zero_ary + lr_2 self.lr_2.set_value(to_fX(new_lr_2)) # set momentums new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_nll(self, lam_nll=1.0): """ Set weight for controlling the influence of the data likelihood. 
""" zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_nll self.lam_nll.set_value(to_fX(new_lam)) return def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0): """ Set the relative weight of various KL-divergences. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_kld_z self.lam_kld_z.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q2p self.lam_kld_q2p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_p2q self.lam_kld_p2q.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. """ if (switch_val < 0.5): switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1,)) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0): """ Set the weight for shaping penalty on conditional priors over zt. """ zero_ary = np.zeros((1,)) new_val = zero_ary + lam_kld_l1l2 self.lam_kld_l1l2.set_value(to_fX(new_val)) return def set_drop_rate(self, drop_rate=0.0): """ Set the weight for shaping penalty on conditional priors over zt. """ zero_ary = np.zeros((1,)) new_val = zero_ary + drop_rate self.drop_rate.set_value(to_fX(new_val)) return def _construct_nll_costs(self, xo): """ Construct the negative log-likelihood part of free energy. """ # average log-likelihood over the refinement sequence xh = self.obs_transform(self.x_gen) if self.x_type == 'bernoulli': ll_costs = log_prob_bernoulli(xo, xh) else: ll_costs = log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar) nll_costs = -ll_costs return nll_costs def _construct_kld_costs(self, p=1.0): """ Construct the posterior KL-divergence part of cost to minimize. 
""" kld_z_q2p = T.sum(self.kld_z_q2ps**p, axis=1, keepdims=True) kld_z_p2q = T.sum(self.kld_z_p2qs**p, axis=1, keepdims=True) kld_h_q2p = T.sum(self.kld_h_q2ps**p, axis=1, keepdims=True) kld_h_p2q = T.sum(self.kld_h_p2qs**p, axis=1, keepdims=True) return [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params]) return param_reg_cost def _construct_train_joint(self): """ Construct theano function to train all networks jointly. """ # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_cost, self.kld_cost, \ self.reg_cost, self.obs_costs] # compile the theano function func = theano.function(inputs=[ xi, xo, self.batch_reps ], \ outputs=outputs, \ givens={ self.x_in: xi.repeat(self.batch_reps, axis=0), \ self.x_out: xo.repeat(self.batch_reps, axis=0) }, \ updates=self.joint_updates) return func def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. 
""" # construct values to output nll = self._construct_nll_costs(self.x_out) kld = self.kld_z_q2p + self.kld_h_q2p # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function(inputs=[self.x_in, self.x_out], \ outputs=[nll, kld]) # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XI, XO, sample_count): # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XI.shape[0],)) kld_sum = np.zeros((XI.shape[0],)) for i in range(sample_count): result = fe_term_sample(XI, XO) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) return [mean_nll, mean_kld] return fe_term_estimator def _construct_sample_from_prior(self): """ Construct a function for drawing independent samples from the distribution generated by this MultiStageModel. This function returns the full sequence of "partially completed" examples. """ z_sym = T.matrix() x_sym = T.matrix() sample_func = theano.function(inputs=[z_sym, x_sym], \ outputs=self.obs_transform(self.x_gen), \ givens={self.z: z_sym, \ self.x_in: T.zeros_like(x_sym), \ self.x_out: T.zeros_like(x_sym)}) def prior_sampler(samp_count): x_samps = to_fX( np.zeros((samp_count, self.x_dim)) ) old_switch = self.train_switch.get_value(borrow=False) # set model to generation mode self.set_train_switch(switch_val=0.0) z_samps = to_fX( npr.randn(samp_count, self.z_dim) ) model_samps = sample_func(z_samps, x_samps) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return model_samps return prior_sampler def _construct_sample_from_input(self): """ Construct a function for drawing samples from the distribution generated by this MultiStageModel, conditioned on some inputs to the initial encoder stage (i.e. self.q_z_given_x). This returns the full sequence of "partially completed" examples. 
The """ sample_func = theano.function(inputs=[self.x_in, self.x_out], \ outputs=self.obs_transform(self.x_gen)) def conditional_sampler(XI, XO=None, guided_decoding=False): XI = to_fX( XI ) if XO is None: XO = XI XO = to_fX( XO ) # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if guided_decoding: # take samples from guide policies (i.e. variational q) self.set_train_switch(switch_val=1.0) else: # take samples from model's generative policy self.set_train_switch(switch_val=0.0) # draw guided/unguided conditional samples model_samps = sample_func(XI, XO) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return model_samps return conditional_sampler
class GenFCModule(object):
    """
    Module that transforms random values through a single fully connected
    layer, and then a linear transform (with another relu, optionally).

    Params:
        rand_dim: dimension of the random values fed into the module
        out_dim: dimension of the output produced by the module
        fc_dim: dimension of the fully connected (hidden) layer
        apply_bn_1: flag for whether to batch normalize the hidden layer
        apply_bn_2: flag for whether to batch normalize the output layer
        init_func: function for initializing module parameters
                   (defaults to inits.Normal(scale=0.02))
        rand_type: 'normal' draws Gaussian values (avg=0, std=1), any other
                   value draws uniform values in [-1, 1]
        final_relu: flag for whether to apply a relu to the output
        mod_name: text name for identifying module in theano graph
        rng_seed: seed for the random stream owned by this module. The
                  default (123) reproduces the historical behavior; pass
                  distinct seeds to modules which should not share a seed.
    """
    def __init__(self, rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc',
                 rng_seed=123):
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        self.rng_seed = rng_seed
        self.rng = RandStream(rng_seed)
        if init_func is None:
            self.init_func = inits.Normal(scale=0.02)
        else:
            self.init_func = init_func
        self._init_params()  # initialize parameters
        return

    def _init_bn_params(self, dim, tag):
        """
        Make the (gain, bias) pair for one batch normed transform.

        The gain is created before the bias, so that the initializers are
        invoked in the same order as they always were.
        """
        gain_ifn = inits.Normal(loc=1., scale=0.02)
        bias_ifn = inits.Constant(c=0.)
        gain = gain_ifn(dim, "{}_g{}".format(self.mod_name, tag))
        bias = bias_ifn(dim, "{}_b{}".format(self.mod_name, tag))
        return gain, bias

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.

        Fills self.params with [w1, w2], followed by [g1, b1] and/or
        [g2, b2] when the corresponding batch norm flag is set.
        """
        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # make gains and biases for transforms that will get batch normed
        if self.apply_bn_1:
            self.g1, self.b1 = self._init_bn_params(self.fc_dim, 1)
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            self.g2, self.b2 = self._init_bn_params(self.out_dim, 2)
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is given it is used as the module's input (after
        being reshaped to (rand_vals.shape[0], rand_dim)), and batch_size
        is ignored. Otherwise, fresh random values with shape
        (batch_size, rand_dim) are drawn from this module's random stream.

        Returns the symbolic output of the second (linear) transform, after
        optional batch normalization and optional relu.
        """
        assert not ((batch_size is None) and (rand_vals is None)), \
            "need either batch_size or rand_vals"
        if rand_vals is None:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape,
                                            avg=0.0, std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape,
                                             low=-1.0, high=1.0,
                                             dtype=theano.config.floatX)
        else:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        # for freshly drawn values this reshape leaves the shape unchanged
        rand_vals = rand_vals.reshape(rand_shape)
        # transform random values into fc layer
        h1 = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        h1 = relu(h1)
        # transform from fc layer to output
        h2 = T.dot(h1, self.w2)
        if self.apply_bn_2:
            h2 = batchnorm(h2, g=self.g2, b=self.b2)
        if self.final_relu:
            h2 = relu(h2)
        return h2
class GenConvModule(object):
    """
    Module of one "fractionally strided" convolution layer followed by one
    regular convolution layer. Inputs to the fractionally strided convolution
    can optionally be augmented with some random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
                    (only filt_shape[0] is read; it is used for both of the
                    spatial dimensions of the filters)
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
                   (defaults to inits.Normal(scale=0.02))
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness ('normal'
                   gives Gaussian, any other value gives uniform in [-1, 1])
        rng_seed: seed for the random stream owned by this module. The
                  default (123) reproduces the historical behavior; pass
                  distinct seeds to modules which should not share a seed.
    """
    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal', rng_seed=123):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        self.filt_dim = filt_shape[0]
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng_seed = rng_seed
        self.rng = RandStream(rng_seed)
        if init_func is None:
            self.init_func = inits.Normal(scale=0.02)
        else:
            self.init_func = init_func
        self._init_params()  # initialize parameters
        return

    def _init_bn_params(self, tag):
        """
        Make the (gain, bias) pair for one batch normed convolution.

        The gain is created before the bias, so that the initializers are
        invoked in the same order as they always were.
        """
        gain_ifn = inits.Normal(loc=1., scale=0.02)
        bias_ifn = inits.Constant(c=0.)
        gain = gain_ifn(self.out_chans, "{}_g{}".format(self.mod_name, tag))
        bias = bias_ifn(self.out_chans, "{}_b{}".format(self.mod_name, tag))
        return gain, bias

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.

        Fills self.params with [w1, w2], followed by [g1, b1] and/or
        [g2, b2] when the corresponding batch norm flag is set.
        """
        if self.use_rand:
            # random values will be stacked on exogenous input
            w1_in_chans = self.in_chans + self.rand_chans
        else:
            # random values won't be stacked on exogenous input
            w1_in_chans = self.in_chans
        self.w1 = self.init_func(
            (self.out_chans, w1_in_chans, self.filt_dim, self.filt_dim),
            "{}_w1".format(self.mod_name))
        self.w2 = self.init_func(
            (self.out_chans, self.out_chans, self.filt_dim, self.filt_dim),
            "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # make gains and biases for transforms that will get batch normed
        if self.apply_bn_1:
            self.g1, self.b1 = self._init_bn_params(1)
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            self.g2, self.b2 = self._init_bn_params(2)
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        input is indexed as a 4d tensor, with channels on axis 1 and the
        spatial dimensions on axes 2 and 3. When use_rand is set, rand_vals
        (or freshly drawn random values, if rand_vals is None) are stacked
        onto the input along the channel axis before the first convolution.

        Returns the symbolic output of the second convolution, after
        optional batch normalization and a relu.
        """
        batch_size = input.shape[0]
        bm = (self.filt_dim - 1) // 2  # use "same" mode convolutions
        ss = self.us_stride            # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" the input if desired
            input = input.repeat(ss, axis=2).repeat(ss, axis=3)
        # get shape for random values that will augment input
        rand_shape = (batch_size, self.rand_chans,
                      input.shape[2], input.shape[3])
        if self.use_rand:
            # augment input with random channels
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(size=rand_shape,
                                                avg=0.0, std=1.0,
                                                dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(size=rand_shape,
                                                 low=-1.0, high=1.0,
                                                 dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # stack random values on top of input
            full_input = T.concatenate([rand_vals, input], axis=1)
        else:
            # don't augment input with random channels
            full_input = input
        # apply first convolution, perhaps with fractional striding
        if self.use_pooling:
            # input was already upsampled by repetition, so use unit stride
            h1 = dnn_conv(full_input, self.w1,
                          subsample=(1, 1), border_mode=(bm, bm))
        else:
            # apply first conv layer (with fractional stride for upsampling)
            h1 = deconv(full_input, self.w1,
                        subsample=(ss, ss), border_mode=(bm, bm))
        if self.apply_bn_1:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        h1 = relu(h1)
        # apply second conv layer
        h2 = dnn_conv(h1, self.w2, subsample=(1, 1), border_mode=(bm, bm))
        if self.apply_bn_2:
            h2 = batchnorm(h2, g=self.g2, b=self.b2)
        h2 = relu(h2)
        return h2
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a theano.tensor.dtensor4 shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    Params:
        rng: source of randomness, which must provide randint() and normal()
             (e.g. a numpy.random.RandomState)
        input: symbolic input to the layer
        filt_def: shape of the filter bank (see above)
        pool_def: max-pooling size and stride (see above)
        activation: function applied to the pooled, biased filter responses
                    (defaults to relu_actfun)
        drop_rate: probability of dropping each input element (masking noise
                   is applied only when drop_rate > 1e-4)
        input_noise: std of Gaussian noise added to the input (applied only
                     when input_noise > 1e-4)
        bias_noise: std of Gaussian noise added to the filter responses
                    before pooling (applied only when bias_noise > 1e-4)
        W: optional shared variable holding the filters, laid out like
           filt_def. When given, it is used in place of freshly initialized
           filters (and W_scale is not applied to it).
        b: optional shared variable holding one bias per filter. When given,
           it is used in place of a freshly initialized bias vector.
        name: prefix for the names of freshly created shared variables
        W_scale: multiplier applied to freshly initialized filters
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        if activation:
            self.activation = activation
        else:
            self.activation = relu_actfun

        # use the given filters, or initialize some with random weights
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        if b is None:
            b_init = np.zeros((filt_def[0],), dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add gaussian noise to the filter responses (if desired)
        if (bias_noise > 1e-4):
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + \
            self.b.dimshuffle('x', 0, 'x', 'x')
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]
        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class WalkoutModel(object): """ Controller for training a forwards-backwards chainy model. Parameters: rng: numpy.random.RandomState (for reproducibility) x_out: the goal state for forwards-backwards walking process p_z_given_x: InfNet for stochastic part of step p_x_given_z: HydraNet for deterministic part of step params: REQUIRED PARAMS SHOWN BELOW x_dim: dimension of observations to construct z_dim: dimension of latent space for policy wobble walkout_steps: number of steps to walk out x_type: can be "bernoulli" or "gaussian" x_transform: can be 'none' or 'sigmoid' """ def __init__(self, rng=None, x_out=None, \ p_z_given_x=None, \ p_x_given_z=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this WalkoutModel self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_dim = self.params['x_dim'] self.z_dim = self.params['z_dim'] self.walkout_steps = self.params['walkout_steps'] self.x_type = self.params['x_type'] self.shared_param_dicts = shared_param_dicts if 'x_transform' in self.params: assert((self.params['x_transform'] == 'sigmoid') or \ (self.params['x_transform'] == 'none')) if self.params['x_transform'] == 'sigmoid': self.x_transform = lambda x: T.nnet.sigmoid(x) else: self.x_transform = lambda x: x else: self.x_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.x_transform = lambda x: T.nnet.sigmoid(x) assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) assert ((self.step_type == 'add') or (self.step_type == 'jump')) # grab handles to the relevant networks self.p_z_given_x = p_z_given_x self.p_x_given_z = p_x_given_z # record the symbolic variables that will provide inputs to the # computation graph created for this WalkoutModel self.x_out = x_out # target output for generation self.zi_zmuv = T.tensor3() # ZMUV gauss noise for walk-out wobble if self.shared_param_dicts is None: # initialize the parameters "owned" by this model zero_ary = 
to_fX(np.zeros((1, ))) self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar') self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0]) self.shared_param_dicts = {} self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: # grab the parameters required by this model from a given dict self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0]) ############################################################### # Setup the forwards (i.e. training) walk-out loop using scan # ############################################################### def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw): # get samples of next zi, according to the forwards model zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \ do_samples=False) zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv) # check reverse direction probability p(xi_fw | zi_fw) xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \ do_samples=False) xi_bw_mean = self.x_transform(xi_bw_mean) nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \ log_vars=xi_bw_logvar, mask=None) nll_xi_bw = nll_xi_bw.flatten() # get samples of next xi, according to the forwards model xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \ do_samples=False) xi_fw_mean = self.x_transform(xi_fw_mean) xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv) # check reverse direction probability p(zi_fw | xi_fw) zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \ do_samples=False) nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \ log_vars=zi_bw_logvar, mask=None) nll_zi_bw = nll_zi_bw.flatten() # each loop iteration produces the following values: # xi_fw: xi generated fom zi by forwards walk # zi_fw: zi generated fom xi by forwards walk # xi_fw_mean: ---- # xi_fw_logvar: ---- # zi_fw_mean: ---- # zi_fw_logvar: ---- # nll_xi_bw: NLL for reverse step zi_fw -> xi_fw # nll_zi_bw: NLL for reverse step xi_fw -> zi_fw return xi_fw, zi_fw, 
xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw # initialize states for x/z self.x0 = self.x_out self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim) # setup initial values to pass to scan op outputs_init = [self.x0, self.z0, None, None, None, None, None, None] sequences_init = [self.xi_zmuv, self.zi_zmuv] # apply scan op for the sequential imputation loop self.scan_results, self.scan_updates = theano.scan(forwards_loop, \ outputs_info=outputs_init, \ sequences=sequences_init) # grab results of the scan op. all values are computed for each step self.xi = self.scan_results[0] self.zi = self.scan_results[1] self.xi_fw_mean = self.scan_results[2] self.xi_fw_logvar = self.scan_results[3] self.zi_fw_mean = self.scan_results[4] self.zi_fw_logvar = self.scan_results[5] self.nll_xi_bw = self.scan_results[6] self.nll_zi_bw = self.scan_results[7] ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX(np.zeros((1, ))) self.lr = theano.shared(value=zero_ary, name='srr_lr') # shared var momentum parameters for ADAM optimization self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared vars for weighting prior kld against reconstruction self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p') self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q') self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g') self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s') self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0) # init shared var for controlling l2 regularization on params self.lam_l2w = 
theano.shared(value=zero_ary, name='srr_lam_l2w') self.set_lam_l2w(1e-5) # grab all of the "optimizable" parameters from the base networks self.joint_params = [self.s0, self.obs_logvar, self.step_scales] self.joint_params.extend(self.p_zi_given_xi.mlp_params) self.joint_params.extend(self.p_sip1_given_zi.mlp_params) self.joint_params.extend(self.p_x_given_si.mlp_params) self.joint_params.extend(self.q_zi_given_xi.mlp_params) ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs( p=1.0) self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \ (self.lam_kld_q[0] * self.kld_q) + \ (self.lam_kld_g[0] * self.kld_g) + \ (self.lam_kld_s[0] * self.kld_s) self.kld_cost = T.mean(self.kld_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs self.nll_cost = T.mean(self.nll_costs) self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel() self.nll_bound = T.mean(self.nll_bounds) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost ############################## # CONSTRUCT A PER-TRIAL COST # ############################## self.obs_costs = self.nll_costs + self.kld_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.joint_updates = get_adam_updates(params=self.joint_params, \ grads=self.joint_grads, 
alpha=self.lr, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) for k, v in self.scan_updates.items(): self.joint_updates[k] = v # Construct theano functions for training and diagnostic computations print("Compiling cost computer...") self.compute_raw_costs = self._construct_raw_costs() print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling sequence sampler...") self.sequence_sampler = self._construct_sequence_sampler() # make easy access points for some interesting parameters #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W return def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. """ zero_ary = np.zeros((1, )) # set learning rate new_lr = zero_ary + lr self.lr.set_value(to_fX(new_lr)) # set momentums (use first and second order "momentum") new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0): """ Set the relative weight of prior KL-divergence vs. data likelihood. """ zero_ary = np.zeros((1, )) new_lam = zero_ary + lam_kld_p self.lam_kld_p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q self.lam_kld_q.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_g self.lam_kld_g.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_s self.lam_kld_s.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1, )) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. 
""" if (switch_val < 0.5): switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1, )) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def _construct_zi_zmuv(self, xo): """ Construct the necessary ZMUV gaussian samples for generating trajectories from this WalkoutModel, for input matrix xo. """ zi_zmuv = self.rng.normal( \ size=(self.total_steps, xo.shape[0], self.z_dim), \ avg=0.0, std=1.0, dtype=theano.config.floatX) return zi_zmuv def _construct_rev_masks(self, xo): """ Compute the sequential revelation masks for the input batch in xo. -- We need to construct mask sequences for both p and q. """ if self.use_rev_masks: # make batch copies of self.rev_masks_p and self.rev_masks_q pmasks = self.rev_masks_p.dimshuffle(0, 'x', 1).repeat(xo.shape[0], axis=1) qmasks = self.rev_masks_q.dimshuffle(0, 'x', 1).repeat(xo.shape[0], axis=1) else: pm_list = [] qm_list = [] # make a zero mask that does nothing zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1]) # generate independently sampled masks for each revelation block for rb in self.rev_sched: # make a random binary mask with ones at rate rb[1] rand_vals = self.rng.uniform( \ size=(1, xo.shape[0], xo.shape[1]), \ low=0.0, high=1.0, dtype=theano.config.floatX) rand_mask = rand_vals < rb[1] # append the masks for this revleation block to the mask lists # # the guide policy (in q) gets to peek at the values that will be # revealed to the primary policy (in p) for the entire block. The # primary policy only gets to see these values at end of the final # step of the block. Within a given step, values are revealed to q # at the beginning of the step, and to p at the end. # # e.g. in a revelation block with only a single step, the guide # policy sees the values at the beginning of the step, which allows # it to guide the step. the primary policy only gets to see the # values at the end of the step. # # i.e. 
a standard variational auto-encoder is equivalent to a # sequential revelation and refinement model with only one # revelation block, which has one step and a reveal rate of 1.0. # for refine_step in range(rb[0] - 1): pm_list.append(zero_mask) qm_list.append(rand_mask) pm_list.append(rand_mask) qm_list.append(rand_mask) # concatenate each mask list into a 3-tensor pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX') qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX') return [pmasks, qmasks] def _construct_nll_costs(self, si, xo, nll_mask): """ Construct the negative log-likelihood part of free energy. -- only check NLL where nll_mask == 1 """ xh = self._from_si_to_x(si) if self.x_type == 'bernoulli': ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask) else: ll_costs = log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar, mask=nll_mask) nll_costs = -ll_costs.flatten() return nll_costs def _construct_kld_s(self, s_i, s_j): """ Compute KL(s_i || s_j) -- assuming bernoullish outputs """ x_i = self._from_si_to_x(s_i) x_j = self._from_si_to_x(s_j) kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + \ ((1.0 - x_i) * (T.log(1.0-x_i) - T.log(1.0-x_j))) sum_kld = T.sum(kld_s, axis=1) return sum_kld def _construct_kld_costs(self, p=1.0): """ Construct the policy KL-divergence part of cost to minimize. """ kld_pis = [] kld_qis = [] kld_gis = [] kld_sis = [] s0 = 0.0 * self.si[0] + self.s0 for i in range(self.total_steps): kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1)) kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1)) kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1)) if i == 0: kld_sis.append(self._construct_kld_s(self.si[i], s0)) else: kld_sis.append( self._construct_kld_s(self.si[i], self.si[i - 1])) # compute the batch-wise costs kld_pi = sum(kld_pis) kld_qi = sum(kld_qis) kld_gi = sum(kld_gis) kld_si = sum(kld_sis) return [kld_pi, kld_qi, kld_gi, kld_si] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. 
for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params]) return param_reg_cost def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # construct values to output nll = self.nll_costs.flatten() kld = self.kld_q.flatten() # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function(inputs=[ xo ], \ outputs=[nll, kld], \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.scan_updates, \ on_unused_input='ignore') # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XO, sample_count=20, use_guide_policy=True): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XO.shape[0], )) kld_sum = np.zeros((XO.shape[0], )) for i in range(sample_count): result = fe_term_sample(XO) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) if not use_guide_policy: # no KLd if samples are from the primary policy... mean_kld = 0.0 * mean_kld return [mean_nll, mean_kld] return fe_term_estimator def _construct_raw_costs(self): """ Construct all the raw, i.e. not weighted by any lambdas, costs. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # compile theano function for computing the costs all_step_costs = [ self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g ] cost_func = theano.function(inputs=[ xo ], \ outputs=all_step_costs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.scan_updates, \ on_unused_input='ignore') # make a function for computing batch-based estimates of costs. # _step_nlls: the expected NLL cost for each step # _step_klds: the expected KL(q||p) cost for each step # _kld_q2p: the expected KL(q||p) cost for each latent dim # _kld_p2q: the expected KL(p||q) cost for each latent dim # _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim def raw_cost_computer(XO): _all_costs = cost_func(to_fX(XO)) _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0) _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0) _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0) _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1) _step_klds = to_fX(np.asarray([k for k in _step_klds])) _step_nlls = np.mean(_all_costs[0], axis=1) _step_nlls = to_fX(np.asarray([k for k in _step_nlls])) results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g] return results return raw_cost_computer def _construct_train_joint(self): """ Construct theano function to train all networks jointly. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_bound, self.nll_cost, \ self.kld_cost, self.reg_cost, self.obs_costs] # compile the theano function func = theano.function(inputs=[ xo ], \ outputs=outputs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.joint_updates, \ on_unused_input='ignore') return func def _construct_sequence_sampler(self): """ Construct theano function to train all networks jointly. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function states = [self._from_si_to_x(self.s0_full)] + \ [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)] masks = [self.m0_full ] + [self.mi_p[i] for i in range(self.total_steps)] outputs = states + masks # compile the theano function func = theano.function(inputs=[ xo ], \ outputs=outputs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.joint_updates, \ on_unused_input='ignore') # visualize trajectories generated by the model def sample_func(XO, use_guide_policy=False): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # get belief states and masks generated by the scan loop scan_vals = func(to_fX(XO)) step_count = self.total_steps + 1 seq_shape = (step_count, XO.shape[0], XO.shape[1]) xm_seq = np.zeros(seq_shape).astype(theano.config.floatX) xi_seq = np.zeros(seq_shape).astype(theano.config.floatX) 
mi_seq = np.zeros(seq_shape).astype(theano.config.floatX) for i in range(step_count): _xi = scan_vals[i] _mi = scan_vals[i + step_count] _xm = (_mi * XO) + ((1.0 - _mi) * _xi) xm_seq[i, :, :] = _xm xi_seq[i, :, :] = _xi mi_seq[i, :, :] = _mi # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return [xm_seq, xi_seq, mi_seq] return sample_func def save_to_file(self, f_name=None): """ Dump important stuff to a Python pickle, so that we can reload this model later. """ assert (not (f_name is None)) f_handle = file(f_name, 'wb') # dump the dict self.params, which just holds "simple" python values cPickle.dump(self.params, f_handle, protocol=-1) # make a copy of self.shared_param_dicts, with numpy arrays in place # of the theano shared variables numpy_param_dicts = {} for key in self.shared_param_dicts: numpy_ary = self.shared_param_dicts[key].get_value(borrow=False) numpy_param_dicts[key] = numpy_ary # dump the numpy version of self.shared_param_dicts to pickle file cPickle.dump(numpy_param_dicts, f_handle, protocol=-1) # get numpy dicts for each of the "child" models that we must save child_model_dicts = {} child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict() child_model_dicts[ 'p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict() child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict() child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict() # dump the numpy child model dicts to the pickle file cPickle.dump(child_model_dicts, f_handle, protocol=-1) f_handle.close() return
class MultiStageModel(object):
    """
    Controller for training a multi-step iterative refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_s0_given_z: InfNet for initializing "canvas" state
        p_hi_given_si: InfNet for hi given si
        p_sip1_given_si_hi: HydraNet for sip1 given si and hi
        q_z_given_x: InfNet for z given x
        q_hi_given_x_si: InfNet for hi given x and si
        obs_dim: dimension of the observations to generate
        z_dim: dimension of the "initial" latent space
        h_dim: dimension of the "primary" latent space
        ir_steps: number of "iterative refinement" steps to perform
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: optional dict holding already-created shared
                variables under the keys 'p_z_mean', 'p_z_logvar' and
                'obs_logvar'. When None, fresh shared variables are created
                and stored in self.shared_param_dicts.

    After construction the compiled helpers are available as
    self.compute_raw_klds, self.train_joint, self.compute_fe_terms,
    self.sample_from_prior and self.sample_from_input.
    """

    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_s0_given_z=None,
                 p_hi_given_si=None,
                 p_sip1_given_si_hi=None,
                 q_z_given_x=None,
                 q_hi_given_x_si=None,
                 obs_dim=None,
                 z_dim=None, h_dim=None,
                 ir_steps=4, params=None,
                 shared_param_dicts=None):
        # setup a rng for this model, seeded from the caller's rng
        self.rng = RandStream(rng.randint(100000))
        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        def bounded_sigmoid(x):
            # the tanh keeps the sigmoid's argument inside (-20, 20)
            return T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))

        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or
                   (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = bounded_sigmoid
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = bounded_sigmoid
        if self.x_type == 'bernoulli':
            # bernoulli outputs always use the sigmoid transform,
            # whatever params['obs_transform'] says
            self.obs_transform = bounded_sigmoid
        self.shared_param_dicts = shared_param_dicts
        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps
        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi
        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3()  # for ZMUV Gaussian samples to use in scan
        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary,
                                          name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX(np.zeros((self.z_dim,)))
            self.p_z_mean = theano.shared(value=init_vec,
                                          name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec,
                                            name='msm_p_z_logvar')
            # obs_logvar is a one-element vector (initialised from zero_ary),
            # not one entry per observation dimension
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='msm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction cost. NOTE: despite
        # its name, log_prob_func returns the NEGATIVE log-likelihood.
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                (-1.0 * log_prob_gaussian2(xo, xh,
                                           log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape,
                                    low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
            self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                      self.p_z_mean, self.p_z_logvar)
        # log_prob_func already negates the log-likelihood, so it must not
        # be negated a second time here (matches nlli in ir_step_func)
        self.init_nlls = self.log_prob_func(self.x_out,
                                            self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1)  # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get the conditional prior over hi, given the current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply(
                sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply(
                T.horizontal_stack(grad_ll, sim1_obs),
                do_samples=False)
            # both samples reuse the same ZMUV noise
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ((self.train_switch[0] * hi_q) +
                  ((1.0 - self.train_switch[0]) * hi_p))

            # the state-update network is applied to hi only
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar,
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar,
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        # only the state is recurrent; the three costs are plain outputs
        init_values = [self.s0, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            ir_step_func,
            outputs_info=init_values,
            sequences=self.hi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary,
                                         name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary,
                                         name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
            self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
            self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
            ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final refinement step is penalised
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
            self.reg_cost

        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        # Construct the updates for the generator and inferencer networks
        self.q_updates = get_adam_updates(
            params=self.q_params,
            grads=self.joint_grads, alpha=self.lr_1,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(
            params=self.p_params,
            grads=self.joint_grads, alpha=self.lr_2,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        self.joint_updates.update(self.q_updates)
        self.joint_updates.update(self.p_updates)
        # add scan updates, which seem to be required
        self.joint_updates.update(self.scan_updates)

        # Construct a function for jointly training the generator/inferencer
        # (the compile order below also fixes the order in which symbolic
        # random streams are drawn from self.rng, so keep it stable)
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()

    @staticmethod
    def _set_shared_scalar(shared_var, value):
        """
        Write value into shared_var as a one-element array passed
        through to_fX.
        """
        shared_var.set_value(to_fX(np.zeros((1,)) + value))

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01,
                       mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.

        lr_1 / lr_2 are the step sizes used for the q / p parameter groups;
        mom_1 / mom_2 are passed to the updates as beta1 / beta2.
        """
        # set learning rates
        self._set_shared_scalar(self.lr_1, lr_1)
        self._set_shared_scalar(self.lr_2, lr_2)
        # set momentums
        self._set_shared_scalar(self.mom_1, mom_1)
        self._set_shared_scalar(self.mom_2, mom_2)

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self._set_shared_scalar(self.lam_nll, lam_nll)

    def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.
        """
        self._set_shared_scalar(self.lam_kld_z, lam_kld_z)
        self._set_shared_scalar(self.lam_kld_q2p, lam_kld_q2p)
        self._set_shared_scalar(self.lam_kld_p2q, lam_kld_p2q)

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self._set_shared_scalar(self.lam_l2w, lam_l2w)

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (below 0.5) or 1.0 (otherwise). With
        1.0 the refinement steps use samples from q, with 0.0 from p.
        """
        switch_val = 0.0 if (switch_val < 0.5) else 1.0
        self._set_shared_scalar(self.train_switch, switch_val)

    def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0):
        """
        Set the weight for shaping penalty on conditional priors over zt.

        1.0 uses only the plain (p=1) KLd costs, 0.0 only the squared
        (p=2) ones.
        """
        self._set_shared_scalar(self.lam_kld_l1l2, lam_kld_l1l2)

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the probability with which entries of x_in are dropped (masked
        to zero) before being passed to q_z_given_x.
        """
        self._set_shared_scalar(self.drop_rate, drop_rate)

    def _construct_zmuv_samples(self, xi, br):
        """
        Construct the necessary (symbolic) samples for computing through this
        MultiStageModel for input (symbolic) matrix X.

        Returns (z_zmuv, hi_zmuv): standard-normal samples with
        xi.shape[0]*br rows, of width z_dim and (per step) h_dim.
        """
        z_zmuv = self.rng.normal(
            size=(xi.shape[0]*br, self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        hi_zmuv = self.rng.normal(
            size=(self.ir_steps, xi.shape[0]*br, self.h_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return z_zmuv, hi_zmuv

    def _construct_nll_costs(self, si, xo):
        """
        Construct the negative log-likelihood part of free energy.
        """
        xh = self.obs_transform(si)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar)
        nll_costs = -ll_costs
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the posterior KL-divergence part of cost to minimize.

        Every per-dimension KLd is raised to the power p before being
        summed over axis 1. Returns
        [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q], where the hi terms
        are additionally summed over the refinement steps.
        """
        # compute the batch-wise costs, accumulated over refinement steps
        kld_hi_q2p = sum([T.sum(self.kldi_q2p[i]**p, axis=1, keepdims=True)
                          for i in range(self.ir_steps)])
        kld_hi_p2q = sum([T.sum(self.kldi_p2q[i]**p, axis=1, keepdims=True)
                          for i in range(self.ir_steps)])
        # construct KLd cost for the distributions over z
        kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                  self.p_z_mean, self.p_z_logvar)
        kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                  self.q_z_mean, self.q_z_logvar)
        kld_z_q2p = T.sum(kld_z_q2ps**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(kld_z_p2qs**p, axis=1, keepdims=True)
        return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.

        Returns the sum of squared entries over all of self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The returned function takes (xi, xo, br), repeats every row of xi
        and xo br times, applies self.joint_updates and returns
        [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        br = T.lscalar()
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        _, hi_zmuv = self._construct_zmuv_samples(xi, br)
        func = theano.function(
            inputs=[xi, xo, br],
            outputs=outputs,
            givens={self.x_in: xi.repeat(br, axis=0),
                    self.x_out: xo.repeat(br, axis=0),
                    self.hi_zmuv: hi_zmuv},
            updates=self.joint_updates)
        return func

    def _construct_raw_klds(self):
        """
        Construct function for computing KLd per latent dimension.

        The returned function takes (XI, XO) and returns
        [init_klds, kld_q2p, kld_p2q]; the latter two are averaged over
        axis 1 and summed over axis 0 of the per-step KLds.
        """
        # gather step-wise costs into a single list
        all_step_costs = [self.init_klds, self.kldi_q2p, self.kldi_p2q]
        # compile theano function for computing all relevant costs
        inputs = [self.x_in, self.x_out, self.hi_zmuv]
        cost_func = theano.function(inputs=inputs, outputs=all_step_costs,
                                    updates=self.scan_updates)

        def raw_kld_computer(XI, XO):
            # the noise is drawn on the numpy side here, not symbolically
            hi_zmuv = to_fX(
                npr.randn(self.ir_steps, XI.shape[0], self.h_dim))
            _all_costs = cost_func(XI, XO, hi_zmuv)
            _init_klds = _all_costs[0]
            _kld_q2p = np.sum(
                np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(
                np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            results = [_init_klds, _kld_q2p, _kld_p2q]
            return results
        return raw_kld_computer

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned function takes (XI, XO, sample_count) and returns
        [mean_nll, mean_kld], each averaged over sample_count independent
        evaluations.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        # construct values to output
        nll = self.nlli[-1]
        kld = self.kld_z.flatten() + self.kld_hi_q2p.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xi, xo],
            outputs=[nll, kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.hi_zmuv: hi_zmuv},
            updates=self.scan_updates)

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_sum = np.zeros((XI.shape[0],))
            for _ in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this MultiStageModel. This function returns
        the full sequence of "partially completed" examples.
        """
        z_sym = T.matrix()
        x_sym = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
        sample_func = theano.function(
            inputs=[z_sym, x_sym], outputs=oputs,
            givens={self.z: z_sym,
                    self.x_in: T.zeros_like(x_sym),
                    self.x_out: T.zeros_like(x_sym),
                    self.hi_zmuv: hi_zmuv},
            updates=self.scan_updates)

        def prior_sampler(samp_count):
            x_samps = to_fX(np.zeros((samp_count, self.obs_dim)))
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                z_samps = to_fX(npr.randn(samp_count, self.z_dim))
                model_samps = sample_func(z_samps, x_samps)
            finally:
                # set model back to either training or generation mode,
                # even if sampling failed
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler

    def _construct_sample_from_input(self):
        """
        Construct a function for drawing samples from the distribution
        generated by this MultiStageModel, conditioned on some inputs to the
        initial encoder stage (i.e. self.q_z_given_x). This returns the full
        sequence of "partially completed" examples.
        """
        xi = T.matrix()
        xo = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        sample_func = theano.function(
            inputs=[xi, xo], outputs=oputs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.hi_zmuv: hi_zmuv},
            updates=self.scan_updates)

        def conditional_sampler(XI, XO=None, guided_decoding=False):
            XI = to_fX(XI)
            if XO is None:
                XO = XI
            XO = to_fX(XO)
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if guided_decoding:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's generative policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO)
            finally:
                # set model back to either training or generation mode,
                # even if sampling failed
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return conditional_sampler
class HiddenLayer(object):
    """
    A single fully-connected layer with optional input noise, input dropout,
    pre-activation noise and maxout pooling.

    Parameters:
        rng: numpy random state; seeds the layer's symbolic random stream
             and draws the initial weights
        input: symbolic input to this layer
        in_dim: number of input features
        out_dim: number of output units (after pooling, if any)
        activation: activation function to apply; if not given, a ReLU is
                    used when pool_size <= 1 and maxout otherwise
        pool_size: number of linear filters pooled per output unit
        drop_rate: probability of dropping each input element
        input_noise: scale of gaussian noise added to the input
        bias_noise: scale of gaussian noise added to the pre-activation
        W, b: optional existing shared weight/bias variables to reuse
        use_bias: whether the bias is added and included in self.params
        name: prefix for the names of newly created shared variables
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None,
                 use_bias=True, name=""):
        # Setup a shared random generator for this layer
        self.srng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + \
                (input_noise * self.srng.normal(size=input.shape,
                                                dtype=theano.config.floatX))
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps pool_count an int under Python 3 as well, so
        # it stays usable as a loop bound
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            W_init = np.asarray(
                self._init_filters(rng, self.in_dim, self.filt_count,
                                   self.pool_size),
                dtype=theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,),
                              dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        if use_bias:
            self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        else:
            self.linear_output = T.dot(self.noisy_input, self.W)

        # Add noise to the pre-activation features (if desired); the noise
        # term is always part of the graph, scaled by bias_noise
        self.noisy_linear = self.linear_output + \
            (bias_noise * self.srng.normal(size=self.linear_output.shape,
                                           dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
            self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
            self.output.shape[1]

        # Conveniently package layer parameters
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]

    @staticmethod
    def _init_filters(rng, in_dim, filt_count, pool_size):
        """
        Draw an (in_dim, filt_count) array of initial filter weights.

        With pool_size <= 1 the filters are i.i.d. gaussian with scale 0.04.
        Otherwise filters are generated in groups of pool_size that share a
        common component, so that intra-group correlations are stronger
        than inter-group correlations (to encourage pooling over similar
        filters).
        """
        if pool_size <= 1:
            # Generate random initial filters in a typical way
            return 0.04 * rng.standard_normal(size=(in_dim, filt_count))
        pool_count = filt_count // pool_size
        filters = []
        for _ in range(pool_count):
            # shared component of this pooling group
            g_filt = 0.01 * rng.standard_normal(size=(in_dim, 1))
            for _ in range(pool_size):
                f_filt = g_filt + (0.005 * rng.standard_normal(
                    size=(in_dim, 1)))
                filters.append(f_filt)
        return np.hstack(filters)

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        noise_rnd = self.srng.uniform(input.shape, low=0.0, high=1.0,
                                      dtype=theano.config.floatX)
        drop_mask = noise_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.srng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                    dtype=theano.config.floatX)
        return P_nz
class HiddenLayer(object):
    """
    Fully-connected layer with adjustable noise and optional maxout pooling.

    The constructor builds the symbolic graph for the layer. The noise
    scales and the drop rate are kept in shared variables, so the noise ops
    are always present in the graph and are scaled by the current values of
    those shared variables.

    Parameters:
        rng: generator providing randint(); used to seed this layer's
             symbolic random stream (presumably a numpy RandomState)
        input: symbolic input to the layer
        in_dim: dimension of the layer input
        out_dim: dimension of the layer output (after any maxout pooling)
        activation: activation function. Defaults to relu_actfun. Ignored
                    when pool_size > 1, in which case maxout is used.
        pool_size: number of filters per maxout pool (<= 1 disables maxout)
        drop_rate: initial probability of dropping each input element
        input_noise: initial scale of gaussian noise added to the input
        bias_noise: initial scale of gaussian noise added pre-activation
        W, b: optional shared weights and biases (created when None)
        b_in, s_in: optional shared input shift and scale (created when None)
        name: prefix for the names of shared variables created here
        W_scale: gain passed to ortho_matrix when initializing W
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, b_in=None, s_in=None, name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))
        # Setup shift and scale params for the input. They are stored and
        # listed in self.params, but the graph built below uses the input
        # directly: the softplus(s_in) * (input + b_in) transform is disabled.
        if b_in is None:
            # input shifts are initialized to zero
            ary = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # softplus(0.541325) ~= 1.0, i.e. unit scale under the (disabled)
            # softplus-based rescaling
            ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in
        # use the input directly
        self.clean_input = input
        # Keep noise settings in 1-element shared arrays, so they can be
        # changed without rebuilding the graph
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.input_noise = theano.shared(
            value=(zero_ary + input_noise),
            name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(
            value=(zero_ary + bias_noise),
            name="{0:s}_bias_noise".format(name))
        # NOTE: this variable was previously (mis)named "<name>_bias_noise"
        self.drop_rate = theano.shared(
            value=(zero_ary + drop_rate),
            name="{0:s}_drop_rate".format(name))
        # Add gaussian noise to the input (scaled by self.input_noise)
        self.fuzzy_input = self.clean_input + (
            self.input_noise[0] *
            self.rng.normal(size=self.clean_input.shape, avg=0.0, std=1.0,
                            dtype=theano.config.floatX))
        # Apply masking noise to the input (controlled by self.drop_rate)
        self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                 self.drop_rate[0])
        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps this an integer count on Python 2 and 3
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)
        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count),
                                  gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        # Set layer weights and biases
        self.W = W
        self.b = b
        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        # Add noise to the pre-activation features (scaled by self.bias_noise)
        self.noisy_linear = self.linear_output + (
            self.bias_noise[0] *
            self.rng.normal(size=self.linear_output.shape, avg=0.0,
                            std=1.0, dtype=theano.config.floatX))
        # Apply activation function
        self.output = self.activation(self.noisy_linear)
        # Mean squared pre-activation, normalized by the output size
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size
        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class HiddenLayer(object):
    """
    Fully-connected layer with optional noise and optional maxout pooling.

    The constructor builds the symbolic graph for the layer. Noise ops are
    only added to the graph when the corresponding level exceeds a small
    threshold (1e-4 for input_noise and drop_rate, 1e-3 for bias_noise).

    Parameters:
        rng: generator providing randint() and normal(); seeds this layer's
             symbolic random stream and draws the initial weights
             (presumably a numpy RandomState)
        input: symbolic input to the layer
        in_dim: dimension of the layer input
        out_dim: dimension of the layer output (after any maxout pooling)
        activation: activation function. When not given, relu_actfun is
                    used for pool_size <= 1 and maxout_actfun otherwise.
        pool_size: number of filters per maxout pool (<= 1 disables maxout)
        drop_rate: probability of dropping each input element
        input_noise: std of gaussian noise added to the input
        bias_noise: std of gaussian noise added to the pre-activation
        W, b: optional shared weights and biases (created when None)
        name: prefix for the names of shared variables created here
        W_scale: multiplier applied to the randomly initialized weights
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division: pool_count is passed to range() below, which
        # rejects the float that true division produces on Python 3.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        elif self.pool_size <= 1:
            self.activation = relu_actfun
        else:
            self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = 0.01 * np.asarray(
                    rng.normal(size=(self.in_dim, self.filt_count)),
                    dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                f_size = (self.in_dim, 1)
                for g_num in range(self.pool_count):
                    # shared per-group component
                    g_filt = 0.01 * rng.normal(size=f_size)
                    for f_num in range(self.pool_size):
                        # smaller per-filter perturbation of the group filter
                        f_filt = g_filt + 0.003 * rng.normal(size=f_size)
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        if bias_noise > 1e-3:
            self.noisy_linear = self.linear_output + self.rng.normal(
                size=self.linear_output.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            self.noisy_linear = self.linear_output

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
            self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
            self.output.shape[1]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class GenUniModule(object):
    """
    Generator module: random values --> linear map --> optional batch norm
    --> optional relu.

    Params:
        rand_dim: dimension of the random values fed to the linear map
        out_dim: dimension of the module output
        apply_bn: whether to batch normalize the linear map's output
        init_func: initializer for the weights (inits.Normal(scale=0.02)
                   is used when this is None)
        rand_type: 'normal' draws the random values from a gaussian; any
                   other value draws them uniformly from [-1, 1]
        final_relu: whether to apply relu as the last step
        mod_name: prefix for the names given to this module's parameters
    """

    def __init__(self, rand_dim, out_dim,
                 apply_bn=True,
                 init_func=None,
                 rand_type='normal',
                 final_relu=True,
                 mod_name='dm_uni'):
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # the random stream for this module always uses the same seed
        self.rng = RandStream(123)
        # fall back to a small gaussian initializer if none was given
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        # create the parameters for this module
        self._init_params()

    def _init_params(self):
        """
        Create the weights (and batch norm gain/bias) for this module.
        """
        w_name = "{}_w1".format(self.mod_name)
        self.w1 = self.init_func((self.rand_dim, self.out_dim), w_name)
        self.params = [self.w1]
        if not self.apply_bn:
            return
        # gains and biases are only needed when the output is batch normed
        make_gain = inits.Normal(loc=1., scale=0.02)
        make_bias = inits.Constant(c=0.)
        self.g1 = make_gain(self.out_dim, "{}_g1".format(self.mod_name))
        self.b1 = make_bias(self.out_dim, "{}_b1".format(self.mod_name))
        self.params += [self.g1, self.b1]

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run this module. At least one of batch_size and rand_vals must be
        given; rand_vals takes precedence, and fresh random values are
        sampled only when rand_vals is None.
        """
        assert (batch_size is not None) or (rand_vals is not None), \
            "need either batch_size or rand_vals"
        if rand_vals is not None:
            # reshape the given values to (rows, rand_dim)
            rand_vals = rand_vals.reshape((rand_vals.shape[0], self.rand_dim))
        elif self.rand_type == 'normal':
            rand_vals = self.rng.normal(size=(batch_size, self.rand_dim),
                                        avg=0.0, std=1.0,
                                        dtype=theano.config.floatX)
        else:
            rand_vals = self.rng.uniform(size=(batch_size, self.rand_dim),
                                         low=-1.0, high=1.0,
                                         dtype=theano.config.floatX)
        # linear transform, then the optional post-processing steps
        result = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            result = batchnorm(result, g=self.g1, b=self.b1)
        if self.final_relu:
            result = relu(result)
        return result


##############
# EYE BUFFER #
##############
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a theano 4-tensor shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1,
    filt_def_2).

    pool_def should be a 2-tuple like (pool_dim, pool_stride).

    Parameters:
        rng: generator providing randint() and normal(); seeds this layer's
             symbolic random stream and draws the initial filters
             (presumably a numpy RandomState)
        input: symbolic input, laid out as described above
        filt_def: filter bank shape, as described above
        pool_def: pooling size and stride, as described above
        activation: activation function (relu_actfun when not given)
        drop_rate: probability of dropping each input element
        input_noise: std of gaussian noise added to the input
        bias_noise: std of gaussian noise added to the convolution output
        W, b: optional shared filters and biases. Fresh ones are created
              when these are None.
        name: prefix for the names of shared variables created here
        W_scale: multiplier applied to the randomly initialized filters

    Noise ops are only added to the graph when the corresponding level
    exceeds 1e-4.
    """

    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0.,
                 bias_noise=0., W=None, b=None, name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        if activation:
            self.activation = activation
        else:
            self.activation = relu_actfun

        # Use the given filters/biases when provided (previously the W and
        # b arguments were accepted but silently ignored); otherwise make
        # fresh ones.
        if W is None:
            # initialize weights with random weights
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        if b is None:
            # the bias is a 1D tensor -- one bias per output feature map
            b_init = np.zeros((filt_def[0],),
                              dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.W = W
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add noise to the convolution output (if desired)
        if bias_noise > 1e-4:
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + self.b.dimshuffle(
            'x', 0, 'x', 'x')
        # no separate noise-free path is built; both names share one graph
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class DAELayer(object):
    """
    Denoising autoencoder layer whose decoder reuses the transposed
    encoder weights.

    Params:
        rng: generator providing randint() and standard_normal(); seeds
             this layer's symbolic random stream and draws initial weights
        clean_input: symbolic reconstruction target
        fuzzy_input: symbolic input that gets masked and then encoded
        in_dim: dimension of the visible (input) layer
        out_dim: dimension of the hidden layer
        activation: activation function for the hidden layer
        input_noise: probability of zeroing each element of fuzzy_input
        W: optional shared weight matrix (created when None)
        b_h: optional shared hidden biases (created when None)
        b_v: optional shared visible biases (created when None)
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None):
        # seed a symbolic random stream for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        # keep the clean target, and corrupt the input with masking noise
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # record basic layer properties
        self.activation = activation
        self.in_dim = in_dim
        self.out_dim = out_dim

        # create any parameters that the caller did not provide
        if W is None:
            weights = np.asarray(
                0.01 * rng.standard_normal(size=(in_dim, out_dim)),
                dtype=theano.config.floatX)
            W = theano.shared(value=weights, name='W')
        if b_h is None:
            hid_zeros = np.zeros((out_dim,), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_zeros, name='b_h')
        if b_v is None:
            vis_zeros = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_zeros, name='b_v')

        # expose the parameters, individually and as a list to optimize
        self.W, self.b_h, self.b_v = W, b_h, b_v
        self.params = [self.W, self.b_h, self.b_v]

    def compute_costs(self, lam_l1=None):
        """
        Build the reconstruction cost and the activation sparsity cost.

        lam_l1 is indexed at [0] to get the sparsity weight. Returns the
        list [recon_cost, sparse_cost].
        """
        # only the weights are perturbed; the hidden bias is used as-is
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        vis_acts, hid_acts = self._compute_activations(
            self.noisy_input, noisy_W, hid_bias, self.b_v)
        # squared reconstruction error, divided by the first input dimension
        sq_err = T.sum((self.clean_input - vis_acts)**2.0)
        recon_cost = sq_err / self.clean_input.shape[0]
        # sparsity penalty over both rows and columns of hidden activations
        row_term = T.sum(abs(row_normalize(hid_acts))) / hid_acts.shape[0]
        col_term = T.sum(abs(col_normalize(hid_acts))) / hid_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_term + col_term)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Encode X: apply the activation to the affine map of X."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Encode X, then decode with W transposed; gives [A_v, A_h]."""
        hidden = self._compute_hidden_acts(X, W, b_h)
        visible = T.dot(hidden, W.T) + b_v
        return [visible, hidden]

    def _noisy_params(self, P, noise_lvl=0.):
        """Add gaussian noise to P, unless noise_lvl is at most 1e-3."""
        if not (noise_lvl > 1e-3):
            return P
        return P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)

    def _get_noisy_input(self, input, p):
        """Zero elements of input with probability p (no rescaling)."""
        uniform_draws = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                         dtype=theano.config.floatX)
        keep_mask = uniform_draws > p
        return input * keep_mask
class HiddenLayer(object):
    """
    Hidden layer configured by a dict; fully-connected or convolutional.

    Unlike a layer that builds its graph at construction time, this class
    only creates parameters in the constructor. Call apply() to build the
    feedforward graph for a particular input.

    Parameters:
        rng: generator providing randint(); used to seed this layer's
             symbolic random stream (presumably a numpy RandomState)
        layer_description: dict of layer options. Required keys are
            'layer_type' ('fc' or 'conv'), 'in_chans', 'out_chans' and
            'activation'. Optional keys are 'filt_dim', 'conv_stride'
            ('double', 'single' or 'half'), 'apply_bn', 'drop_rate',
            'shape_func_in' and 'shape_func_out'.
        W, b: optional shared weights and biases (created when None)
        b_in, s_in: optional shared shift and scale parameters passed to
                    batchnorm (created when None)
        name: prefix for the names of shared variables created here
        W_scale: gain for weight initialization. For 'fc' layers the
                 special value 'xg' selects glorot_matrix initialization.
    """

    def __init__(self, rng, layer_description,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        # parse options from layer_description
        assert 'layer_type' in layer_description, \
            "layer_description must provide layer_type"
        assert ((layer_description['layer_type'] == 'fc') or
                (layer_description['layer_type'] == 'conv')), \
            "layer_type must be fc or conv"
        self.layer_description = layer_description
        self.layer_type = layer_description['layer_type']
        self.in_chans = layer_description['in_chans']
        self.out_chans = layer_description['out_chans']
        self.activation = layer_description['activation']
        self.filt_dim = layer_description.get('filt_dim', None)
        self.conv_stride = layer_description.get('conv_stride', None)
        self.apply_bn = layer_description.get('apply_bn', False)
        self.drop_rate = layer_description.get('drop_rate', 0.0)
        self.shape_func_in = layer_description.get('shape_func_in', None)
        self.shape_func_out = layer_description.get('shape_func_out', None)
        # setup additional params
        self.rng = RandStream(rng.randint(1000000))
        self.W_scale = W_scale
        self.name = name
        if self.layer_type == 'fc':
            self.W, self.b, self.b_in, self.s_in = \
                self._init_fc_params(W=W, b=b, b_in=b_in, s_in=s_in)
        else:
            self.W, self.b, self.b_in, self.s_in = \
                self._init_conv_params(W=W, b=b, b_in=b_in, s_in=s_in)
        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = {
            'W': self.W,
            'b': self.b,
            'b_in': self.b_in,
            's_in': self.s_in
        }

    def _init_bn_params(self, b_in=None, s_in=None):
        """
        Create the batch normalization shift/scale params that are missing.
        """
        if b_in is None:
            # batch normalization reshifts are initialized to zero
            ary = np.zeros((self.out_chans,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary,
                                 name="{0:s}_b_in".format(self.name))
        if s_in is None:
            # batch normalization rescales are initialized to zero
            ary = np.zeros((self.out_chans,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary,
                                 name="{0:s}_s_in".format(self.name))
        return b_in, s_in

    def _init_fc_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Initialize all parameters that may be required for feedforward
        through a fully-connected hidden layer.
        """
        # Get some random initial weights and biases, if not given
        if W is None:
            W_shape = (self.in_chans, self.out_chans)
            if self.W_scale == 'xg':
                W_np = glorot_matrix(W_shape)
            else:
                # Generate initial filters using orthogonal random trick
                W_np = ortho_matrix(shape=W_shape, gain=self.W_scale)
            W_np = W_np.astype(theano.config.floatX)
            W = theano.shared(value=W_np, name="{0:s}_W".format(self.name))
        if b is None:
            b_np = np.zeros((self.out_chans,), dtype=theano.config.floatX)
            b = theano.shared(value=b_np, name="{0:s}_b".format(self.name))
        # setup scale and bias params for after batch normalization
        b_in, s_in = self._init_bn_params(b_in=b_in, s_in=s_in)
        return W, b, b_in, s_in

    def _init_conv_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Initialize all parameters that may be required for feedforward
        through a convolutional hidden layer.
        """
        if W is None:
            W_shape = (self.out_chans, self.in_chans,
                       self.filt_dim, self.filt_dim)
            ary = npr.normal(0.0, self.W_scale * 0.02, W_shape)
            ary = ary.astype(theano.config.floatX)
            W = theano.shared(value=ary, name="{0:s}_W".format(self.name))
        if b is None:
            b_shape = (self.out_chans,)
            ary = npr.normal(0.0, 0.01, b_shape)
            ary = ary.astype(theano.config.floatX)
            b = theano.shared(value=ary, name="{0:s}_b".format(self.name))
        # setup scale and bias params for after batch normalization
        b_in, s_in = self._init_bn_params(b_in=b_in, s_in=s_in)
        return W, b, b_in, s_in

    def apply(self, input, use_drop=False):
        """
        Apply feedforward to this input, returning several partial results.

        Returns the pair (final_output, linear_output), where linear_output
        is the pre-activation (after batch normalization, if enabled).
        Raises AssertionError for an unknown layer type or stride type.
        """
        # Reshape input if a reshape command was provided
        if not (self.shape_func_in is None):
            input = self.shape_func_in(input)
        # Apply masking noise to the input (if desired)
        if use_drop:
            input = self._drop_from_input(input, self.drop_rate)
        if self.layer_type == 'fc':
            # Feedforward through fully-connected layer
            linear_output = T.dot(input, self.W) + self.b
        elif self.layer_type == 'conv':
            # Feedforward through convolutional layer, with adjustable stride
            bm = int((self.filt_dim - 1) / 2)  # use "same" mode convolutions
            if self.conv_stride == 'double':
                linear_output = dnn_conv(input, self.W, subsample=(2, 2),
                                         border_mode=(bm, bm))
            elif self.conv_stride == 'single':
                linear_output = dnn_conv(input, self.W, subsample=(1, 1),
                                         border_mode=(bm, bm))
            elif self.conv_stride == 'half':
                linear_output = deconv(input, self.W, subsample=(2, 2),
                                       border_mode=(bm, bm))
            else:
                # Raised explicitly (not via `assert False`) so the check
                # still fires when Python runs with -O.
                raise AssertionError("Unknown stride type!")
            linear_output = linear_output + \
                self.b.dimshuffle('x', 0, 'x', 'x')
        else:
            # explicit raise, for the same reason as above
            raise AssertionError("Unknown layer type!")
        # Apply batch normalization if desired
        if self.apply_bn:
            linear_output = batchnorm(linear_output, rescale=self.s_in,
                                      reshift=self.b_in, u=None, s=None)
        # Apply activation function
        final_output = self.activation(linear_output)
        # Reshape output if a reshape command was provided
        if not (self.shape_func_out is None):
            linear_output = self.shape_func_out(linear_output)
            final_output = self.shape_func_out(final_output)
        return final_output, linear_output

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input
class SRRModel(object): """ Controller for training a sequential revelation and refinement model. Parameters: rng: numpy.random.RandomState (for reproducibility) x_out: the goal state for iterative refinement p_zi_given_xi: InfNet for stochastic part of step p_sip1_given_zi: HydraNet for deterministic part of step p_x_given_si: HydraNet for transform from s-space to x-space q_zi_given_xi: InfNet for the guide policy params: REQUIRED PARAMS SHOWN BELOW x_dim: dimension of observations to construct z_dim: dimension of latent space for policy wobble s_dim: dimension of space in which to perform construction use_p_x_given_si: boolean for whether to use p_x_given_si rev_sched: list of "revelation" blocks. each block is described by the number of steps prior to revelation, and the percentage of remaining pixels to reveal. rev_masks: matrix of revelation masks. the row i provides the mask for iteration i of the srr loop. when this argument is passed, rev_sched is ignored and the revelation schedule is determined by rev_masks. 
step_type: either "add" or "jump" x_type: can be "bernoulli" or "gaussian" obs_transform: can be 'none' or 'sigmoid' """ def __init__( self, rng=None, x_out=None, p_zi_given_xi=None, p_sip1_given_zi=None, p_x_given_si=None, q_zi_given_xi=None, params=None, shared_param_dicts=None, ): # setup a rng for this SRRModel self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_dim = self.params["x_dim"] self.z_dim = self.params["z_dim"] self.s_dim = self.params["s_dim"] self.use_p_x_given_si = self.params["use_p_x_given_si"] self.step_type = self.params["step_type"] self.x_type = self.params["x_type"] if self.use_p_x_given_si: print("Constructing hypotheses indirectly in s-space...") else: print("Constructing hypotheses directly in x-space...") assert self.s_dim == self.x_dim if "obs_transform" in self.params: assert (self.params["obs_transform"] == "sigmoid") or (self.params["obs_transform"] == "none") if self.params["obs_transform"] == "sigmoid": self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == "bernoulli": self.obs_transform = lambda x: T.nnet.sigmoid(x) self.shared_param_dicts = shared_param_dicts # Deal with revelation scheduling if ("rev_masks" in self.params) and (self.params["rev_masks"] is not None): rmp = self.params["rev_masks"][0].astype(theano.config.floatX) rmq = self.params["rev_masks"][1].astype(theano.config.floatX) self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p") self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q") self.rev_sched = None self.use_rev_masks = True else: self.rev_sched = self.params["rev_sched"] self.rev_masks_p = None self.rev_masks_q = None self.use_rev_masks = False nice_nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] # "validate" the set of revelation block descriptions for rev_block in self.rev_sched: assert rev_block[0] in 
nice_nums assert (rev_block[1] >= 0.0) and (rev_block[1] <= 1.01) assert (self.x_type == "bernoulli") or (self.x_type == "gaussian") assert (self.step_type == "add") or (self.step_type == "jump") # grab handles to the relevant networks self.p_zi_given_xi = p_zi_given_xi self.p_sip1_given_zi = p_sip1_given_zi self.p_x_given_si = p_x_given_si self.q_zi_given_xi = q_zi_given_xi # record the symbolic variables that will provide inputs to the # computation graph created for this SRRModel self.x_out = x_out # target output for generation self.zi_zmuv = T.tensor3() # ZMUV gauss noise for policy wobble self.p_masks = T.tensor3() # revelation masks for primary policy self.q_masks = T.tensor3() # revelation masks for guide policy if self.use_rev_masks: self.total_steps = self.params["rev_masks"][0].shape[0] else: self.total_steps = sum([rb[0] for rb in self.rev_sched]) # setup switching variable for changing between sampling/training zero_ary = to_fX(np.zeros((1,))) self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch") self.set_train_switch(1.0) if self.shared_param_dicts is None: # initialize the parameters "owned" by this model s0_init = to_fX(np.zeros((self.s_dim,))) ss_init = to_fX(0.5 * np.ones((self.total_steps,))) self.s0 = theano.shared(value=s0_init, name="srrm_s0") self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar") self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0]) self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales") self.shared_param_dicts = {} self.shared_param_dicts["s0"] = self.s0 self.shared_param_dicts["obs_logvar"] = self.obs_logvar self.shared_param_dicts["step_scales"] = self.step_scales else: # grab the parameters required by this model from a given dict self.s0 = self.shared_param_dicts["s0"] self.obs_logvar = self.shared_param_dicts["obs_logvar"] self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0]) self.step_scales = self.shared_param_dicts["step_scales"] 
################################################################## # Setup the sequential revelation and refinement loop using scan # ################################################################## # ss: This is a sequence of scalars that will be used to rescale the # "gradient" input to the primary and guide policies. # # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be # reparametrized to sample actions from the policies. # # p_masks: This is a sequence of "unmasking" masks. When one of these # masking variables is 1, the corresponding value in self.x_out # will be "revealed" to the primary policy. Prediction error # is measured for a value only the first time it is revealed. # Once revealed, a value remains "visible" to the policy. # The final step should reveal all values. # # q_masks: This is a sequence of "unmasking" masks. These are similar # to p_masks, but control which values are revealed to the # guide policy. The guide policy masking sequence should be # constructed to stay "ahead of" the primary policy's masking # sequence. The guide policy needs to know which values will # be revealed to the primary policy so that it can focus its # reconstruction efforts on those values. Otherwise, the guide # policy will immediately reconstruct the entire target. # # si: This is the current "belief state" for each trial in the training # batch. The belief state is updated in each iteration, and passed # forward through the recurrence. # # mi_p: This is the current revelation mask for the primary policy. # # mi_q: This is the current revelation mask for the guide policy. 
# def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q): # transform the current belief state into an observation si_as_x = self._from_si_to_x(si) full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x) # get the masked belief state and gradient for primary policy xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x) grad_for_p = mi_p * full_grad # update the guide policy's revelation mask new_to_q = (1.0 - mi_q) * q_masks mip1_q = mi_q + new_to_q # get the masked belief state and gradient for guide policy # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x) xi_for_q = xi_for_p grad_for_q = mip1_q * full_grad # get samples of next zi, according to the primary policy zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply( T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False ) zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv) # get samples of next zi, according to the guide policy zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply( T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False ) zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv) # make zi samples that can be switched between zi_p and zi_q zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p) # compute relevant KLds for this step kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar) # KL(q || p) kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar) # KL(p || q) kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0) # KL(p || N(0, I)) # compute next si, given sampled zi (i.e. 
update the belief state) hydra_out = self.p_sip1_given_zi.apply(zi) si_step = hydra_out[0] if self.step_type == "jump": # jump steps always do a full swap of belief state sip1 = si_step else: # additive steps adjust the belief state like an LSTM write_gate = T.nnet.sigmoid(2.0 + hydra_out[1]) erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2]) sip1 = (erase_gate * si) + (write_gate * si_step) # update the primary policy's revelation mask new_to_p = (1.0 - mi_p) * p_masks mip1_p = mi_p + new_to_p # compute NLL only for the newly revealed values nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p) # each loop iteration produces the following values: # sip1: belief state at end of current step # mip1_p: revealed values mask to use in next step (primary) # mip1_q: revealed values mask to use in next step (guide) # nlli: NLL for values revealed at end of current step # kldi_q2p: KL(q || p) for the current step # kldi_p2q: KL(p || q) for the current step # kldi_p2g: KL(p || N(0,I)) for the current step return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g # initialize belief state to self.s0 self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0 # initialize revelation masks to 0 for all values in all trials self.m0_full = T.zeros_like(self.x_out) # setup initial values to pass to scan op outputs_init = [self.s0_full, self.m0_full, self.m0_full, None, None, None, None] sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks] # apply scan op for the sequential imputation loop self.scan_results, self.scan_updates = theano.scan( srr_step_func, outputs_info=outputs_init, sequences=sequences_init ) # grab results of the scan op. 
all values are computed for each step self.si = self.scan_results[0] # belief states self.mi_p = self.scan_results[1] # primary revelation masks self.mi_q = self.scan_results[2] # guide revelation masks self.nlli = self.scan_results[3] # NLL on newly revealed values self.kldi_q2p = self.scan_results[4] # KL(q || p) self.kldi_p2q = self.scan_results[5] # KL(p || q) self.kldi_p2g = self.scan_results[6] # KL(p || N(0,I)) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX(np.zeros((1,))) self.lr = theano.shared(value=zero_ary, name="srr_lr") # shared var momentum parameters for ADAM optimization self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1") self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2") # init parameters for controlling learning dynamics self.set_sgd_params() # init shared vars for weighting prior kld against reconstruction self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p") self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q") self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g") self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s") self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w") self.set_lam_l2w(1e-5) # grab all of the "optimizable" parameters from the base networks self.joint_params = [self.s0, self.obs_logvar, self.step_scales] self.joint_params.extend(self.p_zi_given_xi.mlp_params) self.joint_params.extend(self.p_sip1_given_zi.mlp_params) self.joint_params.extend(self.p_x_given_si.mlp_params) self.joint_params.extend(self.q_zi_given_xi.mlp_params) ################################# # 
CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0) self.kld_costs = ( (self.lam_kld_p[0] * self.kld_p) + (self.lam_kld_q[0] * self.kld_q) + (self.lam_kld_g[0] * self.kld_g) + (self.lam_kld_s[0] * self.kld_s) ) self.kld_cost = T.mean(self.kld_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs self.nll_cost = T.mean(self.nll_costs) self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel() self.nll_bound = T.mean(self.nll_bounds) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost ############################## # CONSTRUCT A PER-TRIAL COST # ############################## self.obs_costs = self.nll_costs + self.kld_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.joint_updates = get_adam_updates( params=self.joint_params, grads=self.joint_grads, alpha=self.lr, beta1=self.mom_1, beta2=self.mom_2, mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0, ) for k, v in self.scan_updates.items(): self.joint_updates[k] = v # Construct theano functions for training and diagnostic computations print("Compiling cost computer...") self.compute_raw_costs = self._construct_raw_costs() print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms 
= self._construct_compute_fe_terms() print("Compiling sequence sampler...") self.sequence_sampler = self._construct_sequence_sampler() # make easy access points for some interesting parameters # self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W return def _from_si_to_x(self, si): """ Convert the given si from s-space to x-space. """ if self.use_p_x_given_si: x_pre_trans, _ = self.p_x_given_si.apply(si) else: x_pre_trans = si x_post_trans = self.obs_transform(x_pre_trans) return x_post_trans def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. """ zero_ary = np.zeros((1,)) # set learning rate new_lr = zero_ary + lr self.lr.set_value(to_fX(new_lr)) # set momentums (use first and second order "momentum") new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0): """ Set the relative weight of prior KL-divergence vs. data likelihood. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_kld_p self.lam_kld_p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q self.lam_kld_q.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_g self.lam_kld_g.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_s self.lam_kld_s.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. 
""" if switch_val < 0.5: switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1,)) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def _construct_zi_zmuv(self, xo): """ Construct the necessary ZMUV gaussian samples for generating trajectories from this SRRModel, for input matrix xo. """ zi_zmuv = self.rng.normal( size=(self.total_steps, xo.shape[0], self.z_dim), avg=0.0, std=1.0, dtype=theano.config.floatX ) return zi_zmuv def _construct_rev_masks(self, xo): """ Compute the sequential revelation masks for the input batch in xo. -- We need to construct mask sequences for both p and q. """ if self.use_rev_masks: # make batch copies of self.rev_masks_p and self.rev_masks_q pmasks = self.rev_masks_p.dimshuffle(0, "x", 1).repeat(xo.shape[0], axis=1) qmasks = self.rev_masks_q.dimshuffle(0, "x", 1).repeat(xo.shape[0], axis=1) else: pm_list = [] qm_list = [] # make a zero mask that does nothing zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1]) # generate independently sampled masks for each revelation block for rb in self.rev_sched: # make a random binary mask with ones at rate rb[1] rand_vals = self.rng.uniform( size=(1, xo.shape[0], xo.shape[1]), low=0.0, high=1.0, dtype=theano.config.floatX ) rand_mask = rand_vals < rb[1] # append the masks for this revleation block to the mask lists # # the guide policy (in q) gets to peek at the values that will be # revealed to the primary policy (in p) for the entire block. The # primary policy only gets to see these values at end of the final # step of the block. Within a given step, values are revealed to q # at the beginning of the step, and to p at the end. # # e.g. in a revelation block with only a single step, the guide # policy sees the values at the beginning of the step, which allows # it to guide the step. the primary policy only gets to see the # values at the end of the step. # # i.e. 
a standard variational auto-encoder is equivalent to a # sequential revelation and refinement model with only one # revelation block, which has one step and a reveal rate of 1.0. # for refine_step in range(rb[0] - 1): pm_list.append(zero_mask) qm_list.append(rand_mask) pm_list.append(rand_mask) qm_list.append(rand_mask) # concatenate each mask list into a 3-tensor pmasks = T.cast(T.concatenate(pm_list, axis=0), "floatX") qmasks = T.cast(T.concatenate(qm_list, axis=0), "floatX") return [pmasks, qmasks] def _construct_nll_costs(self, si, xo, nll_mask): """ Construct the negative log-likelihood part of free energy. -- only check NLL where nll_mask == 1 """ xh = self._from_si_to_x(si) if self.x_type == "bernoulli": ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask) else: ll_costs = log_prob_gaussian2(xo, xh, log_vars=self.bounded_logvar, mask=nll_mask) nll_costs = -ll_costs.flatten() return nll_costs def _construct_kld_s(self, s_i, s_j): """ Compute KL(s_i || s_j) -- assuming bernoullish outputs """ x_i = self._from_si_to_x(s_i) x_j = self._from_si_to_x(s_j) kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + ((1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))) sum_kld = T.sum(kld_s, axis=1) return sum_kld def _construct_kld_costs(self, p=1.0): """ Construct the policy KL-divergence part of cost to minimize. """ kld_pis = [] kld_qis = [] kld_gis = [] kld_sis = [] s0 = 0.0 * self.si[0] + self.s0 for i in range(self.total_steps): kld_pis.append(T.sum(self.kldi_p2q[i] ** p, axis=1)) kld_qis.append(T.sum(self.kldi_q2p[i] ** p, axis=1)) kld_gis.append(T.sum(self.kldi_p2g[i] ** p, axis=1)) if i == 0: kld_sis.append(self._construct_kld_s(self.si[i], s0)) else: kld_sis.append(self._construct_kld_s(self.si[i], self.si[i - 1])) # compute the batch-wise costs kld_pi = sum(kld_pis) kld_qi = sum(kld_qis) kld_gi = sum(kld_gis) kld_si = sum(kld_sis) return [kld_pi, kld_qi, kld_gi, kld_si] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. 
for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p ** 2.0) for p in self.joint_params]) return param_reg_cost def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # construct values to output nll = self.nll_costs.flatten() kld = self.kld_q.flatten() # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function( inputs=[xo], outputs=[nll, kld], givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}, updates=self.scan_updates, on_unused_input="ignore", ) # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XO, sample_count=20, use_guide_policy=True): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XO.shape[0],)) kld_sum = np.zeros((XO.shape[0],)) for i in range(sample_count): result = fe_term_sample(XO) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) if not use_guide_policy: # no KLd if samples are from the primary policy... mean_kld = 0.0 * mean_kld return [mean_nll, mean_kld] return fe_term_estimator def _construct_raw_costs(self): """ Construct all the raw, i.e. not weighted by any lambdas, costs. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # compile theano function for computing the costs all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g] cost_func = theano.function( inputs=[xo], outputs=all_step_costs, givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}, updates=self.scan_updates, on_unused_input="ignore", ) # make a function for computing batch-based estimates of costs. # _step_nlls: the expected NLL cost for each step # _step_klds: the expected KL(q||p) cost for each step # _kld_q2p: the expected KL(q||p) cost for each latent dim # _kld_p2q: the expected KL(p||q) cost for each latent dim # _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim def raw_cost_computer(XO): _all_costs = cost_func(to_fX(XO)) _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0) _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0) _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0) _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1) _step_klds = to_fX(np.asarray([k for k in _step_klds])) _step_nlls = np.mean(_all_costs[0], axis=1) _step_nlls = to_fX(np.asarray([k for k in _step_nlls])) results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g] return results return raw_cost_computer def _construct_train_joint(self): """ Construct theano function to train all networks jointly. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_bound, self.nll_cost, self.kld_cost, self.reg_cost, self.obs_costs] # compile the theano function func = theano.function( inputs=[xo], outputs=outputs, givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}, updates=self.joint_updates, on_unused_input="ignore", ) return func def _construct_sequence_sampler(self): """ Construct theano function to train all networks jointly. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function states = [self._from_si_to_x(self.s0_full)] + [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)] masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)] outputs = states + masks # compile the theano function func = theano.function( inputs=[xo], outputs=outputs, givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}, updates=self.joint_updates, on_unused_input="ignore", ) # visualize trajectories generated by the model def sample_func(XO, use_guide_policy=False): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # get belief states and masks generated by the scan loop scan_vals = func(to_fX(XO)) step_count = self.total_steps + 1 seq_shape = (step_count, XO.shape[0], XO.shape[1]) xm_seq = np.zeros(seq_shape).astype(theano.config.floatX) xi_seq = np.zeros(seq_shape).astype(theano.config.floatX) mi_seq = 
np.zeros(seq_shape).astype(theano.config.floatX) for i in range(step_count): _xi = scan_vals[i] _mi = scan_vals[i + step_count] _xm = (_mi * XO) + ((1.0 - _mi) * _xi) xm_seq[i, :, :] = _xm xi_seq[i, :, :] = _xi mi_seq[i, :, :] = _mi # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return [xm_seq, xi_seq, mi_seq] return sample_func def save_to_file(self, f_name=None): """ Dump important stuff to a Python pickle, so that we can reload this model later. """ assert not (f_name is None) f_handle = file(f_name, "wb") # dump the dict self.params, which just holds "simple" python values cPickle.dump(self.params, f_handle, protocol=-1) # make a copy of self.shared_param_dicts, with numpy arrays in place # of the theano shared variables numpy_param_dicts = {} for key in self.shared_param_dicts: numpy_ary = self.shared_param_dicts[key].get_value(borrow=False) numpy_param_dicts[key] = numpy_ary # dump the numpy version of self.shared_param_dicts to pickle file cPickle.dump(numpy_param_dicts, f_handle, protocol=-1) # get numpy dicts for each of the "child" models that we must save child_model_dicts = {} child_model_dicts["p_zi_given_xi"] = self.p_zi_given_xi.save_to_dict() child_model_dicts["p_sip1_given_zi"] = self.p_sip1_given_zi.save_to_dict() child_model_dicts["p_x_given_si"] = self.p_x_given_si.save_to_dict() child_model_dicts["q_zi_given_xi"] = self.q_zi_given_xi.save_to_dict() # dump the numpy child model dicts to the pickle file cPickle.dump(child_model_dicts, f_handle, protocol=-1) f_handle.close() return