def __init__(self, rng=None, x_out=None, \
             p_z_given_x=None, \
             p_x_given_z=None, \
             params=None, \
             shared_param_dicts=None):
    """Build the symbolic training graph for a WalkoutModel.

    Wires a forwards "walk-out" chain that alternates sampling
    z ~ p(z|x) and x ~ p(x|z) via theano.scan, then assembles the
    KLD/NLL/regularization costs and ADAM updates over that chain.

    Parameters
    ----------
    rng : numpy RandomState-like; seeds this model's Theano RNG stream.
    x_out : symbolic matrix; target output for generation.
    p_z_given_x : conditional network mapping x -> z (must offer .apply).
    p_x_given_z : conditional network mapping z -> x (must offer .apply).
    params : dict with keys 'x_dim', 'z_dim', 'walkout_steps', 'x_type',
        and optionally 'x_transform'.
    shared_param_dicts : optional dict for sharing obs_logvar between
        clones of this model; created here when None.
    """
    # setup a rng for this WalkoutModel
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.walkout_steps = self.params['walkout_steps']
    self.x_type = self.params['x_type']
    self.shared_param_dicts = shared_param_dicts
    # choose the output transform for generated x (default: sigmoid);
    # bernoulli observations force the sigmoid transform regardless
    if 'x_transform' in self.params:
        assert((self.params['x_transform'] == 'sigmoid') or \
               (self.params['x_transform'] == 'none'))
        if self.params['x_transform'] == 'sigmoid':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.x_transform = lambda x: x
    else:
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    if self.x_type == 'bernoulli':
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # NOTE(review): self.step_type is never assigned in this constructor,
    # so this assert will raise AttributeError unless a subclass or caller
    # sets it first -- looks copied from a sibling model; confirm intent.
    assert ((self.step_type == 'add') or (self.step_type == 'jump'))
    # grab handles to the relevant networks
    self.p_z_given_x = p_z_given_x
    self.p_x_given_z = p_x_given_z
    # record the symbolic variables that will provide inputs to the
    # computation graph created for this WalkoutModel
    self.x_out = x_out           # target output for generation
    self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble
    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        zero_ary = to_fX(np.zeros((1, )))
        self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
        # squash the observation log-variance into (-8, 8) via tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

    ###############################################################
    # Setup the forwards (i.e. training) walk-out loop using scan #
    ###############################################################
    def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
        # one walk-out step: sample z from x, then x from z, scoring the
        # reverse-direction log-likelihood of each move along the way
        # get samples of next zi, according to the forwards model
        zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        # reparameterized sample: mean + std * ZMUV noise
        zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
        # check reverse direction probability p(xi_fw | zi_fw)
        xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_bw_mean = self.x_transform(xi_bw_mean)
        nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                log_vars=xi_bw_logvar, mask=None)
        nll_xi_bw = nll_xi_bw.flatten()
        # get samples of next xi, according to the forwards model
        xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_fw_mean = self.x_transform(xi_fw_mean)
        xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
        # check reverse direction probability p(zi_fw | xi_fw)
        zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                log_vars=zi_bw_logvar, mask=None)
        nll_zi_bw = nll_zi_bw.flatten()
        # each loop iteration produces the following values:
        #   xi_fw: xi generated fom zi by forwards walk
        #   zi_fw: zi generated fom xi by forwards walk
        #   xi_fw_mean: ----
        #   xi_fw_logvar: ----
        #   zi_fw_mean: ----
        #   zi_fw_logvar: ----
        #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
        #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
        return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

    # initialize states for x/z
    self.x0 = self.x_out
    self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
    # setup initial values to pass to scan op
    outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
    # NOTE(review): self.xi_zmuv is never created in this constructor
    # (only self.zi_zmuv is); this will raise AttributeError when scan is
    # built -- presumably a T.tensor3() declaration is missing above.
    sequences_init = [self.xi_zmuv, self.zi_zmuv]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
            outputs_info=outputs_init, \
            sequences=sequences_init)

    # grab results of the scan op. all values are computed for each step
    self.xi = self.scan_results[0]
    self.zi = self.scan_results[1]
    self.xi_fw_mean = self.scan_results[2]
    self.xi_fw_logvar = self.scan_results[3]
    self.zi_fw_mean = self.scan_results[4]
    self.zi_fw_logvar = self.scan_results[5]
    self.nll_xi_bw = self.scan_results[6]
    self.nll_zi_bw = self.scan_results[7]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='srr_lr')
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
    self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    # NOTE(review): self.s0 and self.step_scales are never defined here,
    # and the four networks referenced below (p_zi_given_xi,
    # p_sip1_given_zi, p_x_given_si, q_zi_given_xi) are not the ones this
    # constructor receives (p_z_given_x / p_x_given_z). This whole
    # parameter list appears copied from a sibling model -- verify.
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.p_x_given_si.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                     (self.lam_kld_q[0] * self.kld_q) + \
                     (self.lam_kld_g[0] * self.kld_g) + \
                     (self.lam_kld_s[0] * self.kld_s)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NOTE(review): self.nlli is never assigned in this constructor --
    # possibly self.nll_xi_bw / self.nll_zi_bw were intended; confirm.
    self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    # variational bound = reconstruction NLL + posterior KLd
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # scan's own state updates must ride along with the training updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None, \
             Xd=None, Xc=None, Xm=None, \
             p_x_given_z=None, q_z_given_x=None, \
             x_dim=None, z_dim=None, \
             params=None):
    """Construct the training graph for a OneStageModel (a basic VAE).

    Clones the inferencer q(z|x) and generator p(x|z) onto the masked
    input, builds NLL + KLd + L2 costs, ADAM-style updates, and compiles
    the training/diagnostic Theano functions.

    Parameters
    ----------
    rng : numpy RandomState-like; seeds this model's Theano RNG stream.
    Xd, Xc, Xm : symbolic inputs -- data, control values, and mask;
        combined via apply_mask() to form the observed input.
    p_x_given_z : generator network, cloned with shared parameters.
    q_z_given_x : inferencer network, cloned with shared parameters.
    x_dim, z_dim : dimensionality of observation and latent spaces.
    params : optional dict; recognizes 'xt_transform', 'logvar_bound',
        and requires 'x_type' ('bernoulli' or 'gaussian').
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    if params is None:
        self.params = {}
    else:
        self.params = params
    # choose the transform applied to the generator's deterministic
    # output (default: sigmoid)
    if 'xt_transform' in self.params:
        assert((self.params['xt_transform'] == 'sigmoid') or \
               (self.params['xt_transform'] == 'none'))
        if self.params['xt_transform'] == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
    else:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    if 'logvar_bound' in self.params:
        self.logvar_bound = self.params['logvar_bound']
    else:
        self.logvar_bound = 10
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    # the observations
    #
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

    # record the dimensions of various spaces relevant to this model
    self.z_dim = z_dim
    self.x_dim = x_dim

    # set parameters for the isotropic Gaussian prior over z
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this OneStageModel
    self.Xd = Xd
    self.Xc = Xc
    self.Xm = Xm
    self.batch_reps = T.lscalar()
    # observed input = data with control values spliced in where masked
    self.x = apply_mask(self.Xd, self.Xc, self.Xm)

    #####################################################################
    # Setup the computation graph that provides values in our objective #
    #####################################################################
    # inferencer model for latent prototypes given instances
    self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
    self.z = self.q_z_given_x.output
    self.z_mean = self.q_z_given_x.output_mean
    self.z_logvar = self.q_z_given_x.output_logvar
    # generator model for prototypes given latent prototypes
    self.p_x_given_z = p_x_given_z.shared_param_clone(rng=rng, Xd=self.z)
    self.xt = self.p_x_given_z.output_mean  # use deterministic output
    # construct the final output of generator, conditioned on z
    if self.x_type == 'bernoulli':
        self.xg = T.nnet.sigmoid(self.xt)
    else:
        self.xg = self.xt_transform(self.xt)

    # self.output_logvar modifies the output distribution
    self.output_logvar = self.p_x_given_z.sigma_layers[-1].b
    # tanh-squash the learned output log-variance into
    # (-logvar_bound, logvar_bound) for numerical stability
    self.bounded_logvar = self.logvar_bound * \
            T.tanh(self.output_logvar[0] / self.logvar_bound)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = np.zeros((1, )).astype(theano.config.floatX)
    self.lr_1 = theano.shared(value=zero_ary, name='osm_lr_1')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='osm_it_count')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_1 = theano.shared(value=zero_ary, name='osm_lam_kld_1')
    self.lam_kld_2 = theano.shared(value=zero_ary, name='osm_lam_kld_2')
    self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
    self.set_lam_l2w(1e-4)

    # Grab all of the "optimizable" parameters in "group 1"
    self.group_1_params = []
    self.group_1_params.extend(self.q_z_given_x.mlp_params)
    self.group_1_params.extend(self.p_x_given_z.mlp_params)
    # Make a joint list of parameters
    self.joint_params = self.group_1_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    self.kld_costs_1, self.kld_costs_2 = self._construct_kld_costs()
    self.kld_costs = (self.lam_kld_1[0] * self.kld_costs_1) + \
                     (self.lam_kld_2[0] * self.kld_costs_2)
    self.kld_cost = T.mean(self.kld_costs)
    # act_reg_cost is computed but intentionally not added to the joint
    # cost here; only the parameter L2 penalty is applied
    act_reg_cost, param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing OneStageModel cost gradients...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_param_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr_1, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

    # Construct a function for jointly training the generator/inferencer
    print("Compiling OneStageModel theano functions...")
    self.train_joint = self._construct_train_joint()
    self.compute_fe_terms = self._construct_compute_fe_terms()
    self.compute_post_klds = self._construct_compute_post_klds()
    self.sample_from_prior = self._construct_sample_from_prior()
    # deterministic encode/decode helpers (mean outputs only)
    self.transform_x_to_z = theano.function([self.q_z_given_x.Xd], \
            outputs=self.q_z_given_x.output_mean)
    self.transform_z_to_x = theano.function([self.p_x_given_z.Xd], \
            outputs=self.xt_transform(self.p_x_given_z.output_mean))
    # handy handles on first-layer inference / last-layer generator weights
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_weights = self.p_x_given_z.mu_layers[-1].W
    return
def __init__(self, rng=None, \
             Xd=None, \
             params=None, \
             shared_param_dicts=None):
    """Construct a pseudo-ensemble network: one proto-network plus one or
    two noise-perturbed "spawn" networks that share its parameters.

    Parameters
    ----------
    rng : numpy RandomState-like; seeds this layer's Theano RNG stream.
    Xd : symbolic input to this computation graph.
    params : required dict with 'proto_configs' (exactly one config),
        'spawn_configs' (one or two configs), 'lam_l2a', and optional
        'vis_drop', 'hid_drop', 'activation', 'init_scale'.
    shared_param_dicts : when given, this instance is a clone and reuses
        the per-layer parameter dicts instead of creating new parameters.

    Bug fix vs. previous revision: the layer-pair sequence was built with
    a bare zip(), whose result has no len() under Python 3, so computing
    last_layer raised TypeError on the first loop iteration. The pairs
    are now materialized with list(zip(...)); the unused last_layer
    locals were removed.
    """
    # First, setup a shared random number generator for this layer
    self.rng = RandStream(rng.randint(100000))
    ################################################
    # Process user-suplied parameters for this net #
    ################################################
    assert (not (params is None))
    assert (len(params['proto_configs']) == 1)  # permit only one proto-net
    assert (len(params['spawn_configs']) <= 2)  # use one or two spawn nets
    assert (len(params['spawn_configs']) > 0)
    self.Xd = Xd  # symbolic input to this computation graph
    self.params = params
    lam_l2a = params['lam_l2a']
    if 'vis_drop' in params:
        self.vis_drop = params['vis_drop']
    else:
        self.vis_drop = 0.2
    if 'hid_drop' in params:
        self.hid_drop = params['hid_drop']
    else:
        self.hid_drop = 0.5
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun
    if 'init_scale' in params:
        self.init_scale = params['init_scale']
    else:
        self.init_scale = 1.0
    self.proto_configs = params['proto_configs']
    self.spawn_configs = params['spawn_configs']
    # Compute some "structural" properties of this ensemble
    self.max_proto_depth = max([(len(pc) - 1) for pc in self.proto_configs])
    self.spawn_count = len(self.spawn_configs)
    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of a generative network, with all
    # of the network parameters shared between clones.
    if shared_param_dicts is None:
        # This is not a clone, and we will need to make a dict for
        # referring to the parameters of each network layer
        self.shared_param_dicts = []
        self.is_clone = False
    else:
        # This is a clone, and its layer parameters can be found by
        # referring to the given param dict (i.e. shared_param_dicts).
        self.shared_param_dicts = shared_param_dicts
        self.is_clone = True
    ########################################
    # Initialize all of the proto-networks #
    ########################################
    self.proto_nets = []
    # Construct the proto-networks from which to generate spawn-sembles
    for (pn_num, proto_config) in enumerate(self.proto_configs):
        layer_defs = [ld for ld in proto_config]
        # materialize the (in, out) pairs: zip() alone is a one-shot
        # iterator without len() on Python 3
        layer_connect_defs = list(zip(layer_defs[:-1], layer_defs[1:]))
        layer_num = 0
        proto_net = []
        next_input = self.Xd
        for in_def, out_def in layer_connect_defs:
            pnl_name = "pn{0:d}l{1:d}".format(pn_num, layer_num)
            if (type(in_def) is list) or (type(in_def) is tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if (type(out_def) is list) or (type(out_def) is tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # fan-in scaled init, further scaled by the user knob
            i_scale = (1.0 / np.sqrt(in_dim)) * self.init_scale
            # Add a new layer to the regular model
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=None, pool_size=pool_size, \
                        drop_rate=0., input_noise=0., bias_noise=0., \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=pnl_name, W_scale=i_scale)
                proto_net.append(new_layer)
                # record this layer's params so clones can share them
                self.shared_param_dicts.append( \
                        {'W': new_layer.W, 'b': new_layer.b, \
                         'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts[layer_num]
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=None, pool_size=pool_size, \
                        drop_rate=0., input_noise=0., bias_noise=0., \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        b_in=init_params['b_in'], s_in=init_params['s_in'], \
                        name=pnl_name, W_scale=i_scale)
                proto_net.append(new_layer)
            next_input = proto_net[-1].output
            layer_num = layer_num + 1
        # Add this network to the list of proto-networks, and add its
        # param dict to the list of pro-net param dicts, if not a clone
        self.proto_nets.append(proto_net)
    #################################################################
    # Initialize all of the spawned (i.e. noise-perturbed) networks #
    #################################################################
    self.spawn_nets = []
    self.proto_keys = []
    for spawn_config in self.spawn_configs:
        proto_key = spawn_config['proto_key']
        self.proto_keys.append(proto_key)
        print("spawned from proto-net: {0:d} (of {1:d})".format(proto_key, \
                len(self.proto_nets)))
        input_noise = spawn_config['input_noise']
        bias_noise = spawn_config['bias_noise']
        do_dropout = spawn_config['do_dropout']
        assert ((proto_key >= 0) and (proto_key < len(self.proto_nets)))
        # Get info about the proto-network to spawn from
        layer_num = 0
        spawn_net = []
        next_input = self.Xd
        proto_net = self.proto_nets[proto_key]
        for proto_layer in proto_net:
            # input noise and visible-unit dropout only hit layer 0;
            # deeper layers use the hidden-unit dropout rate
            layer_in = input_noise if (layer_num == 0) else 0.0
            d_prob = self.vis_drop if (layer_num == 0) else self.hid_drop
            drop_prob = d_prob if do_dropout else 0.0
            # Get important properties from the relevant proto-layer
            actfun = proto_layer.activation
            pool_size = proto_layer.pool_size
            in_dim = proto_layer.in_dim
            out_dim = proto_layer.out_dim
            # Add a new layer to the regular model, sharing the
            # proto-layer's weights but adding noise/dropout
            spawn_net.append(HiddenLayer(rng=rng, \
                    input=next_input, activation=actfun, \
                    pool_size=pool_size, drop_rate=drop_prob, \
                    input_noise=layer_in, bias_noise=bias_noise, \
                    W=proto_layer.W, b=proto_layer.b, \
                    b_in=proto_layer.b_in, s_in=proto_layer.s_in, \
                    in_dim=in_dim, out_dim=out_dim))
            next_input = spawn_net[-1].output
            layer_num = layer_num + 1
        # Add this network to the list of spawn-networks
        self.spawn_nets.append(spawn_net)

    # Mash all the parameters together, into a list. Also make a list
    # comprising only parameters located in final/classification layers
    # of the proto-networks (for use in fine-tuning, probably).
    self.proto_params = []
    self.class_params = []
    for pn in self.proto_nets:
        for (i, pl) in enumerate(pn):
            self.proto_params.extend(pl.params)
            if (i == (len(pn) - 1)):
                self.class_params.extend(pl.params)

    # Build loss functions for denoising autoencoder training. This sets up
    # a cost function for each possible layer, as determined by the maximum
    # number of layers in any proto-network. The DAE cost for layer i will
    # be the mean DAE cost over all i'th layers in the proto-networks.
    self.dae_lam_l1 = theano.shared( \
            value=np.asarray([0.2]).astype(theano.config.floatX))
    self._construct_dae_layers(rng, lam_l1=self.dae_lam_l1, nz_lvl=0.25)

    # create symbolic "hooks" for observing the output of this network,
    # either without perturbations or subject to perturbations
    self.output_proto = self.proto_nets[0][-1].linear_output
    self.output_spawn = [sn[-1].linear_output for sn in self.spawn_nets]

    # get a cost function for encouraging "pseudo-ensemble agreement"
    self.pea_reg_cost = self._ear_cost()
    # get a cost function for penalizing/rewarding prediction entropy
    self.ent_reg_cost = self._ent_cost()
    self.act_reg_cost = lam_l2a * self._act_reg_cost()
    # construct a function for sampling from a categorical
    self.sample_posterior = self._construct_sample_posterior()
    return
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
             i_net=None, g_net=None, d_net=None, chain_len=None, \
             data_dim=None, prior_dim=None, params=None):
    """Construct a VCGLoop: a self-looped VAE chain (via OneStageModel)
    trained adversarially against a discriminator over real vs. chain
    samples, with NLL/KLd chain costs on the VAE side.

    Parameters
    ----------
    rng : numpy RandomState-like; seeds this model's Theano RNG stream.
    Xd, Xc, Xm : symbolic chain-initialization inputs -- data, control
        values, and mask over state variables.
    Xt : symbolic samples from the target distribution.
    i_net, g_net, d_net : inferencer, generator, and discriminator
        networks; each is cloned with shared parameters.
    chain_len : integer number of times to unroll the VAE loop.
    data_dim, prior_dim : observation / latent dimensionalities.
    params : dict; recognizes 'cost_decay', 'chain_type', 'xt_transform',
        'logvar_bound', and requires 'x_type' and 'lam_l2d'.
    """
    # Do some stuff!
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_mean = 0.0
    self.prior_logvar = 0.0
    if params is None:
        self.params = {}
    else:
        self.params = params
    if 'cost_decay' in self.params:
        self.cost_decay = self.params['cost_decay']
    else:
        self.cost_decay = 0.1
    if 'chain_type' in self.params:
        assert((self.params['chain_type'] == 'walkback') or \
               (self.params['chain_type'] == 'walkout'))
        self.chain_type = self.params['chain_type']
    else:
        self.chain_type = 'walkout'
    # choose the transform applied to generator mean outputs along the
    # chain (default: sigmoid)
    if 'xt_transform' in self.params:
        assert((self.params['xt_transform'] == 'sigmoid') or \
               (self.params['xt_transform'] == 'none'))
        if self.params['xt_transform'] == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
    else:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    if 'logvar_bound' in self.params:
        self.logvar_bound = self.params['logvar_bound']
    else:
        self.logvar_bound = 10
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    # the observations
    #
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

    # symbolic var for inputting samples for initializing the VAE chain
    self.Xd = Xd
    # symbolic var for masking subsets of the state variables
    self.Xm = Xm
    # symbolic var for controlling subsets of the state variables
    self.Xc = Xc
    # symbolic var for inputting samples from the target distribution
    self.Xt = Xt
    # integer number of times to cycle the VAE loop
    self.chain_len = chain_len

    # symbolic matrix of indices for data inputs
    self.It = T.arange(self.Xt.shape[0])
    # symbolic matrix of indices for noise/generated inputs
    self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # get a clone of the desired VAE, for easy access
    self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
            p_x_given_z=g_net, q_z_given_x=i_net, x_dim=self.data_dim, \
            z_dim=self.prior_dim, params=self.params)
    self.IN = self.OSM.q_z_given_x
    self.GN = self.OSM.p_x_given_z
    self.transform_x_to_z = self.OSM.transform_x_to_z
    self.transform_z_to_x = self.OSM.transform_z_to_x
    self.bounded_logvar = self.OSM.bounded_logvar

    # self-loop some clones of the main VAE into a chain.
    # ** All VAEs in the chain share the same Xc and Xm, which are the
    #    symbolic inputs for providing the observed portion of the input
    #    and a mask indicating which part of the input is "observed".
    #    These inputs are used for training "reconstruction" policies.
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    _Xd = self.Xd
    print("Unrolling chain...")
    for i in range(self.chain_len):
        # create a VAE infer/generate pair with _Xd as input and with
        # masking variables shared by all VAEs in this chain
        _IN = self.IN.shared_param_clone(rng=rng, \
                Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm), \
                build_funcs=False)
        _GN = self.GN.shared_param_clone(rng=rng, Xd=_IN.output, \
                build_funcs=False)
        # next chain input = transformed generator mean (deterministic)
        _Xd = self.xt_transform(_GN.output_mean)
        self.IN_chain.append(_IN)
        self.GN_chain.append(_GN)
        self.Xg_chain.append(_Xd)
        print(" step {}...".format(i))

    # make a clone of the desired discriminator network, which will try
    # to discriminate between samples from the training data and samples
    # generated by the self-looped VAE chain.
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    zero_ary = np.zeros((1, )).astype(theano.config.floatX)
    # init shared var for weighting nll of data given posterior sample
    self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # init shared var for weighting posterior KL-div from prior
    self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # shared var learning rates for all networks
    self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
    self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
    # shared var momentum parameters for all networks
    self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='vcg_it_count')
    # shared var weights for adversarial classification objective
    self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    self.set_disc_weights()  # init adversarial cost weights for GN/DN
    # set a shared var for regularizing the output of the discriminator
    # NOTE(review): this reads the raw `params` argument, not self.params,
    # so passing params=None crashes here despite the None handling above;
    # presumably self.params['lam_l2d'] was intended -- confirm.
    self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
            name='vcg_lam_l2d')

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because the VCGair requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this VCGair.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.in_params = [p for p in self.IN.mlp_params]
    self.in_params.append(self.OSM.output_logvar)
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on adversarial binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # first, build the cost to be optimized by the discriminator network,
    # in general this will be treated somewhat indepedently of the
    # optimization of the generator and inferencer networks.
    self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
            self.disc_reg_cost

    # construct costs relevant to the optimization of the generator and
    # discriminator networks
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.other_reg_cost
    # compute total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    # Get the gradient of the joint cost for all optimizable parameters
    self.joint_grads = OrderedDict()
    print("Computing VCGLoop DN cost gradients...")
    grad_list = T.grad(self.dn_cost, self.dn_params, disconnected_inputs='warn')
    for i, p in enumerate(self.dn_params):
        self.joint_grads[p] = grad_list[i]
    print("Computing VCGLoop IN cost gradients...")
    grad_list = T.grad(self.osm_cost, self.in_params, disconnected_inputs='warn')
    for i, p in enumerate(self.in_params):
        self.joint_grads[p] = grad_list[i]
    print("Computing VCGLoop GN cost gradients...")
    grad_list = T.grad(self.osm_cost, self.gn_params, disconnected_inputs='warn')
    for i, p in enumerate(self.gn_params):
        self.joint_grads[p] = grad_list[i]

    # construct the updates for the discriminator, generator and
    # inferencer networks. all networks share the same first/second
    # moment momentum and iteration count. the networks each have their
    # own learning rates, which lets you turn their learning on/off.
    self.dn_updates = get_param_updates(params=self.dn_params, \
            grads=self.joint_grads, alpha=self.lr_dn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.gn_updates = get_param_updates(params=self.gn_params, \
            grads=self.joint_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.in_updates = get_param_updates(params=self.in_params, \
            grads=self.joint_grads, alpha=self.lr_in, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]
    for k in self.in_updates:
        self.joint_updates[k] = self.in_updates[k]
    # construct an update for tracking the mean KL divergence of
    # approximate posteriors for this chain (exponential moving average,
    # decay 0.98, averaged over chain steps)
    new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
            sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
    self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

    # construct the function for training on training data
    print("Compiling VCGLoop theano functions....")
    self.train_joint = self._construct_train_joint()
    return
def __init__(self, rng=None, x_in=None, \
        p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None, \
        p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None, \
        obs_dim=None, z_rnn_dim=None, z_obs_dim=None, h_dim=None, \
        model_init_obs=True, model_init_rnn=True, ir_steps=2, \
        params=None):
    """
    Build the full symbolic computation graph, costs, and training
    function for this multi-stage model.

    Parameters:
        rng: numpy-style RNG used to seed this model's Theano RandStream
             (must support rng.randint(...)).
        x_in: symbolic input/observation variable for the graph.
        p_s0_obs_given_z_obs, p_hi_given_si, p_sip1_given_si_hi:
            generator networks, cloned per-step via shared_param_clone
            so parameters are shared across refinement steps.
        p_x_given_si_hi: unsupported for now (must be None, see assert).
        q_z_given_x, q_hi_given_x_si: inference networks.
        obs_dim, z_rnn_dim, z_obs_dim, h_dim: dimensions of the relevant
            spaces; z splits into a "rnn" part and an "obs" part.
        model_init_obs / model_init_rnn: when True, initialize the obs/rnn
            state from the generative model; otherwise from a learned
            constant (the relevant bias vector).
        ir_steps: number of iterative refinement steps to unroll.
        params: dict with at least 'x_type' ('bernoulli' or 'gaussian');
            optional 'obs_transform' ('sigmoid' or 'none').
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # TODO: implement functionality for working with "latent" si
    assert (p_x_given_si_hi is None)
    # decide whether to initialize from a model or from a "constant"
    self.model_init_obs = model_init_obs
    self.model_init_rnn = model_init_rnn
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # choose the observation transform; bernoulli observations force a
    # sigmoid regardless of the requested transform (see override below)
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or \
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    if self.x_type == 'bernoulli':
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    # record the dimensions of various spaces relevant to this model
    self.obs_dim = obs_dim
    self.rnn_dim = z_rnn_dim
    self.z_dim = z_rnn_dim + z_obs_dim
    self.z_rnn_dim = z_rnn_dim
    self.z_obs_dim = z_obs_dim
    self.h_dim = h_dim
    self.ir_steps = ir_steps
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x = x_in
    self.batch_reps = T.lscalar()
    # setup switching variable for changing between sampling/training
    zero_ary = np.zeros((1, )).astype(theano.config.floatX)
    self.train_switch = theano.shared(value=zero_ary,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)
    # setup a weight for pulling priors over hi given si towards a
    # shared global prior -- e.g. zero mean and unit variance.
    self.kzg_weight = theano.shared(value=zero_ary,
                                    name='msm_kzg_weight')
    self.set_kzg_weight(0.1)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.l1l2_weight = theano.shared(value=zero_ary,
                                     name='msm_l1l2_weight')
    self.set_l1l2_weight(1.0)
    #############################
    # Setup self.z and self.s0. #
    #############################
    print("Building MSM step 0...")
    obs_scale = 0.0
    rnn_scale = 0.0
    if self.model_init_obs: # initialize obs state from generative model
        obs_scale = 1.0
    if self.model_init_rnn: # initialize rnn state from generative model
        rnn_scale = 1.0
    self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
    self.z = self.q_z_given_x.output
    # split z into its "rnn" and "obs" parts
    self.z_rnn = self.z[:, :self.z_rnn_dim]
    self.z_obs = self.z[:, self.z_rnn_dim:]
    # initial "obs" state: blend of model output and learned constant,
    # selected by obs_scale (0.0 or 1.0 as set above)
    self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone( \
            rng=rng, Xd=self.z_obs)
    _s0_obs_model = self.p_s0_obs_given_z_obs.output_mean
    _s0_obs_const = self.p_s0_obs_given_z_obs.mu_layers[-1].b
    self.s0_obs = (obs_scale * _s0_obs_model) + \
            ((1.0 - obs_scale) * _s0_obs_const)
    # initial "rnn" state: analogous blend controlled by rnn_scale
    _s0_rnn_model = self.z_rnn
    _s0_rnn_const = self.q_z_given_x.mu_layers[-1].b[:self.z_rnn_dim]
    self.s0_rnn = (rnn_scale * _s0_rnn_model) + \
            ((1.0 - rnn_scale) * _s0_rnn_const)
    self.s0_jnt = T.horizontal_stack(self.s0_obs, self.s0_rnn)
    self.output_logvar = self.p_s0_obs_given_z_obs.sigma_layers[-1].b
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.output_logvar)
    ###############################################################
    # Setup the iterative refinement loop, starting from self.s0. #
    ###############################################################
    self.p_hi_given_si = []       # holds p_hi_given_si for each i
    self.p_sip1_given_si_hi = []  # holds p_sip1_given_si_hi for each i
    self.q_hi_given_x_si = []     # holds q_hi_given_x_si for each i
    self.si = [self.s0_jnt]       # holds si for each i
    self.hi = []                  # holds hi for each i
    for i in range(self.ir_steps):
        print("Building MSM step {0:d}...".format(i + 1))
        _si = self.si[i]
        si_obs = _si[:, :self.obs_dim]
        si_rnn = _si[:, self.obs_dim:]
        # get samples of next hi, conditioned on current si
        self.p_hi_given_si.append( \
                p_hi_given_si.shared_param_clone(rng=rng, \
                Xd=T.horizontal_stack( \
                self.obs_transform(si_obs), si_rnn)))
        hi_p = self.p_hi_given_si[i].output
        # now we build the model for variational hi given si;
        # grad_ll is the residual x - reconstruction, which the guide
        # network sees in addition to the current state
        grad_ll = self.x - self.obs_transform(si_obs)
        self.q_hi_given_x_si.append(\
                q_hi_given_x_si.shared_param_clone(rng=rng, \
                Xd=T.horizontal_stack( \
                grad_ll, self.obs_transform(si_obs), si_rnn)))
        hi_q = self.q_hi_given_x_si[i].output
        # make hi samples that can be switched between hi_p and hi_q
        # (train_switch = 1.0 selects the guide/variational samples)
        self.hi.append( ((self.train_switch[0] * hi_q) + \
                ((1.0 - self.train_switch[0]) * hi_p)) )
        # p_sip1_given_si_hi is conditioned on hi and the "rnn" part of si.
        self.p_sip1_given_si_hi.append( \
                p_sip1_given_si_hi.shared_param_clone(rng=rng, \
                Xd=T.horizontal_stack(self.hi[i], si_rnn)))
        # construct the update from si_obs/si_rnn to sip1_obs/sip1_rnn;
        # note: only the obs part is updated (additively), rnn is carried
        sip1_obs = si_obs + self.p_sip1_given_si_hi[i].output_mean
        sip1_rnn = si_rnn
        sip1_jnt = T.horizontal_stack(sip1_obs, sip1_rnn)
        # record the updated state of the generative process
        self.si.append(sip1_jnt)
    # check that input/output dimensions of our models agree
    self._check_model_shapes()
    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    # shared var learning rate for generator and inferencer
    zero_ary = np.zeros((1, )).astype(theano.config.floatX)
    self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_1 = theano.shared(value=zero_ary, name='msm_lam_kld_1')
    self.lam_kld_2 = theano.shared(value=zero_ary, name='msm_lam_kld_2')
    self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)
    # Grab all of the "optimizable" parameters in "group 1"
    # (the step-0 inference and generation networks)
    self.group_1_params = []
    self.group_1_params.extend(self.q_z_given_x.mlp_params)
    self.group_1_params.extend(self.p_s0_obs_given_z_obs.mlp_params)
    # Grab all of the "optimizable" parameters in "group 2"
    # (the per-step refinement networks; shared across steps, so the
    # same objects are extended ir_steps times -- presumably de-duplicated
    # downstream or harmless for gradient collection -- TODO confirm)
    self.group_2_params = []
    for i in range(self.ir_steps):
        self.group_2_params.extend(self.q_hi_given_x_si[i].mlp_params)
        self.group_2_params.extend(self.p_hi_given_si[i].mlp_params)
        self.group_2_params.extend(self.p_sip1_given_si_hi[i].mlp_params)
    # Make a joint list of parameters group 1/2
    self.joint_params = self.group_1_params + self.group_2_params
    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \
            self._construct_kld_costs()
    self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \
            (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \
            (self.kzg_weight[0] * T.mean(self.kld_hi_glob))))
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs()
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    # Get the gradient of the joint cost for all optimizable parameters
    self.joint_grads = OrderedDict()
    for p in self.joint_params:
        self.joint_grads[p] = T.grad(self.joint_cost, p)
    # Construct the updates for the generator and inferencer networks;
    # the two groups use separate learning rates so either can be frozen
    self.group_1_updates = get_param_updates(params=self.group_1_params, \
            grads=self.joint_grads, alpha=self.lr_1, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.group_2_updates = get_param_updates(params=self.group_2_params, \
            grads=self.joint_grads, alpha=self.lr_2, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for k in self.group_1_updates:
        self.joint_updates[k] = self.group_1_updates[k]
    for k in self.group_2_updates:
        self.joint_updates[k] = self.group_2_updates[k]
    # Construct a function for jointly training the generator/inferencer
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    self.compute_post_klds = self._construct_compute_post_klds()
    self.compute_fe_terms = self._construct_compute_fe_terms()
    self.sample_from_prior = self._construct_sample_from_prior()
    # make easy access points for some interesting parameters
    self.inf_1_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W
    self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W
    self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W
    self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W
    return
def __init__(self, rng=None, x_in=None, \
        p_x_given_z=None, q_z_given_x=None, \
        x_dim=None, z_dim=None, \
        params=None):
    """
    Construct the symbolic graph, costs, updates, and compiled functions
    for this one-stage model (a basic VAE-style encoder/decoder pair).

    Parameters:
        rng: numpy-style RNG used to seed the model's Theano RandStream.
        x_in: symbolic variable providing observations to the graph.
        p_x_given_z: generator network mapping latents to observations.
        q_z_given_x: inference network mapping observations to latents.
        x_dim, z_dim: dimensions of the observation and latent spaces.
        params: optional dict; recognized keys are 'x_type' (required:
            'bernoulli' or 'gaussian'), 'xt_transform' ('sigmoid' or
            'none'), and 'logvar_bound' (scalar bound on output logvar).
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    if params is None:
        self.params = {}
    else:
        self.params = params
    # choose the transform applied to the generator's raw output
    if 'xt_transform' in self.params:
        assert((self.params['xt_transform'] == 'sigmoid') or \
                (self.params['xt_transform'] == 'none'))
        if self.params['xt_transform'] == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
    else:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    # bound applied to the learned output log-variance (via tanh below)
    if 'logvar_bound' in self.params:
        self.logvar_bound = self.params['logvar_bound']
    else:
        self.logvar_bound = 10.0
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    # the observations
    #
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # record the dimensions of various spaces relevant to this model
    self.x_dim = x_dim
    self.z_dim = z_dim
    # set parameters for the isotropic Gaussian prior over z
    self.prior_mean = 0.0
    self.prior_logvar = 0.0
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this OneStageModel
    self.x_in = x_in
    #####################################################################
    # Setup the computation graph that provides values in our objective #
    #####################################################################
    # inferencer model for latent variables given observations
    self.q_z_given_x = q_z_given_x
    self.z_mean, self.z_logvar = self.q_z_given_x.apply(self.x_in)
    # reparametrize ZMUV Gaussian samples to get latent samples...
    self.z = reparametrize(self.z_mean, self.z_logvar, rng=self.rng)
    # generator model for observations given latent variables
    self.p_x_given_z = p_x_given_z
    self.xt, _ = self.p_x_given_z.apply(self.z)
    # construct the final output of generator, conditioned on z;
    # bernoulli observations always get a sigmoid, overriding xt_transform
    if self.x_type == 'bernoulli':
        self.xg = T.nnet.sigmoid(self.xt)
    else:
        self.xg = self.xt_transform(self.xt)
    # self.output_logvar modifies the output distribution
    zero_ary = to_fX(np.zeros((1, )))
    self.output_logvar = theano.shared(value=zero_ary,
                                       name='osm_output_logvar')
    # squash the raw logvar into (-logvar_bound, +logvar_bound)
    self.bounded_logvar = self.logvar_bound * \
            T.tanh(self.output_logvar[0] / self.logvar_bound)
    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='osm_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting controlling KL(q(z|x) || p(z))
    self.lam_kld = theano.shared(value=zero_ary, name='osm_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
    self.set_lam_l2w(1e-4)
    # grab a list of all the parameters to optimize
    self.joint_params = [self.output_logvar]
    self.joint_params.extend(self.q_z_given_x.mlp_params)
    self.joint_params.extend(self.p_x_given_z.mlp_params)
    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    # first, do NLL
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    # second, do KLd
    self.kld_costs = self.lam_kld[0] * self._construct_kld_costs()
    self.kld_cost = T.mean(self.kld_costs)
    # third, do regularization
    self.reg_cost = self.lam_l2w[0] * self._construct_reg_costs()
    # finally, combine them for the joint cost.
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]
    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # Construct a function for jointly training the generator/inferencer
    print("Compiling self.train_joint...")
    self.train_joint = self._construct_train_joint()
    print("Compiling self.compute_fe_terms...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling self.compute_post_klds...")
    self.compute_post_klds = self._construct_compute_post_klds()
    print("Compiling self.sample_from_prior...")
    self.sample_from_prior = self._construct_sample_from_prior()
    # direct encode/decode helpers (deterministic: uses z_mean, not samples)
    self.transform_x_to_z = theano.function(inputs=[self.x_in], \
            outputs=self.z_mean)
    self.transform_z_to_x = theano.function(inputs=[self.z], \
            outputs=self.xg)
    # make easy access points for some interesting parameters
    self.inf_weights = self.q_z_given_x.shared_layers[0].W
    self.gen_weights = self.p_x_given_z.output_layers[-1].W
    return
def __init__(self, rng=None, Xd=None, params=None, shared_param_dicts=None):
    """
    Build this network's layers from a configuration dict.

    Parameters:
        rng: numpy-style RNG (must support rng.randint(...)); seeds this
            network's Theano RandStream and initializes layer weights.
        Xd: symbolic input matrix for the network.
        params: dict with 'shared_config' and 'output_config' layer
            descriptions; optional 'build_theano_funcs' (default True)
            and 'init_scale' (default 1.0).
        shared_param_dicts: when given, this network is a "clone" whose
            layers reuse the parameters recorded in this dict; when None,
            fresh parameters are created and recorded for future clones.
    """
    # shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # symbolic input matrix
    self.Xd = Xd
    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    self.build_theano_funcs = params.get('build_theano_funcs', True)
    self.init_scale = params.get('init_scale', 1.0)
    # A provided shared_param_dicts marks this net as a clone that will
    # look up its layer parameters rather than create them.
    self.is_clone = shared_param_dicts is not None
    if self.is_clone:
        self.shared_param_dicts = shared_param_dicts
    else:
        self.shared_param_dicts = {'shared': [], 'output': []}
    # Get the configuration/prototype for this network.
    self.shared_config = params['shared_config']
    self.output_config = params['output_config']

    def _build_layer_group(configs, group_key, name_fmt):
        # Build one group of layers ('shared' trunk or 'output' heads).
        # Clones pull W/b/b_in/s_in from the recorded dicts; fresh nets
        # create new parameters and record them for future clones.
        layers = []
        for idx, desc in enumerate(configs):
            layer_name = name_fmt.format(idx)
            if self.is_clone:
                src = self.shared_param_dicts[group_key][idx]
                layer = HiddenLayer(rng=rng, layer_description=desc,
                                    W=src['W'], b=src['b'],
                                    b_in=src['b_in'], s_in=src['s_in'],
                                    name=layer_name,
                                    W_scale=self.init_scale)
            else:
                layer = HiddenLayer(rng=rng, layer_description=desc,
                                    name=layer_name,
                                    W_scale=self.init_scale)
                self.shared_param_dicts[group_key].append(
                    layer.shared_param_dicts)
            layers.append(layer)
        return layers

    # Initialize the shared trunk, then the output layers that take
    # input from the trunk's output.
    self.shared_layers = _build_layer_group(
        self.shared_config, 'shared', "shared_layer_{0:d}")
    self.output_layers = _build_layer_group(
        self.output_config, 'output', "output_layer_{0:d}")
    # mash all the parameters together, into a list.
    self.mlp_params = [p
                       for layer in (self.shared_layers + self.output_layers)
                       for p in layer.params]
    return
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None, \
        p_zi_given_xi=None, \
        p_sip1_given_zi=None, \
        q_zi_given_xi=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build the iterative missing-data imputation model: a scan-based loop
    that repeatedly samples a latent zi (from either the global policy p
    or the data-guided policy q) and refines the current guess si.

    Parameters:
        rng: numpy-style RNG used to seed this model's Theano RandStream.
        x_in: symbolic input with the known (unmasked) values.
        x_mask: symbolic mask; 1 marks known entries, 0 marks entries to
            impute (inferred from its use in xi_masked/x0 below).
        x_out: symbolic target for reconstruction/NLL.
        p_zi_given_xi: global policy network (sees only the masked input).
        q_zi_given_xi: guide policy network (also sees the full target).
        p_sip1_given_zi: network producing the state update from zi.
        params: dict with 'x_dim', 'z_dim', 'imp_steps', 'step_type'
            ('jump'/'add'/'lstm'/'layer'), 'x_type' ('bernoulli' or
            'gaussian').
        shared_param_dicts: optional dict of shared vars to reuse; when
            None, this model creates and owns x_null/grad_null/s0/
            obs_logvar.
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.imp_steps = self.params['imp_steps']
    self.step_type = self.params['step_type']
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    self.shared_param_dicts = shared_param_dicts
    # grab handles to the relevant InfNets
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.q_zi_given_xi = q_zi_given_xi
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    # ZMUV Gaussian noise, one slice per imputation step (steps, batch, z)
    # -- presumably fed by the compiled training functions; TODO confirm
    self.zi_zmuv = T.tensor3()
    # setup switching variable for changing between sampling/training
    # NOTE(review): name prefix 'msm_' looks copy-pasted; other vars in
    # this model use 'gpsi_'.
    zero_ary = to_fX(np.zeros((1, )))
    self.train_switch = theano.shared(value=zero_ary,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)
    if self.shared_param_dicts is None:
        # initialize parameters "owned" by this model
        # NOTE(review): 'gpis_xn' looks like a typo for 'gpsi_xn'.
        s0_init = to_fX(np.zeros((self.x_dim, )))
        init_ary = to_fX(np.zeros((self.x_dim, )))
        self.x_null = theano.shared(value=init_ary, name='gpis_xn')
        self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
        self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
        self.obs_logvar = theano.shared(value=zero_ary,
                                        name='gpsi_obs_logvar')
        # squash the raw logvar into (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['x_null'] = self.x_null
        self.shared_param_dicts['grad_null'] = self.grad_null
        self.shared_param_dicts['s0'] = self.s0
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.x_null = self.shared_param_dicts['x_null']
        self.grad_null = self.shared_param_dicts['grad_null']
        self.s0 = self.shared_param_dicts['s0']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
    ##################################################
    # Setup the iterative imputation loop using scan #
    ##################################################
    self.ones_mask = T.ones_like(self.x_mask)

    def imp_step_func(zi_zmuv, si):
        # One imputation step: mix the known values into the current
        # guess, sample zi from p (global) or q (guide), then update si.
        si_as_x = self._si_as_x(si)
        xi_unmasked = self.x_out
        xi_masked = (self.x_mask * xi_unmasked) + \
                ((1.0 - self.x_mask) * si_as_x)
        grad_unmasked = self.x_out - si_as_x
        grad_masked = self.x_mask * grad_unmasked
        # get samples of next zi, according to the global policy
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
        # get samples of next zi, according to the guide policy
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
        # make zi samples that can be switched between zi_p and zi_q
        zi = ((self.train_switch[0] * zi_q) + \
                ((1.0 - self.train_switch[0]) * zi_p))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                zi_p_mean, zi_p_logvar)  # KL(q || p)
        kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                zi_q_mean, zi_q_logvar)  # KL(p || q)
        kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                0.0, 0.0)  # KL(p || global prior)
        # compute the next si, given the sampled zi
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if (self.step_type == 'jump'):
            # jump steps always completely overwrite the current guesses
            sip1 = si_step
        elif (self.step_type == 'add'):
            # add steps just update the guesses additively
            sip1 = si + si_step
        elif (self.step_type == 'lstm'):
            # LSTM-style updates with write and erase gates;
            # gates are scaled sigmoids in (0, 1.1), biased towards open
            write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
            erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        elif (self.step_type == 'layer'):
            # convex combination of the old guess and the new step
            alpha_gate = T.nnet.sigmoid(hydra_out[1])
            sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
        else:
            assert False, "Unknown step type!"
        # compute NLL for the current imputation
        nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
        return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

    # apply scan op for the sequential imputation loop;
    # s0_full broadcasts the learned initial state over the batch
    self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
    init_vals = [self.s0_full, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(imp_step_func, \
            outputs_info=init_vals, sequences=self.zi_zmuv)
    # unpack the per-step outputs of the scan loop
    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]
    self.kldi_p2g = self.scan_results[4]
    # get the initial imputation state
    self.x0 = (self.x_mask * self.x_in) + \
            ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))
    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
    self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
    # init shared var for controlling l2 regularization on params
    # NOTE(review): name prefix 'msm_' is inconsistent with the other
    # 'gpsi_' vars in this model.
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)
    # Grab all of the "optimizable" parameters in "group 1"
    self.joint_params = [self.s0, self.obs_logvar]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)
    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
            (self.lam_kld_q[0] * self.kld_q) + \
            (self.lam_kld_g[0] * self.kld_g)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NLL is measured only on the final imputation step
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    # variational bound: final NLL plus guide-policy KL
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs
    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]
    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # scan's internal updates (e.g. RNG state) must ride along with training
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v
    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
def __init__(self, rng=None, x_in=None, x_out=None,
             p_h_given_z=None,
             p_x_given_h=None,
             q_z_given_x=None,
             q_h_given_z_x=None,
             x_dim=None, z_dim=None, h_dim=None, h_det_dim=None,
             params=None, shared_param_dicts=None):
    """
    Build the two-stage model: observations x are generated from an
    intermediate latent h, which is itself generated from a top-level
    latent z with a learned Gaussian prior.

    Parameters:
        rng: numpy-style RNG used to seed this model's Theano RandStream.
        x_in: symbolic input observations (fed to the inferencers).
        x_out: symbolic target observations (for the guide net and NLL).
        p_h_given_z, p_x_given_h: generator networks (HydraNets).
        q_z_given_x, q_h_given_z_x: inference networks (HydraNets).
        x_dim, z_dim, h_dim: dimensions of the relevant spaces.
        h_det_dim: when not None, the first h_det_dim entries of h are
            taken deterministically from h's prior mean.
        params: dict with 'x_type' ('bernoulli' or 'gaussian') and
            optional 'obs_transform' ('sigmoid' or 'none').
        shared_param_dicts: optional dict of shared vars to reuse
            (p_z_mean/p_z_logvar/obs_logvar); when None, fresh vars are
            created and recorded.
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # choose the observation transform; bernoulli observations force a
    # sigmoid regardless of the requested transform (see override below)
    if 'obs_transform' in self.params:
        assert((self.params['obs_transform'] == 'sigmoid') or \
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    if self.x_type == 'bernoulli':
        self.obs_transform = lambda x: T.nnet.sigmoid(x)
    self.shared_param_dicts = shared_param_dicts
    # record the dimensions of various spaces relevant to this model
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.h_dim = h_dim
    self.h_det_dim = h_det_dim
    # grab handles to the relevant HydraNets
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    # setup switching variable for changing between sampling/training
    zero_ary = to_fX( np.zeros((1,)) )
    self.train_switch = theano.shared(value=zero_ary,
                                      name='tsm_train_switch')
    self.set_train_switch(1.0)
    if self.shared_param_dicts is None:
        # initialize "optimizable" parameters specific to this MSM
        init_vec = to_fX( np.zeros((1,self.z_dim)) )
        self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec,
                                        name='tsm_p_z_logvar')
        self.obs_logvar = theano.shared(value=zero_ary,
                                        name='tsm_obs_logvar')
        # squash the raw logvar into (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
        self.shared_param_dicts = {}
        self.shared_param_dicts['p_z_mean'] = self.p_z_mean
        self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # reuse previously created shared parameters (clone behavior)
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
    ##############################################
    # Setup the TwoStageModels main computation. #
    ##############################################
    print("Building TSM...")
    # samples of "hidden" latent state (from both p and q)
    z_q_mean, z_q_logvar = self.q_z_given_x.apply(self.x_in)
    z_q = reparametrize(z_q_mean, z_q_logvar, rng=self.rng)
    # broadcast the learned prior over the batch
    z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
    z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
    z_p = reparametrize(z_p_mean, z_p_logvar, rng=self.rng)
    # train_switch = 1.0 selects posterior samples, 0.0 prior samples
    self.z = (self.train_switch[0] * z_q) + \
             ((1.0 - self.train_switch[0]) * z_p)
    # compute relevant KLds for this step
    self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                  z_p_mean, z_p_logvar)
    self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                  z_q_mean, z_q_logvar)
    # samples of "hidden" latent state (from both p and q)
    h_p_mean, h_p_logvar = self.p_h_given_z.apply(self.z)
    h_p = reparametrize(h_p_mean, h_p_logvar, rng=self.rng)
    # the guide for h sees the generative mean of h plus the target x
    h_q_mean, h_q_logvar = self.q_h_given_z_x.apply(
        T.concatenate([h_p_mean, self.x_out], axis=1))
    h_q = reparametrize(h_q_mean, h_q_logvar, rng=self.rng)
    # compute "stochastic" and "deterministic" parts of latent state
    h_sto = (self.train_switch[0] * h_q) + \
            ((1.0 - self.train_switch[0]) * h_p)
    h_det = h_p_mean
    if self.h_det_dim is None:
        # don't pass forward any deterministic state
        self.h = h_sto
    else:
        # pass forward some deterministic state
        self.h = T.concatenate([h_det[:,:self.h_det_dim],
                                h_sto[:,self.h_det_dim:]], axis=1)
    # compute relevant KLds for this step
    self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                  h_p_mean, h_p_logvar)
    self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                  h_q_mean, h_q_logvar)
    # p_x_given_h generates an observation x conditioned on the "hidden"
    # latent variables h.
    self.x_gen, _ = self.p_x_given_h.apply(self.h)
    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr = theano.shared(value=zero_ary, name='tsm_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_q2p = theano.shared(value=zero_ary,
                                     name='tsm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zero_ary,
                                     name='tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
    self.set_lam_l2w(1e-5)
    # get optimizable parameters belonging to the TwoStageModel
    self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar]
    # get optimizable parameters belonging to the underlying networks
    child_params = []
    child_params.extend(self.q_z_given_x.mlp_params)
    child_params.extend(self.q_h_given_z_x.mlp_params)
    child_params.extend(self.p_h_given_z.mlp_params)
    child_params.extend(self.p_x_given_h.mlp_params)
    # make a joint list of all optimizable parameters
    self.joint_params = self_params + child_params
    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                 (self.lam_kld_p2q[0] * self.kld_z_p2q)
    self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                 (self.lam_kld_p2q[0] * self.kld_h_p2q)
    self.kld_costs = T.sum(self.kld_z, axis=1) + \
                     T.sum(self.kld_h, axis=1)
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs
    # get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]
    # construct the updates for the generator and inferencer networks
    all_updates = get_adam_updates(params=self.joint_params,
                                   grads=self.joint_grads, alpha=self.lr,
                                   beta1=self.mom_1, beta2=self.mom_2,
                                   mom2_init=1e-3, smoothing=1e-4,
                                   max_grad_norm=5.0)
    self.joint_updates = OrderedDict()
    for k in all_updates:
        self.joint_updates[k] = all_updates[k]
    # Construct a function for jointly training the generator/inferencer
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    return
def __init__(self, rng, input, in_dim, out_dim,
             activation=None, pool_size=0,
             drop_rate=0., input_noise=0., bias_noise=0.,
             W=None, b=None, b_in=None, s_in=None, name="", W_scale=1.0):
    """
    Construct a fully-connected layer with optional maxout pooling,
    input dropout, and gaussian noise on the input and pre-activation.

    Parameters:
        rng: numpy RandomState-like source of seeds for this layer's RNG
        input: symbolic input to this layer (theano tensor)
        in_dim: dimension of the input
        out_dim: dimension of the layer output (post-pooling, if any)
        activation: elementwise activation (defaults to relu_actfun);
                    ignored when pool_size > 1, which forces maxout
        pool_size: maxout pool size; <= 1 disables pooling
        drop_rate: probability of dropping each input element
        input_noise: std dev of gaussian noise added to the input
        bias_noise: std dev of gaussian noise added to pre-activations
        W, b: optionally shared weights/biases (for cloned layers)
        b_in, s_in: optionally shared input bias/scale params
        name: prefix for naming this layer's shared variables
        W_scale: gain for the orthogonal weight initialization
    """
    # Setup a shared random generator for this layer
    self.rng = RandStream(rng.randint(1000000))
    # setup scale and bias params for the input
    if b_in is None:
        # input biases are always initialized to zero
        ary = np.zeros((in_dim,), dtype=theano.config.floatX)
        b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
    if s_in is None:
        # input scales are always initialized to one (pre-softplus value
        # 0.541325, i.e. softplus(0.541325) ~= 1.0)
        ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
        s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
    self.b_in = b_in
    self.s_in = s_in
    # an early shift/rescale of the input is currently disabled; the
    # input is used directly
    self.clean_input = input
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    self.input_noise = theano.shared(value=(zero_ary + input_noise),
                                     name="{0:s}_input_noise".format(name))
    self.bias_noise = theano.shared(value=(zero_ary + bias_noise),
                                    name="{0:s}_bias_noise".format(name))
    # BUG FIX: this shared variable was previously named
    # "{0:s}_bias_noise" (copy-paste error), colliding with the name of
    # the actual bias-noise parameter above.
    self.drop_rate = theano.shared(value=(zero_ary + drop_rate),
                                   name="{0:s}_drop_rate".format(name))
    # Add gaussian noise to the input (if desired)
    self.fuzzy_input = self.clean_input + (self.input_noise[0] *
            self.rng.normal(size=self.clean_input.shape, avg=0.0, std=1.0,
                            dtype=theano.config.floatX))
    # Apply masking noise to the input (if desired)
    self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                             self.drop_rate[0])
    # Set some basic layer properties
    self.pool_size = pool_size
    self.in_dim = in_dim
    self.out_dim = out_dim
    if self.pool_size <= 1:
        self.filt_count = self.out_dim
    else:
        # maxout needs pool_size filters per output unit
        self.filt_count = self.out_dim * self.pool_size
    # floor division: pool_count must be an int under both py2 and py3
    # (the original "/" relied on py2 integer-division semantics)
    self.pool_count = self.filt_count // max(self.pool_size, 1)
    if activation is None:
        activation = relu_actfun
    if self.pool_size <= 1:
        self.activation = activation
    else:
        # pooling enabled: override the given activation with maxout
        self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)
    # Get some random initial weights and biases, if not given
    if W is None:
        # Generate initial filters using the orthogonal random trick
        W_init = ortho_matrix(shape=(self.in_dim, self.filt_count),
                              gain=W_scale)
        W_init = W_init.astype(theano.config.floatX)
        W = theano.shared(value=W_init, name="{0:s}_W".format(name))
    if b is None:
        b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
        b = theano.shared(value=b_init, name="{0:s}_b".format(name))
    # Set layer weights and biases
    self.W = W
    self.b = b
    # Compute linear "pre-activation" for this layer
    self.linear_output = T.dot(self.noisy_input, self.W) + self.b
    # Add noise to the pre-activation features (if desired)
    self.noisy_linear = self.linear_output + (self.bias_noise[0] *
            self.rng.normal(size=self.linear_output.shape, avg=0.0,
                            std=1.0, dtype=theano.config.floatX))
    # Apply activation function
    self.output = self.activation(self.noisy_linear)
    # Mean squared pre-activation, available as an activity regularizer
    self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size
    # Conveniently package layer parameters
    self.params = [self.W, self.b, self.b_in, self.s_in]
    # Layer construction complete...
    return
def __init__(self, \
        rng=None, \
        Xd=None, \
        prior_sigma=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build an inference network that maps inputs Xd to the mean and
    log-variance of a diagonal gaussian posterior.

    The network has three MLP parts: a "shared" trunk fed by Xd (or its
    encoding), and separate "mu" and "sigma" heads that read the trunk's
    output. Passing shared_param_dicts makes this instance a clone that
    reuses the parameters of an existing network.

    Parameters:
        rng: numpy RandomState-like seed source for this network's RNG
        Xd: symbolic input matrix (theano variable)
        prior_sigma: stored on the instance; not used in this constructor
        params: dict of configuration options (layer configs, dropout and
                noise rates, optional encoder/decoder, etc.)
        shared_param_dicts: parameter dicts from an existing network, or
                None to create fresh parameters

    NOTE(review): this code relies on Python 2 semantics — len() is
    called on the result of zip(), and several configs are assumed to be
    lists; materialize with list(zip(...)) if porting to Python 3.
    """
    # Setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xd = Xd
    self.prior_sigma = prior_sigma
    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    self.lam_l2a = params['lam_l2a']
    # each optional param falls back to a default when absent
    if 'build_theano_funcs' in params:
        self.build_theano_funcs = params['build_theano_funcs']
    else:
        self.build_theano_funcs = True
    if 'vis_drop' in params:
        self.vis_drop = params['vis_drop']
    else:
        self.vis_drop = 0.0
    if 'hid_drop' in params:
        self.hid_drop = params['hid_drop']
    else:
        self.hid_drop = 0.0
    if 'input_noise' in params:
        self.input_noise = params['input_noise']
    else:
        self.input_noise = 0.0
    if 'bias_noise' in params:
        self.bias_noise = params['bias_noise']
    else:
        self.bias_noise = 0.0
    if 'init_scale' in params:
        self.init_scale = params['init_scale']
    else:
        self.init_scale = 1.0
    if 'encoder' in params:
        # user supplied an encoder/decoder pair; inputs get encoded
        # before entering the network
        self.encoder = params['encoder']
        self.decoder = params['decoder']
        self.use_encoder = True
        self.Xd_encoded = self.encoder(self.Xd)
    else:
        # no encoder: use identity transforms
        self.encoder = lambda x: x
        self.decoder = lambda x: x
        self.use_encoder = False
        self.Xd_encoded = self.encoder(self.Xd)
    if 'kld2_scale' in params:
        self.kld2_scale = params['kld2_scale']
    else:
        self.kld2_scale = 0.0
    if 'sigma_init_scale' in params:
        self.sigma_init_scale = params['sigma_init_scale']
    else:
        self.sigma_init_scale = 1.0
    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of an inference network, with all
    # of the network parameters shared between clones.
    if shared_param_dicts is None:
        # This is not a clone, and we will need to make a dict for
        # referring to the parameters of each network layer
        self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        self.is_clone = False
    else:
        # This is a clone, and its layer parameters can be found by
        # referring to the given param dict (i.e. shared_param_dicts).
        self.shared_param_dicts = shared_param_dicts
        self.is_clone = True
    # Get the configuration/prototype for this network. The config is a
    # list of layer descriptions, including a description for the input
    # layer, which is typically just the dimension of the inputs. So, the
    # depth of the mlp is one less than the number of layer configs.
    self.shared_config = params['shared_config']
    self.mu_config = params['mu_config']
    self.sigma_config = params['sigma_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun
    #########################################
    # Initialize the shared part of network #
    #########################################
    self.shared_layers = []
    # pair up consecutive layer configs: (in_def, out_def) per layer
    layer_def_pairs = zip(self.shared_config[:-1],self.shared_config[1:])
    layer_num = 0
    # Construct input to the inference network
    if self.use_encoder:
        next_input = self.encoder(self.Xd)
    else:
        next_input = self.Xd
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "share_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0
        # Select the appropriate noise to add to this layer: visible-unit
        # dropout and input noise only on the first layer, hidden-unit
        # dropout and bias noise on the rest
        if first_layer:
            d_rate = self.vis_drop
        else:
            d_rate = self.hid_drop
        if first_layer:
            i_noise = self.input_noise
            b_noise = 0.0
        else:
            i_noise = 0.0
            b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    name=l_name, W_scale=i_scale)
            self.shared_layers.append(new_layer)
            # record this layer's params so clones can share them
            self.shared_param_dicts['shared'].append( \
                    {'W': new_layer.W, 'b': new_layer.b, \
                     'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['shared'][layer_num]
            # back-compat: older saved dicts may lack b_in/s_in entries
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    W=init_params['W'], b=init_params['b'], \
                    b_in=init_params['b_in'], s_in=init_params['s_in'], \
                    name=l_name, W_scale=i_scale)
            self.shared_layers.append(new_layer)
            # backfill freshly created b_in/s_in into the shared dict
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.shared_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1
    #####################################
    # Initialize the mu part of network #
    #####################################
    self.mu_layers = []
    layer_def_pairs = zip(self.mu_config[:-1],self.mu_config[1:])
    layer_num = 0
    # Take input from the output of the shared network
    next_input = self.shared_layers[-1].output
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "mu_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0
        # Select the appropriate noise to add to this layer (the mu head
        # always uses hidden-layer noise settings)
        d_rate = self.hid_drop
        i_noise = 0.0
        b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    name=l_name, W_scale=i_scale)
            self.mu_layers.append(new_layer)
            self.shared_param_dicts['mu'].append( \
                    {'W': new_layer.W, 'b': new_layer.b, \
                     'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['mu'][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    W=init_params['W'], b=init_params['b'], \
                    b_in=init_params['b_in'], s_in=init_params['s_in'], \
                    name=l_name, W_scale=i_scale)
            self.mu_layers.append(new_layer)
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.mu_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1
    ########################################
    # Initialize the sigma part of network #
    ########################################
    self.sigma_layers = []
    layer_def_pairs = zip(self.sigma_config[:-1],self.sigma_config[1:])
    layer_num = 0
    # Take input from the output of the shared network
    next_input = self.shared_layers[-1].output
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "sigma_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0
        # Select the appropriate noise to add to this layer
        d_rate = self.hid_drop
        i_noise = 0.0
        b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if last_layer:
            # set in-bound weights for logvar predictions to 0, so the
            # initial predicted log-variances are all zero
            i_scale = 0.0 * i_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    name=l_name, W_scale=i_scale)
            self.sigma_layers.append(new_layer)
            self.shared_param_dicts['sigma'].append( \
                    {'W': new_layer.W, 'b': new_layer.b, \
                     'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['sigma'][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                    in_dim=in_dim, out_dim=out_dim, \
                    W=init_params['W'], b=init_params['b'], \
                    b_in=init_params['b_in'], s_in=init_params['s_in'], \
                    name=l_name, W_scale=i_scale)
            self.sigma_layers.append(new_layer)
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.sigma_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1
    # Create a shared parameter for rescaling posterior "sigmas" to allow
    # control over the velocity of the markov chain generated by repeated
    # cycling through the INF -> GEN loop.
    if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
        # we use a hack-ish check to remain compatible with loading models
        # that were saved before the addition of the sigma_scale param.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.sigma_scale = theano.shared(value=zero_ary)
        new_dict = {'sigma_scale': self.sigma_scale}
        self.shared_param_dicts['sigma'].append(new_dict)
        self.set_sigma_scale(1.0)
    else:
        # this is a clone of some other InfNet, and that InfNet was made
        # after adding the sigma_scale param, so use its sigma_scale
        self.sigma_scale = \
                self.shared_param_dicts['sigma'][-1]['sigma_scale']
    # Create a shared parameter for maintaining an exponentially decaying
    # estimate of the population mean of posterior KL divergence.
    if not ('kld_mean' in self.shared_param_dicts['sigma'][-1]):
        # add a kld_mean if none was already present
        # (initialized at 100.0 — presumably a deliberately high starting
        # estimate; confirm against the code that consumes kld_mean)
        zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
        self.kld_mean = theano.shared(value=zero_ary)
        self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean
    else:
        # use a kld_mean that's already present
        self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean']
    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for layer in self.shared_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.mu_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.sigma_layers:
        self.mlp_params.extend(layer.params)
    # The output of this inference network is given by the noisy output
    # of the final layers of its mu and sigma networks.
    self.output_mean = self.mu_layers[-1].linear_output
    self.output_logvar = self.sigma_layers[-1].linear_output
    # sigma = scale * exp(0.5 * logvar), i.e. a rescaled std deviation
    self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
            T.exp(0.5 * self.output_logvar)
    # We'll also construct an output containing a single samples from each
    # of the distributions represented by the rows of self.output_mean and
    # self.output_sigma.
    self.output = self._construct_post_samples()
    self.out_dim = self.sigma_layers[-1].out_dim
    # Get simple regularization penalty to moderate activation dynamics
    self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
    # Construct a function for penalizing KL divergence between the
    # approximate posteriors produced by this model and some isotropic
    # Gaussian distribution.
    self.kld_cost = self._construct_kld_cost()
    # exponential moving average update (0.98 decay) for the KL estimate
    self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \
            (0.02 * T.mean(self.kld_cost)), 'floatX')
    # Construct a theano function for sampling from the approximate
    # posteriors inferred by this model for some collection of points
    # in the "data space".
    if self.build_theano_funcs:
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function([self.Xd], \
                outputs=self.output_mean)
    else:
        # theano function compilation disabled by config
        self.sample_posterior = None
        self.mean_posterior = None
    return