def compare_speed():
    """Profile uniform and normal sampling with MRG and CURAND streams.

    Four Theano functions are compiled with profiling labels; each takes
    no inputs, returns nothing, and refreshes one shared vector of
    ``1000 * 100`` elements through an update.  Every compiled graph is
    debug-printed, after which all four functions are called 100 times in
    round-robin order.  ``time_thunks`` is switched off during the first
    round so that start-up cost is not timed.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    mrg_streams = MRG_RandomStreams()
    curand_streams = CURAND_RandomStreams(234)

    n_samples = 1000 * 100
    dest = theano.shared(numpy.zeros(n_samples, dtype=theano.config.floatX))

    def compile_profiled(draw, label):
        # The freshly drawn vector overwrites `dest` on every call.
        return theano.function([], [],
                               updates={dest: draw((n_samples,))},
                               profile=label)

    samplers = (
        compile_profiled(mrg_streams.uniform, 'mrg uniform'),
        compile_profiled(curand_streams.uniform, 'crn uniform'),
        compile_profiled(mrg_streams.normal, 'mrg normal'),
        compile_profiled(curand_streams.normal, 'crn normal'),
    )

    # Show the compiled graph of every sampler before timing anything.
    for compiled in samplers:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(compiled)

    for round_idx in range(100):
        # don't time the first call, it has some startup cost
        timed = round_idx > 0
        for compiled in samplers:
            compiled.fn.time_thunks = timed
            compiled()
def compare_speed():
    """Compare the speed of MRG_RandomStreams and CURAND_RandomStreams.

    For each generator a uniform and a normal sampler are compiled as
    profiled Theano functions (no inputs, no outputs) whose single update
    writes ``1000 * 100`` new samples into a shared vector.  The graphs
    are debug-printed and then the samplers are run for 100 rounds, with
    thunk timing enabled from the second round onwards.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    size = 1000 * 100
    dest = theano.shared(numpy.zeros(size, dtype=theano.config.floatX))

    # (sampling method, profile label) in the order they are compiled and run
    specs = [
        (mrg.uniform, 'mrg uniform'),
        (crn.uniform, 'crn uniform'),
        (mrg.normal, 'mrg normal'),
        (crn.normal, 'crn normal'),
    ]
    functions = []
    for sample, label in specs:
        fresh = sample((size,))
        functions.append(
            theano.function([], [], updates={dest: fresh}, profile=label))

    for fn in functions:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for i in range(100):
        for fn in functions:
            # don't time the first call, it has some startup cost
            fn.fn.time_thunks = (i > 0)
            fn()
def check_normal_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_normal_basic(shape_as_symbolic, dim_as_symbolic=False)

    Basic sanity check for the `normal` method of a
    `CURAND_RandomStreams` object (seeded with 234).

    Two `normal` variables of shape (10, 10) are compiled into separate
    functions and each function is evaluated three times.  The check
    asserts that

    * the first two draws of each function differ in every element
    * the first draws of the two functions differ in every element
    * every draw of the first function has shape (10, 10), is not
      constant, and has a mean within [-0.5, 0.5]

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.
    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # the whole shape is a TensorConstant with the value (10, 10)
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # only one dimension is symbolic, the other is a plain int
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    u0 = rng.normal(shape)
    u1 = rng.normal(shape)
    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    v0list = [f0() for _ in range(3)]
    v1list = [f1() for _ in range(3)]

    # successive draws, and draws from the two variables, must differ
    first0, second0 = v0list[0], v0list[1]
    first1, second1 = v1list[0], v1list[1]
    assert numpy.all(first0 != second0)
    assert numpy.all(first1 != second1)
    assert numpy.all(first0 != first1)

    for draw in v0list:
        assert draw.shape == (10, 10)
        assert draw.min() < draw.max()
        assert -.5 <= draw.mean() <= .5
def check_normal_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_normal_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `normal` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

    * have a mean in the right neighbourhood (between -0.5 and 0.5)
    * are of the requested (10, 10) shape and are not constant
    * differ element-wise between successive calls and between the two
      variables created from the same stream

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.
    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    stream = CURAND_RandomStreams(234)

    # Fully symbolic shape, partly symbolic shape, or plain ints.
    shape = (
        constant((10, 10)) if shape_as_symbolic
        else ((10, constant(10)) if dim_as_symbolic else (10, 10))
    )

    u0 = stream.normal(shape)
    u1 = stream.normal(shape)
    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    v0list = [f0() for _ in range(3)]
    v1list = [f1() for _ in range(3)]

    # assert that elements are different in a few ways
    for left, right in ((v0list[0], v0list[1]),
                        (v1list[0], v1list[1]),
                        (v0list[0], v1list[0])):
        assert numpy.all(left != right)

    for sample in v0list:
        assert sample.shape == (10, 10)
        assert sample.min() < sample.max()
        assert -.5 <= sample.mean() <= .5
def sampler(self, mu, log_sigma):
    """Return ``mu`` perturbed by scaled Gaussian noise.

    A CURAND stream is used when the configured Theano device name
    contains ``"gpu"``; otherwise the shared random-streams generator is
    used.  Both are seeded with ``seed``, which is not a parameter and
    must be available in the enclosing scope.

    Parameters
    ----------
    mu : symbolic tensor
        Centre of the sample; its shape is also the shape of the noise.
    log_sigma : symbolic tensor
        Enters the noise scale through ``exp(0.5 * log_sigma)``.

    Returns
    -------
    symbolic tensor
        ``mu + (exp(0.5 * log_sigma) - 1) * eps * 0.5`` with ``eps`` drawn
        from ``srng.normal(mu.shape)``.
    """
    on_gpu = "gpu" in theano.config.device
    if not on_gpu:
        srng = T.shared_randomstreams.RandomStreams(seed=seed)
    else:
        from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams
        srng = CURAND_RandomStreams(seed=seed)

    eps = srng.normal(mu.shape)

    # Reparametrize
    # NOTE(review): the "- 1" and the 0.5 factor differ from the usual
    # mu + sigma * eps form -- confirm this scaling is intentional.
    noise_scale = T.exp(0.5 * log_sigma) - 1
    perturbation = noise_scale * eps * 5e-1
    return mu + perturbation
class DiscLayer(object):
    """Single-output layer with a softly clipped linear response.

    The output is ``20 * tanh((input . W + b) / 20)`` where ``W`` has
    shape ``(in_dim, 1)`` and ``b`` has shape ``(1,)`` when they are
    created here.

    Parameters
    ----------
    rng : object providing ``randint`` and ``normal(size=...)``
        Used to seed this layer's symbolic random stream and to draw the
        initial weights (presumably a numpy RandomState -- confirm).
    input : symbolic input to the layer
    in_dim : number of input features
    W, b : optional shared variables to reuse instead of new parameters
    W_scale : multiplier applied to freshly drawn weights
    """

    def __init__(self, rng, input, in_dim, W=None, b=None, W_scale=1.0):
        # Layer-private symbolic random stream, seeded from the given rng
        self.rng = RandStream(rng.randint(1000000))

        self.input = input
        self.in_dim = in_dim

        # Create parameters only for those that were not handed in
        if W is None:
            drawn = rng.normal(size=(self.in_dim, 1))
            W_init = 1.0 * np.asarray(drawn, dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init))
        if b is None:
            b = theano.shared(
                value=np.zeros((1,), dtype=theano.config.floatX))
        self.W = W
        self.b = b

        # Affine map, squashed smoothly into (-20, 20)
        affine = T.dot(self.input, self.W) + self.b
        self.linear_output = 20.0 * T.tanh(affine / 20.0)

        # No further non-linearity: the output is the pre-activation
        self.output = self.linear_output

        # Sum of squared outputs divided by the first output dimension,
        # for regularization
        squared_total = T.sum(self.output**2.0)
        self.act_l2_sum = squared_total / self.output.shape[0]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return ``P`` plus zero-mean Gaussian noise of std ``noise_lvl``."""
        noise = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                dtype=theano.config.floatX)
        return P + noise
class DiscLayer(object):
    """Layer mapping its input to one soft-bounded linear output.

    Computes ``20 * tanh((dot(input, W) + b) / 20)``.  Freshly created
    weights have shape ``(in_dim, 1)`` and the bias has shape ``(1,)``.

    Parameters
    ----------
    rng : source of ``randint`` and ``normal(size=...)`` draws
        Seeds the layer's own random stream and initialises ``W``
        (presumably a numpy RandomState -- confirm against callers).
    input : symbolic input to the layer
    in_dim : number of input features
    W, b : optional pre-existing shared parameters
    W_scale : scale factor for newly drawn weights
    """

    def __init__(self, rng, input, in_dim, W=None, b=None, W_scale=1.0):
        # Setup a shared random generator for this layer
        stream_seed = rng.randint(1000000)
        self.rng = RandStream(stream_seed)

        self.input = input
        self.in_dim = in_dim

        float_type = theano.config.floatX
        if W is None:
            # Random initial weights, scaled by W_scale
            W_init = 1.0 * np.asarray(rng.normal(size=(self.in_dim, 1)),
                                      dtype=float_type)
            W = theano.shared(value=(W_scale * W_init))
        if b is None:
            b_init = np.zeros((1,), dtype=float_type)
            b = theano.shared(value=b_init)

        self.W, self.b = W, b

        # Linear response, smoothly limited to the range (-20, 20)
        pre_activation = T.dot(self.input, self.W) + self.b
        self.linear_output = 20.0 * T.tanh(pre_activation / 20.0)
        self.output = self.linear_output

        # Squared-output sum divided by the first output dimension
        self.act_l2_sum = T.sum(self.output**2.0) / self.output.shape[0]

        self.params = [self.W, self.b]

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # The noise is passed through DCG before being added
        # (presumably a disconnected-gradient wrapper -- confirm).
        gaussian = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P + DCG(gaussian)
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    The network is a shared stack of layers feeding two separate stacks: one
    producing the posterior mean and one producing the posterior log-variance.

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputting observable data
        Xc: symbolic input matrix for inputting control data
        Xm: symbolic input matrix for a mask on which values to take
                    from Xc and which to take from Xd
        prior_sigma: standard deviation of isotropic Gaussian prior that our
                     inferred posteriors will be penalized for deviating from.
        params: a dict of parameters describing the desired ensemble:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
                -- note: input_noise/bias_noise are optional, default 0.0
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
                -- optional, defaults to relu_actfun
            init_scale: scaling factor for hidden layer weights, passed to
                        each layer as W_scale (optional, default 1.0)
        shared_param_dicts: parameters for the MLP controlled by this InfNet.
            When given, this net is a "clone" that reuses those parameters.
    """

    def __init__(self,
                 rng=None,
                 Xd=None,
                 Xc=None,
                 Xm=None,
                 prior_sigma=None,
                 params=None,
                 shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrices
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        # Construct input by combining data input and control input, taking
        # unmasked values from data input and others from the control input
        net_input = ((1.0 - self.Xm) * self.Xd) + (self.Xm * self.Xc)
        self.shared_layers = self._build_layers(
            rng, 'shared', "share_layer_{0:d}", self.shared_config,
            net_input, first_takes_raw_input=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        self.mu_layers = self._build_layers(
            rng, 'mu', "mu_layer_{0:d}", self.mu_config,
            self.shared_layers[-1].output)
        ########################################
        # Initialize the sigma part of network #
        ########################################
        self.sigma_layers = self._build_layers(
            rng, 'sigma', "sigma_layer_{0:d}", self.sigma_config,
            self.shared_layers[-1].output)

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for stack in (self.shared_layers, self.mu_layers, self.sigma_layers):
            for layer in stack:
                self.mlp_params.extend(layer.params)

        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mu = self.mu_layers[-1].noisy_linear
        self.output_logvar = self.sigma_layers[-1].noisy_linear
        self.output_sigma = T.exp(0.5 * self.output_logvar)
        # We'll also construct an output containing a single sample from each
        # of the distributions represented by the rows of self.output_mu and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function([self.Xd, self.Xc, self.Xm],
                                              outputs=self.output_mu)
        return

    @staticmethod
    def _layer_dims(in_def, out_def):
        """
        Translate a pair of layer descriptions into (in_dim, out_dim,
        pool_size). A list/tuple description carries the dimension first;
        for the output description the second entry is the pool size.
        A plain description is the dimension itself, with no pooling (0).
        """
        if isinstance(in_def, (list, tuple)):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if isinstance(out_def, (list, tuple)):
            # Applying some sort of pooling in this layer...
            out_dim, pool_size = out_def[0], out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim, pool_size = out_def, 0
        return in_dim, out_dim, pool_size

    def _build_layers(self, rng, part, name_format, config, next_input,
                      first_takes_raw_input=False):
        """
        Build one stack of HiddenLayers ('shared', 'mu' or 'sigma') and
        return it as a list.

        Consecutive entries of config describe the input and output of each
        layer. For a non-clone, the W/b of every new layer are recorded in
        self.shared_param_dicts[part]; for a clone, they are read from it.
        When first_takes_raw_input is True the first layer gets the visible
        drop rate and input noise (and no bias noise); every other layer
        gets the hidden drop rate and bias noise.
        """
        layers = []
        # Iterate the zip directly: the previous code called len() on it,
        # which fails on Python 3 where zip returns an iterator.
        layer_def_pairs = zip(config[:-1], config[1:])
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            in_dim, out_dim, pool_size = self._layer_dims(in_def, out_def)
            # Select the appropriate noise to add to this layer
            if first_takes_raw_input and (layer_num == 0):
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            layer_args = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=name_format.format(layer_num), W_scale=self.init_scale)
            if self.is_clone:
                # Initialize the layer with some shared parameters
                init_params = self.shared_param_dicts[part][layer_num]
                layer_args['W'] = init_params['W']
                layer_args['b'] = init_params['b']
            new_layer = HiddenLayer(**layer_args)
            if not self.is_clone:
                # Remember the new parameters so clones can share them
                self.shared_param_dicts[part].append(
                    {'W': new_layer.W, 'b': new_layer.b})
            layers.append(new_layer)
            next_input = new_layer.output
        return layers

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in each net.
        """
        act_sq_sums = []
        for stack in (self.shared_layers, self.mu_layers, self.sigma_layers):
            for layer in stack:
                act_sq_sums.append(layer.act_l2_sum)
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mu and self.output_sigma.
        """
        zmuv_noise = self.rng.normal(size=self.output_sigma.shape,
                                     avg=0.0, std=1.0,
                                     dtype=theano.config.floatX)
        post_samples = self.output_mu + (self.output_sigma * zmuv_noise)
        return post_samples

    def _construct_kld_cost(self):
        """
        Compute (analytically) the KL divergence between each approximate
        posterior encoded by self.output_mu/self.output_logvar and the
        isotropic Gaussian distribution with mean 0 and standard deviation
        self.prior_sigma. The result is summed over axis 1 with keepdims.
        """
        prior_sigma_sq = self.prior_sigma**2.0
        prior_log_sigma_sq = np.log(prior_sigma_sq)
        kld_cost = 0.5 * T.sum(((self.output_mu**2.0 / prior_sigma_sq) +
                (T.exp(self.output_logvar) / prior_sigma_sq) -
                (self.output_logvar - prior_log_sigma_sq) - 1.0),
                axis=1, keepdims=True)
        return kld_cost

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function([self.Xd, self.Xc, self.Xm],
                                  outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all hidden layers to some constant. The
        final layers of the mu and sigma stacks are left untouched.
        """
        hidden_stacks = (self.shared_layers,
                         self.mu_layers[:-1],
                         self.sigma_layers[:-1])
        for stack in hidden_stacks:
            for layer in stack:
                # Zero out a copy of the bias, then shift it to b_init
                b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
                layer.b.set_value(b_vec)
        return

    def shared_param_clone(self, rng=None, Xd=None, Xc=None, Xm=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                           prior_sigma=self.prior_sigma, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net
class GenFCModule(object):
    """
    Generator module: random values -> fully connected layer (relu) ->
    linear transform, with an optional final relu.

    Parameters:
        rand_dim: width of the random input fed to the first transform
        out_dim: width of the module output
        fc_dim: width of the hidden fully connected layer
        apply_bn_1: whether to batch-normalize the hidden layer
        apply_bn_2: whether to batch-normalize the output transform
        init_func: weight initializer; inits.Normal(scale=0.02) if None
        rand_type: 'normal' for N(0, 1) inputs; any other value gives
                   uniform inputs on [-1, 1]
        final_relu: whether to apply relu to the output
        mod_name: prefix used when naming the parameters
    """

    def __init__(self,
                 rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True,
                 mod_name='dm_fc'):
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream with a fixed seed
        self.rng = RandStream(123)
        self.init_func = (
            inits.Normal(scale=0.02) if init_func is None else init_func
        )
        # create the weights (and batch-norm gains/biases, if used)
        self._init_params()

    def _init_params(self):
        """
        Create the two weight matrices and, for each transform that is
        batch normed, a gain and a bias; collect them in self.params.
        """
        def named(suffix):
            return "{}_{}".format(self.mod_name, suffix)

        self.w1 = self.init_func((self.rand_dim, self.fc_dim), named("w1"))
        self.w2 = self.init_func((self.fc_dim, self.out_dim), named("w2"))
        self.params = [self.w1, self.w2]
        # gains start near 1, biases at 0, for transforms that get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(self.fc_dim, named("g1"))
            self.b1 = make_bias(self.fc_dim, named("b1"))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain(self.out_dim, named("g2"))
            self.b2 = make_bias(self.out_dim, named("b2"))
            self.params.extend([self.g2, self.b2])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        With rand_vals omitted, a (batch_size, rand_dim) block of random
        values is drawn from this module's stream. Given rand_vals are
        reshaped to (rand_vals.shape[0], rand_dim). Returns the output of
        the second transform.
        """
        assert (batch_size is not None) or (rand_vals is not None), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_vals = rand_vals.reshape((rand_vals.shape[0], self.rand_dim))
        elif self.rand_type == 'normal':
            rand_vals = self.rng.normal(size=(batch_size, self.rand_dim),
                                        avg=0.0, std=1.0,
                                        dtype=theano.config.floatX)
        else:
            rand_vals = self.rng.uniform(size=(batch_size, self.rand_dim),
                                         low=-1.0, high=1.0,
                                         dtype=theano.config.floatX)
        # random values -> fc layer
        hidden = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # fc layer -> output
        result = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            result = batchnorm(result, g=self.g2, b=self.b2)
        if self.final_relu:
            result = relu(result)
        return result
class WalkoutModel(object): """ Controller for training a forwards-backwards chainy model. Parameters: rng: numpy.random.RandomState (for reproducibility) x_out: the goal state for forwards-backwards walking process p_z_given_x: InfNet for stochastic part of step p_x_given_z: HydraNet for deterministic part of step params: REQUIRED PARAMS SHOWN BELOW x_dim: dimension of observations to construct z_dim: dimension of latent space for policy wobble walkout_steps: number of steps to walk out x_type: can be "bernoulli" or "gaussian" x_transform: can be 'none' or 'sigmoid' """ def __init__(self, rng=None, x_out=None, \ p_z_given_x=None, \ p_x_given_z=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this WalkoutModel self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_dim = self.params['x_dim'] self.z_dim = self.params['z_dim'] self.walkout_steps = self.params['walkout_steps'] self.x_type = self.params['x_type'] self.shared_param_dicts = shared_param_dicts if 'x_transform' in self.params: assert((self.params['x_transform'] == 'sigmoid') or \ (self.params['x_transform'] == 'none')) if self.params['x_transform'] == 'sigmoid': self.x_transform = lambda x: T.nnet.sigmoid(x) else: self.x_transform = lambda x: x else: self.x_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.x_transform = lambda x: T.nnet.sigmoid(x) assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) assert((self.step_type == 'add') or (self.step_type == 'jump')) # grab handles to the relevant networks self.p_z_given_x = p_z_given_x self.p_x_given_z = p_x_given_z # record the symbolic variables that will provide inputs to the # computation graph created for this WalkoutModel self.x_out = x_out # target output for generation self.zi_zmuv = T.tensor3() # ZMUV gauss noise for walk-out wobble if self.shared_param_dicts is None: # initialize the parameters "owned" by this model zero_ary = to_fX( 
np.zeros((1,)) ) self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar') self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0]) self.shared_param_dicts = {} self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: # grab the parameters required by this model from a given dict self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0]) ############################################################### # Setup the forwards (i.e. training) walk-out loop using scan # ############################################################### def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw): # get samples of next zi, according to the forwards model zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \ do_samples=False) zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv) # check reverse direction probability p(xi_fw | zi_fw) xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \ do_samples=False) xi_bw_mean = self.x_transform(xi_bw_mean) nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \ log_vars=xi_bw_logvar, mask=None) nll_xi_bw = nll_xi_bw.flatten() # get samples of next xi, according to the forwards model xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \ do_samples=False) xi_fw_mean = self.x_transform(xi_fw_mean) xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv) # check reverse direction probability p(zi_fw | xi_fw) zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \ do_samples=False) nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \ log_vars=zi_bw_logvar, mask=None) nll_zi_bw = nll_zi_bw.flatten() # each loop iteration produces the following values: # xi_fw: xi generated fom zi by forwards walk # zi_fw: zi generated fom xi by forwards walk # xi_fw_mean: ---- # xi_fw_logvar: ---- # zi_fw_mean: ---- # zi_fw_logvar: ---- # nll_xi_bw: NLL for reverse step zi_fw -> xi_fw # nll_zi_bw: NLL for reverse step xi_fw -> zi_fw return xi_fw, zi_fw, xi_fw_mean, 
xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw # initialize states for x/z self.x0 = self.x_out self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim) # setup initial values to pass to scan op outputs_init = [self.x0, self.z0, None, None, None, None, None, None] sequences_init = [self.xi_zmuv, self.zi_zmuv] # apply scan op for the sequential imputation loop self.scan_results, self.scan_updates = theano.scan(forwards_loop, \ outputs_info=outputs_init, \ sequences=sequences_init) # grab results of the scan op. all values are computed for each step self.xi = self.scan_results[0] self.zi = self.scan_results[1] self.xi_fw_mean = self.scan_results[2] self.xi_fw_logvar = self.scan_results[3] self.zi_fw_mean = self.scan_results[4] self.zi_fw_logvar = self.scan_results[5] self.nll_xi_bw = self.scan_results[6] self.nll_zi_bw = self.scan_results[7] ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX( np.zeros((1,)) ) self.lr = theano.shared(value=zero_ary, name='srr_lr') # shared var momentum parameters for ADAM optimization self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared vars for weighting prior kld against reconstruction self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p') self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q') self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g') self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s') self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, 
name='srr_lam_l2w') self.set_lam_l2w(1e-5) # grab all of the "optimizable" parameters from the base networks self.joint_params = [self.s0, self.obs_logvar, self.step_scales] self.joint_params.extend(self.p_zi_given_xi.mlp_params) self.joint_params.extend(self.p_sip1_given_zi.mlp_params) self.joint_params.extend(self.p_x_given_si.mlp_params) self.joint_params.extend(self.q_zi_given_xi.mlp_params) ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0) self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \ (self.lam_kld_q[0] * self.kld_q) + \ (self.lam_kld_g[0] * self.kld_g) + \ (self.lam_kld_s[0] * self.kld_s) self.kld_cost = T.mean(self.kld_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs self.nll_cost = T.mean(self.nll_costs) self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel() self.nll_bound = T.mean(self.nll_bounds) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost ############################## # CONSTRUCT A PER-TRIAL COST # ############################## self.obs_costs = self.nll_costs + self.kld_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.joint_updates = get_adam_updates(params=self.joint_params, \ grads=self.joint_grads, alpha=self.lr, \ beta1=self.mom_1, 
beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) for k, v in self.scan_updates.items(): self.joint_updates[k] = v # Construct theano functions for training and diagnostic computations print("Compiling cost computer...") self.compute_raw_costs = self._construct_raw_costs() print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling sequence sampler...") self.sequence_sampler = self._construct_sequence_sampler() # make easy access points for some interesting parameters #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W return def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. """ zero_ary = np.zeros((1,)) # set learning rate new_lr = zero_ary + lr self.lr.set_value(to_fX(new_lr)) # set momentums (use first and second order "momentum") new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0): """ Set the relative weight of prior KL-divergence vs. data likelihood. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_kld_p self.lam_kld_p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q self.lam_kld_q.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_g self.lam_kld_g.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_s self.lam_kld_s.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. 
""" if (switch_val < 0.5): switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1,)) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def _construct_zi_zmuv(self, xo): """ Construct the necessary ZMUV gaussian samples for generating trajectories from this WalkoutModel, for input matrix xo. """ zi_zmuv = self.rng.normal( \ size=(self.total_steps, xo.shape[0], self.z_dim), \ avg=0.0, std=1.0, dtype=theano.config.floatX) return zi_zmuv def _construct_rev_masks(self, xo): """ Compute the sequential revelation masks for the input batch in xo. -- We need to construct mask sequences for both p and q. """ if self.use_rev_masks: # make batch copies of self.rev_masks_p and self.rev_masks_q pmasks = self.rev_masks_p.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1) qmasks = self.rev_masks_q.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1) else: pm_list = [] qm_list = [] # make a zero mask that does nothing zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1]) # generate independently sampled masks for each revelation block for rb in self.rev_sched: # make a random binary mask with ones at rate rb[1] rand_vals = self.rng.uniform( \ size=(1, xo.shape[0], xo.shape[1]), \ low=0.0, high=1.0, dtype=theano.config.floatX) rand_mask = rand_vals < rb[1] # append the masks for this revleation block to the mask lists # # the guide policy (in q) gets to peek at the values that will be # revealed to the primary policy (in p) for the entire block. The # primary policy only gets to see these values at end of the final # step of the block. Within a given step, values are revealed to q # at the beginning of the step, and to p at the end. # # e.g. in a revelation block with only a single step, the guide # policy sees the values at the beginning of the step, which allows # it to guide the step. the primary policy only gets to see the # values at the end of the step. # # i.e. 
a standard variational auto-encoder is equivalent to a # sequential revelation and refinement model with only one # revelation block, which has one step and a reveal rate of 1.0. # for refine_step in range(rb[0]-1): pm_list.append(zero_mask) qm_list.append(rand_mask) pm_list.append(rand_mask) qm_list.append(rand_mask) # concatenate each mask list into a 3-tensor pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX') qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX') return [pmasks, qmasks] def _construct_nll_costs(self, si, xo, nll_mask): """ Construct the negative log-likelihood part of free energy. -- only check NLL where nll_mask == 1 """ xh = self._from_si_to_x( si ) if self.x_type == 'bernoulli': ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask) else: ll_costs = log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar, mask=nll_mask) nll_costs = -ll_costs.flatten() return nll_costs def _construct_kld_s(self, s_i, s_j): """ Compute KL(s_i || s_j) -- assuming bernoullish outputs """ x_i = self._from_si_to_x( s_i ) x_j = self._from_si_to_x( s_j ) kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + \ ((1.0 - x_i) * (T.log(1.0-x_i) - T.log(1.0-x_j))) sum_kld = T.sum(kld_s, axis=1) return sum_kld def _construct_kld_costs(self, p=1.0): """ Construct the policy KL-divergence part of cost to minimize. """ kld_pis = [] kld_qis = [] kld_gis = [] kld_sis = [] s0 = 0.0*self.si[0] + self.s0 for i in range(self.total_steps): kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1)) kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1)) kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1)) if i == 0: kld_sis.append(self._construct_kld_s(self.si[i], s0)) else: kld_sis.append(self._construct_kld_s(self.si[i], self.si[i-1])) # compute the batch-wise costs kld_pi = sum(kld_pis) kld_qi = sum(kld_qis) kld_gi = sum(kld_gis) kld_si = sum(kld_sis) return [kld_pi, kld_qi, kld_gi, kld_si] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. 
for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params]) return param_reg_cost def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # construct values to output nll = self.nll_costs.flatten() kld = self.kld_q.flatten() # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function(inputs=[ xo ], \ outputs=[nll, kld], \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.scan_updates, \ on_unused_input='ignore') # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XO, sample_count=20, use_guide_policy=True): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XO.shape[0],)) kld_sum = np.zeros((XO.shape[0],)) for i in range(sample_count): result = fe_term_sample(XO) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) if not use_guide_policy: # no KLd if samples are from the primary policy... mean_kld = 0.0 * mean_kld return [mean_nll, mean_kld] return fe_term_estimator def _construct_raw_costs(self): """ Construct all the raw, i.e. not weighted by any lambdas, costs. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # compile theano function for computing the costs all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g] cost_func = theano.function(inputs=[ xo ], \ outputs=all_step_costs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.scan_updates, \ on_unused_input='ignore') # make a function for computing batch-based estimates of costs. # _step_nlls: the expected NLL cost for each step # _step_klds: the expected KL(q||p) cost for each step # _kld_q2p: the expected KL(q||p) cost for each latent dim # _kld_p2q: the expected KL(p||q) cost for each latent dim # _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim def raw_cost_computer(XO): _all_costs = cost_func(to_fX(XO)) _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0) _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0) _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0) _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1) _step_klds = to_fX( np.asarray([k for k in _step_klds]) ) _step_nlls = np.mean(_all_costs[0], axis=1) _step_nlls = to_fX( np.asarray([k for k in _step_nlls]) ) results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g] return results return raw_cost_computer def _construct_train_joint(self): """ Construct theano function to train all networks jointly. 
""" # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_bound, self.nll_cost, \ self.kld_cost, self.reg_cost, self.obs_costs] # compile the theano function func = theano.function(inputs=[ xo ], \ outputs=outputs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.joint_updates, \ on_unused_input='ignore') return func def _construct_sequence_sampler(self): """ Construct theano function to train all networks jointly. """ # setup some symbolic variables for theano to deal with xo = T.matrix() zizmuv = self._construct_zi_zmuv(xo) pmasks, qmasks = self._construct_rev_masks(xo) # collect the outputs to return from this function states = [self._from_si_to_x(self.s0_full)] + \ [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)] masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)] outputs = states + masks # compile the theano function func = theano.function(inputs=[ xo ], \ outputs=outputs, \ givens={self.x_out: xo, \ self.zi_zmuv: zizmuv, \ self.p_masks: pmasks, \ self.q_masks: qmasks}, \ updates=self.joint_updates, \ on_unused_input='ignore') # visualize trajectories generated by the model def sample_func(XO, use_guide_policy=False): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from the guide policy self.set_train_switch(switch_val=1.0) else: # take samples from the primary policy self.set_train_switch(switch_val=0.0) # get belief states and masks generated by the scan loop scan_vals = func(to_fX(XO)) step_count = self.total_steps + 1 seq_shape = (step_count, XO.shape[0], XO.shape[1]) xm_seq = np.zeros(seq_shape).astype(theano.config.floatX) xi_seq = np.zeros(seq_shape).astype(theano.config.floatX) mi_seq 
= np.zeros(seq_shape).astype(theano.config.floatX) for i in range(step_count): _xi = scan_vals[i] _mi = scan_vals[i + step_count] _xm = (_mi * XO) + ((1.0 - _mi) * _xi) xm_seq[i,:,:] = _xm xi_seq[i,:,:] = _xi mi_seq[i,:,:] = _mi # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return [xm_seq, xi_seq, mi_seq] return sample_func def save_to_file(self, f_name=None): """ Dump important stuff to a Python pickle, so that we can reload this model later. """ assert(not (f_name is None)) f_handle = file(f_name, 'wb') # dump the dict self.params, which just holds "simple" python values cPickle.dump(self.params, f_handle, protocol=-1) # make a copy of self.shared_param_dicts, with numpy arrays in place # of the theano shared variables numpy_param_dicts = {} for key in self.shared_param_dicts: numpy_ary = self.shared_param_dicts[key].get_value(borrow=False) numpy_param_dicts[key] = numpy_ary # dump the numpy version of self.shared_param_dicts to pickle file cPickle.dump(numpy_param_dicts, f_handle, protocol=-1) # get numpy dicts for each of the "child" models that we must save child_model_dicts = {} child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict() child_model_dicts['p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict() child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict() child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict() # dump the numpy child model dicts to the pickle file cPickle.dump(child_model_dicts, f_handle, protocol=-1) f_handle.close() return
class HiddenLayer(object):
    """
    A single fully-connected layer with optional input/bias noise, input
    dropout and maxout pooling.

    The layer computes, symbolically:

        noisy_input  = dropout(input + input_noise * N(0, 1))
        linear       = dot(noisy_input, W) + b
        noisy_linear = linear + bias_noise * N(0, 1)
        output       = activation(noisy_linear)

    Parameters:
        rng: object providing `randint`; used once to seed this layer's
             RandStream (presumably a numpy.random.RandomState -- confirm)
        input: symbolic input to the layer
        in_dim: number of input features
        out_dim: number of output features (after pooling, if any)
        activation: callable applied to the pre-activation; defaults to
                    relu_actfun. Ignored when pool_size > 1 (maxout is used)
        pool_size: maxout pool size; values <= 1 disable pooling
        drop_rate: probability of dropping each input element
        input_noise: std of gaussian noise added to the input
        bias_noise: std of gaussian noise added to the pre-activation
        W, b: optional shared variables to reuse as weights/biases
        b_in, s_in: optional shared variables for input shift/scale
        name: prefix used when naming this layer's shared variables
        W_scale: gain passed to ortho_matrix when W is created here
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim, ), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # raw input scales start at 0.541325, i.e. softplus(s_in) ~= 1,
            # so the (currently disabled) softplus rescaling below would
            # start out as the identity
            ary = 0.541325 * np.ones((in_dim, ), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # allow an early shift and rescale for inputs to this layer
        #self.clean_input = T.nnet.softplus(self.s_in) * (input + self.b_in)
        # use the input directly
        # NOTE(review): with the rescaling disabled, b_in and s_in do not
        # appear in this layer's output graph even though they are listed
        # in self.params -- confirm callers never differentiate w.r.t. them.
        self.clean_input = input

        # noise levels live in length-1 shared arrays so they can be changed
        # after compilation via set_value
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.input_noise = theano.shared(value=(zero_ary + input_noise),
                name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(value=(zero_ary + bias_noise),
                name="{0:s}_bias_noise".format(name))
        # (was mistakenly named "<name>_bias_noise", which made the two
        # shared variables indistinguishable in debug output)
        self.drop_rate = theano.shared(value=(zero_ary + drop_rate),
                name="{0:s}_drop_rate".format(name))

        # Add gaussian noise to the input (if desired)
        input_jitter = self.rng.normal(size=self.clean_input.shape,
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        self.fuzzy_input = self.clean_input + \
                (self.input_noise[0] * input_jitter)
        # Apply masking noise to the input (if desired)
        self.noisy_input = self._drop_from_input(self.fuzzy_input,
                self.drop_rate[0])

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps this an integer count on Python 2 and 3
        self.pool_count = self.filt_count // max(self.pool_size, 1)

        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            # maxout pooling replaces any user-supplied activation
            self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count),
                    gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        bias_jitter = self.rng.normal(size=self.linear_output.shape,
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        self.noisy_linear = self.linear_output + \
                (self.bias_noise[0] * bias_jitter)

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Mean squared pre-activation, normalized by the output size;
        # available for use as an activation regularizer
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Apply inverted dropout to input.

        p is the probability of dropping each element of input. Surviving
        elements are rescaled by 1 / (1 - p) so that the expected value of
        the result matches the input.
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus zero-mean gaussian noise with std noise_lvl.

        Noisy weights, like convolving energy surface with a gaussian.
        """
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                dtype=theano.config.floatX)
        return P_nz
class VCGLoop(object):
    """
    Controller for training a self-looping VAE using guidance provided by
    a classifier. The classifier tries to discriminate between samples
    generated by the looped VAE while the VAE minimizes a variational
    generative model objective and also shifts mass away from regions where
    the classifier can discern that the generated data is denser than the
    training data.

    The original notes say the generator and inferencer must be InfNet
    instances (see "InfNet.py") and the discriminator a PeaNet instance
    (see "PeaNet.py"); g_net is elsewhere described as a HydraNet -- the
    code here only requires the attributes/methods it actually calls.

    Constructor arguments:
        rng: numpy.random.RandomState (for reproducibility)
        x_d: symbolic var for providing points for starting the Markov Chain
        x_t: symbolic var for providing samples from the target distribution
        i_net: the network that will serve as the inferencer
        g_net: the network that will serve as the generator
        d_net: the network that will serve as the discriminator
        chain_len: number of steps to unroll the VAE Markov Chain
        data_dim: dimension of the generated data
        z_dim: dimension of the model prior
        params: a dict of parameters for controlling various costs

    Keys read from params:
        x_type: (required) "bernoulli" or "gaussian"
        lam_l2d: (required) regularization on squared discriminator output
        xt_transform: "sigmoid" (default) or "none"; transform applied to
                      the generator output to get observation means
        logvar_bound: bound on gaussian output logvar (default 10)
        cost_decay: rate of decay for VAE costs in unrolled chain
                    (default 0.1)
        chain_type: 'walkout' (default) or 'walkback'
    """

    def __init__(self, rng=None, x_d=None, x_t=None,
                 i_net=None, g_net=None, d_net=None,
                 chain_len=None, data_dim=None, z_dim=None,
                 params=None):
        # symbolic random stream seeded from the caller's rng
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.z_dim = z_dim
        # the prior over z is a fixed N(0, I)
        self.p_z_mean = 0.0
        self.p_z_logvar = 0.0
        self.params = {} if params is None else params

        # optional settings, with their defaults
        self.cost_decay = self.params.get('cost_decay', 0.1)
        self.chain_type = self.params.get('chain_type', 'walkout')
        assert(self.chain_type in ('walkback', 'walkout'))
        xt_name = self.params.get('xt_transform', 'sigmoid')
        assert(xt_name in ('sigmoid', 'none'))
        if xt_name == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
        self.logvar_bound = self.params.get('logvar_bound', 10)
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # grab symbolic input variables
        self.x_d = x_d              # initial input for starting the chain
        self.x_t = x_t              # samples from target distribution
        self.z_zmuv = T.tensor3()   # ZMUV gaussian samples for use in scan
        # get the number of steps for chain unrolling
        self.chain_len = chain_len

        # symbolic vector of indices for inputs from target distribution
        self.It = T.arange(self.x_t.shape[0])
        # symbolic vector of indices for noise/generated inputs, which are
        # stacked below the target samples in the discriminator's input
        self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + \
                self.x_t.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, x_in=self.x_d,
                p_x_given_z=g_net, q_z_given_x=i_net,
                x_dim=self.data_dim, z_dim=self.z_dim,
                params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def chain_step_func(zi_zmuv, xim1):
            # get mean and logvar of z samples for this step
            zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
            # transform ZMUV samples to get desired samples
            zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
            # get the next generated xi (pre-transformation)
            outputs = self.GN.apply(zi)
            xti = outputs[-1]
            # apply the observation "mean" transform
            xgi = self.xt_transform(xti)
            # compute NLL for this step: 'walkout' always reconstructs the
            # chain's starting point, 'walkback' the previous chain state
            if self.chain_type == 'walkout':
                x_true = self.x_d
            else:
                x_true = xim1
            nlli = self._log_prob(x_true, xgi).flatten()
            kldi = T.sum(gaussian_kld(zi_mean, zi_logvar,
                    self.p_z_mean, self.p_z_logvar), axis=1)
            return xgi, nlli, kldi

        # apply the scan op; only xgi is fed back into the next step
        init_values = [self.x_d, None, None]
        self.scan_results, self.scan_updates = \
                theano.scan(chain_step_func, outputs_info=init_values,
                            sequences=self.z_zmuv)
        # get the outputs of the scan op
        self.xgi = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi = self.scan_results[2]
        self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng,
                Xd=T.vertical_stack(self.x_t, *self.xgi_list))

        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary,
                name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary,
                name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init adversarial cost weights for GN/DN
        self.set_disc_weights()
        # set a shared var for regularizing the output of the discriminator
        # (read through self.params, like every other option)
        self.lam_l2d = theano.shared(
                value=self._as_fx_scalar(self.params['lam_l2d']),
                name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore
        # them because the VCGLoop requires that they be "bypassed" in favor
        # of some binary classification layers managed by this VCGLoop.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in
        # the discriminator network. And, add their params to optimization
        # list (self.dn_params is extended in place).
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat independently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # inferencer networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        print("Computing VCGLoop joint_grad...")
        # grab the gradients for all parameters to optimize. Each cost is
        # differentiated once w.r.t. its whole parameter list, rather than
        # once per parameter: the discriminator follows dn_cost, while the
        # inferencer and generator follow osm_cost.
        self.joint_grads = OrderedDict()
        dn_grads = T.grad(self.dn_cost, self.dn_params)
        for p, g in zip(self.dn_params, dn_grads):
            self.joint_grads[p] = g
        osm_params = self.in_params + self.gn_params
        osm_grads = T.grad(self.osm_cost, osm_params)
        for p, g in zip(osm_params, osm_grads):
            self.joint_grads[p] = g

        # construct the updates for the discriminator, generator and
        # inferencer networks. all networks share the same first/second
        # moment momentum and iteration count. the networks each have their
        # own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_adam_updates(params=self.dn_params,
                grads=self.joint_grads, alpha=self.lr_dn,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params,
                grads=self.joint_grads, alpha=self.lr_in,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.gn_updates = get_adam_updates(params=self.gn_params,
                grads=self.joint_grads, alpha=self.lr_gn,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for net_updates in (self.dn_updates, self.in_updates,
                            self.gn_updates):
            for k in net_updates:
                self.joint_updates[k] = net_updates[k]
        # updates returned by scan (e.g. state of random streams used inside
        # the loop) must be passed to theano.function as well, otherwise
        # that state is never advanced between calls
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        print("Compiling VCGLoop train_joint...")
        # construct the function for training on training data
        self.train_joint = self._construct_train_joint()
        return

    @staticmethod
    def _as_fx_scalar(value):
        """
        Wrap a scalar as a length-1 numpy array of dtype floatX.
        """
        return (np.zeros((1,)) + value).astype(theano.config.floatX)

    def set_dn_sgd_params(self, learn_rate=0.01):
        """
        Set learning rate for the discriminator network.
        """
        self.lr_dn.set_value(self._as_fx_scalar(learn_rate))
        return

    def set_in_sgd_params(self, learn_rate=0.01):
        """
        Set learning rate for the inferencer network.
        """
        self.lr_in.set_value(self._as_fx_scalar(learn_rate))
        return

    def set_gn_sgd_params(self, learn_rate=0.01):
        """
        Set learning rate for the generator network.
        """
        self.lr_gn.set_value(self._as_fx_scalar(learn_rate))
        return

    def set_all_sgd_params(self, learn_rate=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        # set learning rates to the same value
        self.lr_dn.set_value(self._as_fx_scalar(learn_rate))
        self.lr_gn.set_value(self._as_fx_scalar(learn_rate))
        self.lr_in.set_value(self._as_fx_scalar(learn_rate))
        # set the first/second moment momentum parameters
        self.mom_1.set_value(self._as_fx_scalar(mom_1))
        self.mom_2.set_value(self._as_fx_scalar(mom_2))
        return

    def set_disc_weights(self, dweight_gn=1.0, dweight_dn=1.0):
        """
        Set weights for the adversarial classification cost.
        """
        self.dw_dn.set_value(self._as_fx_scalar(dweight_dn))
        self.dw_gn.set_value(self._as_fx_scalar(dweight_gn))
        return

    def set_lam_chain_nll(self, lam_chain_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self.lam_chain_nll.set_value(self._as_fx_scalar(lam_chain_nll))
        return

    def set_lam_chain_kld(self, lam_chain_kld=1.0):
        """
        Set the strength of regularization on KL-divergence for continuous
        posterior variables. When set to 1.0, this reproduces the standard
        role of KL(posterior || prior) in variational learning.
        """
        self.lam_chain_kld.set_value(self._as_fx_scalar(lam_chain_kld))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self.lam_l2w.set_value(self._as_fx_scalar(lam_l2w))
        return

    def _construct_zmuv_samples(self, xi, br):
        """
        Construct the necessary (symbolic) ZMUV gaussian samples for
        computing through this VCGLoop for (symbolic) input matrix xi, with
        each row of xi repeated br times.
        """
        z_zmuv = self.rng.normal(
                size=(self.chain_len, xi.shape[0]*br, self.z_dim),
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        return z_zmuv

    def _construct_disc_layers(self, rng):
        """
        Construct binary discrimination layers for each spawn-net in the
        underlying discriminator pseudo-ensemble.

        Sets self.disc_layers and self.disc_outputs, and appends the new
        layers' parameters to self.dn_params.
        """
        self.disc_layers = []
        self.disc_outputs = []
        dn_init_scale = self.DN.init_scale
        for sn in self.DN.spawn_nets:
            # construct a "binary discriminator" layer to sit on top of each
            # spawn net in the discriminator pseudo-ensemble; it reads the
            # input of the spawn net's final layer, bypassing that layer
            sn_fl = sn[-1]
            disc_layer = DiscLayer(rng=rng,
                    input=sn_fl.noisy_input, in_dim=sn_fl.in_dim,
                    W_scale=dn_init_scale)
            self.disc_layers.append(disc_layer)
            # capture the (linear) output of the DiscLayer, for possible reuse
            self.disc_outputs.append(disc_layer.linear_output)
            # get the params of this DiscLayer, for convenient optimization
            self.dn_params.extend(disc_layer.params)
        return

    def _construct_disc_costs(self):
        """
        Construct the generator and discriminator adversarial costs.

        Returns [dn_cost, gn_cost], each summed over all discriminator
        layers and weighted by self.dw_dn / self.dw_gn respectively.
        """
        gn_costs = []
        dn_costs = []
        for dl_output in self.disc_outputs:
            data_preds = dl_output.take(self.It, axis=0)
            noise_preds = dl_output.take(self.Id, axis=0)
            # compute the cost with respect to which we will be optimizing
            # the parameters of the discriminator network
            data_size = T.cast(self.It.size, 'floatX')
            noise_size = T.cast(self.Id.size, 'floatX')
            dnl_dn_cost = (logreg_loss(data_preds, 1.0) / data_size) + \
                          (logreg_loss(noise_preds, -1.0) / noise_size)
            # compute the cost with respect to which we will be optimizing
            # the parameters of the generative model
            dnl_gn_cost = (hinge_loss(noise_preds, 0.0) +
                           hinge_sq_loss(noise_preds, 0.0)) / \
                          (2.0 * noise_size)
            dn_costs.append(dnl_dn_cost)
            gn_costs.append(dnl_gn_cost)
        dn_cost = self.dw_dn[0] * T.sum(dn_costs)
        gn_cost = self.dw_gn[0] * T.sum(gn_costs)
        return [dn_cost, gn_cost]

    def _log_prob(self, x_true, x_apprx):
        """
        Wrap log-prob with switching for bernoulli/gaussian output types.

        Returns the *negative* log-likelihood of x_true under x_apprx.
        """
        if self.x_type == 'bernoulli':
            ll_cost = log_prob_bernoulli(x_true, x_apprx)
        else:
            ll_cost = log_prob_gaussian2(x_true, x_apprx,
                    log_vars=self.bounded_logvar)
        nll_cost = -ll_cost
        return nll_cost

    def _decayed_chain_mean(self, step_costs, cost_decay):
        """
        Weighted average over chain steps of the batch-mean of step_costs.

        Step i gets weight cost_decay**i, and the weights are normalized to
        sum to one.
        """
        assert((cost_decay > 0.0) and (cost_decay < 1.0))
        weighted_costs = []
        step_weights = []
        step_weight = 1.0
        for i in range(self.chain_len):
            weighted_costs.append(step_weight * T.mean(step_costs[i]))
            step_weights.append(step_weight)
            step_weight = step_weight * cost_decay
        return sum(weighted_costs) / sum(step_weights)

    def _construct_chain_nll_cost(self, cost_decay=0.1):
        """
        Construct the negative log-likelihood part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is
        used to initialize a long(ish) running markov chain.
        """
        return self._decayed_chain_mean(self.nlli, cost_decay)

    def _construct_chain_kld_cost(self, cost_decay=0.1):
        """
        Construct the posterior KLd from prior part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is
        used to initialize a long(ish) running markov chain.
        """
        return self._decayed_chain_mean(self.kldi, cost_decay)

    def _construct_other_reg_cost(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network parameters.
        """
        gp_cost = sum([T.sum(par**2.0) for par in self.gn_params])
        ip_cost = sum([T.sum(par**2.0) for par in self.in_params])
        other_reg_cost = self.lam_l2w[0] * (gp_cost + ip_cost)
        return other_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train generator and discriminator
        jointly.

        The compiled function takes (xd, xt, br): chain seed points, target
        samples, and the number of times each seed point is repeated. It
        returns [joint_cost, chain_nll_cost, chain_kld_cost, disc_cost_gn,
        disc_cost_dn, other_reg_cost] and applies self.joint_updates.
        """
        # symbolic vars for passing input to training function
        xd = T.matrix()
        xt = T.matrix()
        br = T.lscalar()
        zzmuv = self._construct_zmuv_samples(xd, br)
        # collect outputs to return to caller
        outputs = [self.joint_cost, self.chain_nll_cost, self.chain_kld_cost,
                   self.disc_cost_gn, self.disc_cost_dn, self.other_reg_cost]
        func = theano.function(inputs=[ xd, xt, br ],
                outputs=outputs, updates=self.joint_updates,
                givens={ self.x_d: xd.repeat(br, axis=0),
                         self.x_t: xt,
                         self.z_zmuv: zzmuv })
        return func

    def sample_from_chain(self, X_d, X_c=None, X_m=None, loop_iters=5,
                          sigma_scale=None):
        """
        Sample for several rounds through the I<->G loop, initialized with
        the "data variable" samples in X_d. Delegates to the wrapped
        OneStageModel.
        """
        result = self.OSM.sample_from_chain(X_d, X_c=X_c, X_m=X_m,
                loop_iters=loop_iters, sigma_scale=sigma_scale)
        return result

    def sample_from_prior(self, samp_count):
        """
        Draw independent samples from the model's prior. Delegates to the
        wrapped OneStageModel.
        """
        Xs = self.OSM.sample_from_prior(samp_count)
        return Xs
class GenNet(object):
    """
    A net that transforms a simple distribution so that it matches some
    more complicated distribution, for some definition of match....

    Parameters:
        rng: a numpy.random RandomState object
        Xp: symbolic matrix for inputting latent variable samples
        prior_sigma: standard deviation of isotropic Gaussian prior that this
                     generator will transform to match some other distribution
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on the latent variable space
            hid_drop: drop rate to use on the hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            bias_noise: standard dev for noise on the biases of hidden layers
                        (optional, defaults to 0.0)
            mlp_config: list of "layer descriptions"
            out_type: set this to "bernoulli" for generating outputs to match
                      bernoulli-valued observations and set it to "gaussian" to
                      match general real-valued observations (optional,
                      defaults to "bernoulli").
            activation: "function handle" for the desired non-linearity
                        (optional, defaults to relu_actfun)
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
                        (optional, defaults to 1.0)
        shared_param_dicts: parameters for the MLP controlled by this GenNet.
                            When given, this net is built as a "clone" that
                            reuses those parameters.
    """
    def __init__(self,
                 rng=None,
                 Xp=None,
                 prior_sigma=None,
                 params=None,
                 shared_param_dicts=None):
        # First, setup a shared random number generator for this layer
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xp = Xp
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        assert params is not None
        self.params = params
        lam_l2a = self.params['lam_l2a']
        # Drop rate on the latent variables
        self.vis_drop = self.params.get('vis_drop', 0.0)
        # Drop rate on hidden layer activations
        self.hid_drop = self.params.get('hid_drop', 0.0)
        # Noise sigma for hidden layer biases
        self.bias_noise = self.params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        # Check which type of output distribution to generate; default to
        # bernoulli-valued outputs
        self.out_type = params.get('out_type', 'bernoulli')
        assert self.out_type in ('bernoulli', 'gaussian')
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of a generative network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = []
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.mlp_config = params['mlp_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        self.mlp_depth = len(self.mlp_config) - 1
        self.latent_dim = self.mlp_config[0]
        self.data_dim = self.mlp_config[-1]
        ##########################
        # Initialize the network #
        ##########################
        self.mlp_layers = []
        self.logvar_layer = None
        # Materialize the pairs: zip() is a lazy iterator on Python 3 and
        # has no len().
        layer_def_pairs = list(zip(self.mlp_config[:-1], self.mlp_config[1:]))
        last_layer_num = len(layer_def_pairs) - 1
        next_input = self.Xp
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_layer_num)
            needs_logvar = last_layer and (self.out_type == 'gaussian')
            l_name = "gn_layer_{0:d}".format(layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            d_rate = self.vis_drop if first_layer else self.hid_drop
            b_noise = self.bias_noise
            # Arguments common to every HiddenLayer built for this step. The
            # optional log-variance layer reads the same input as the main
            # layer (next_input is only advanced at the end of the step).
            layer_kwargs = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=0., bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                W_scale=self.init_scale)
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(name=l_name, **layer_kwargs)
                self.mlp_layers.append(new_layer)
                self.shared_param_dicts.append(
                    {'W': new_layer.W, 'b': new_layer.b})
                if needs_logvar:
                    # add an extra layer/transform for encoding log-variance
                    lv_layer = HiddenLayer(name=l_name+'_logvar',
                                           **layer_kwargs)
                    self.logvar_layer = lv_layer
                    self.mlp_layers.append(lv_layer)
                    self.shared_param_dicts.append(
                        {'W': lv_layer.W, 'b': lv_layer.b})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts[layer_num]
                self.mlp_layers.append(HiddenLayer(
                    W=init_params['W'], b=init_params['b'],
                    name=l_name, **layer_kwargs))
                if needs_logvar:
                    # the extra layer's params were stored right after the
                    # params of the final regular layer
                    init_params = self.shared_param_dicts[layer_num+1]
                    lv_layer = HiddenLayer(
                        W=init_params['W'], b=init_params['b'],
                        name=l_name, **layer_kwargs)
                    # keep clones consistent with non-clones, which also
                    # record this layer in self.logvar_layer
                    self.logvar_layer = lv_layer
                    self.mlp_layers.append(lv_layer)
            next_input = self.mlp_layers[-1].output
        # construct a mask for deciding which output dimensions to keep/ignore
        if self.is_clone:
            self.output_mask = self.shared_param_dicts[-1]['output_mask']
            self.output_bias = self.shared_param_dicts[-1]['output_bias']
        else:
            row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
            self.output_mask = theano.shared(value=row_mask,
                                             name='gn_output_mask')
            row_mask = 0.0 * row_mask
            self.output_bias = theano.shared(value=row_mask,
                                             name='gn_output_bias')
            op_dict = {'output_mask': self.output_mask,
                       'output_bias': self.output_bias}
            self.shared_param_dicts.append(op_dict)
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.mlp_layers:
            self.mlp_params.extend(layer.params)
        # add the output bias vector to the param list
        self.mlp_params.append(self.output_bias)
        # The output of this generator network is given by the noisy output
        # of its final layer. We will keep a running estimate of the mean and
        # covariance of the distribution induced by combining this network's
        # latent noise source with its deep non-linear transform. These will
        # be used to encourage the induced distribution to match the first and
        # second-order moments of the distribution we are trying to match.
        if self.out_type == 'bernoulli':
            final_linear = self.mlp_layers[-1].linear_output
            self.output = (T.nnet.sigmoid(final_linear + self.output_bias) *
                           self.output_mask)
            self.output_mu = self.output
            self.output_logvar = self.output
            self.output_sigma = self.output
        else:
            # NOTE: mlp_layers[-1] is the layer recorded as self.logvar_layer
            # and mlp_layers[-2] is the regular final layer, so the mean is
            # read from the former and the log-variance from the latter. Both
            # layers are built from the same arguments, so only the naming is
            # affected.
            self.output_mu = (self.mlp_layers[-1].linear_output +
                              self.output_bias)
            self.output_logvar = self.mlp_layers[-2].linear_output
            self.output_sigma = T.sqrt(T.exp(self.output_logvar))
            self.output = self._construct_post_samples() * self.output_mask
        self.out_dim = self.mlp_layers[-1].out_dim
        C_init = np.zeros(
            (self.out_dim, self.out_dim)).astype(theano.config.floatX)
        m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
        self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
        self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = lam_l2a * self._act_reg_cost()
        # Construct a sampler for drawing independent samples from this model's
        # isotropic Gaussian prior, and a sampler for the model distribution.
        self.sample_from_prior = self._construct_prior_sampler()
        self.sample_from_model = self._construct_model_sampler()
        # Construct a function for passing points from the latent/prior space
        # through the transform induced by the current model parameters.
        self.transform_prior = self._construct_transform_prior()
        return

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in this network.

        Returns the symbolic sum of each layer's `act_l2_sum`.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self.mlp_layers]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mu and self.output_sigma.
        """
        zmuv_noise = self.rng.normal(size=self.output_sigma.shape,
                                     avg=0.0, std=1.0,
                                     dtype=theano.config.floatX)
        post_samples = self.output_mu + (self.output_sigma * zmuv_noise)
        return post_samples

    def _construct_prior_sampler(self):
        """
        Compile a function that draws independent samples from this model's
        isotropic Gaussian prior. The function takes the sample count.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        prior_sampler = theano.function([samp_count], outputs=prior_samples)
        return prior_sampler

    def _construct_model_sampler(self):
        """
        Compile a function that draws independent samples from this model's
        distribution, by feeding prior samples through the network in place
        of self.Xp. The function takes the sample count.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        model_sampler = theano.function([samp_count], outputs=self.output,
                                        givens={self.Xp: prior_samples})
        return model_sampler

    def _construct_transform_prior(self):
        """
        Compile a function that applies the transform induced by the current
        model parameters to some set of points in the latent/prior space.
        """
        feedforward = theano.function([self.Xp], outputs=self.output)
        return feedforward

    def _batch_moments(self):
        """
        Compute the mean and the (unnormalized) second central moment of the
        current sample outputs.

        Returns [mu, sigma]. Note that sigma is the product of the centered
        outputs with themselves and is not divided by the number of rows,
        unlike the estimate computed in init_moments.
        """
        mu = T.mean(self.output, axis=0, keepdims=True)
        sigma = T.dot((self.output.T - mu.T), (self.output - mu))
        return [mu, sigma]

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all but the last entry of self.mlp_layers
        to some constant.
        """
        for layer in self.mlp_layers[:-1]:
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            layer.b.set_value(b_vec)
        return

    def init_moments(self, X_noise):
        """
        Initialize the running mean and covariance estimates, using the
        network outputs produced for the latent noise matrix X_noise.
        """
        X_noise_sym = T.matrix()
        out_func = theano.function(inputs=[X_noise_sym],
                                   outputs=[self.output],
                                   givens={self.Xp: X_noise_sym})
        # Compute outputs for the input latent noise matrix
        X_out = out_func(X_noise.astype(theano.config.floatX))[0]
        # Compute mean and covariance of the outputs
        mu = np.mean(X_out, axis=0)
        X_out_minus_mu = X_out - mu
        sigma = np.dot(X_out_minus_mu.T, X_out_minus_mu) / X_out.shape[0]
        # Initialize the network's running estimates
        self.dist_cov.set_value(sigma.astype(theano.config.floatX))
        self.dist_mean.set_value(mu.astype(theano.config.floatX))
        return

    def set_output_mask(self, output_mask):
        """
        Set a (probably) binary mask on the output dimensions.

        output_mask must have exactly self.data_dim entries; it is reshaped
        to a vector before being stored.
        """
        assert(output_mask.size == self.data_dim)
        output_mask = output_mask.reshape((self.data_dim,))
        self.output_mask.set_value(output_mask.astype(theano.config.floatX))
        return

    def compute_log_prob(self, Xd=None):
        """
        Compute the log-probability term for the data in Xd, with respect to
        the output distributions currently at self.output_.... The value is
        computed for all entries in Xd (subject to self.output_mask) and is
        returned exactly as produced by the log_prob_* helper.
        """
        if (self.out_type == 'bernoulli'):
            log_prob_cost = log_prob_bernoulli(Xd, self.output,
                                               mask=self.output_mask)
        else:
            # NOTE(review): other call sites in this file pass `log_vars=` to
            # log_prob_gaussian2 -- confirm the keyword name against the
            # helper's signature.
            log_prob_cost = log_prob_gaussian2(
                Xd, self.output_mu,
                les_logvars=self.output_logvar, mask=self.output_mask)
        return log_prob_cost

    def masked_log_prob(self, Xc=None, Xm=None):
        """
        Compute the log-probability term for the data in Xc, with respect to
        the output distributions currently at self.output_.... Select entries
        in Xc to measure based on the mask Xm. When Xm[i] == 1, don't measure
        Xc[i]...
        """
        # to measure Xc[i] only when Xm[i] is 0, we need to make an inverse
        # mask Xm_inv = 1 - Xm, because the masking in the log pdf functions
        # measures only observations where the mask != 0.
        Xm_inv = 1.0 - Xm
        if (self.out_type == 'bernoulli'):
            log_prob_cost = log_prob_bernoulli(Xc, self.output, mask=Xm_inv)
        else:
            # NOTE(review): other call sites in this file pass `log_vars=` to
            # log_prob_gaussian2 -- confirm the keyword name against the
            # helper's signature.
            log_prob_cost = log_prob_gaussian2(
                Xc, self.output_mu,
                les_logvars=self.output_logvar, mask=Xm_inv)
        return log_prob_cost

    def shared_param_clone(self, rng=None, Xp=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = GenNet(rng=rng, Xp=Xp,
                           prior_sigma=self.prior_sigma, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net
class GPSImputer(object):
    """
    Controller for training a multi-step imputer via guided policy search.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the initial state for imputation
        x_out: the goal state for imputation
        x_mask: mask for state dims to keep fixed during imputation
        p_zi_given_xi: HydraNet for stochastic part of step (2 outputs)
        p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs)
        q_zi_given_xi: HydraNet for the guide policy (2 outputs)
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of inputs to reconstruct
                z_dim: dimension of latent space for policy wobble
                imp_steps: number of reconstruction steps to perform
                step_type: either "add", "jump", "lstm", or "layer"
                x_type: can be "bernoulli" or "gaussian"
        shared_param_dicts: optional dict holding the shared variables
                'x_null', 'grad_null', 's0' and 'obs_logvar'; when None,
                fresh ones are created.
    """
    def __init__(self, rng=None,
                 x_in=None, x_mask=None, x_out=None,
                 p_zi_given_xi=None,
                 p_sip1_given_zi=None,
                 q_zi_given_xi=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this GPSImputer
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim,)))
            init_ary = to_fX(np.zeros((self.x_dim,)))
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the observation log-variance into (-8, 8) via tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) +
                  ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0)  # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                assert False, "Unknown step type!"

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        # only the state si is recurrent; the costs are plain per-step outputs
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step enters the objective
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # the updates produced by scan must be applied along with the
        # parameter updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _si_as_x(self, si):
        """
        Convert from "state" to "observation" by applying a sigmoid.
        """
        si_as_x = T.nnet.sigmoid(si)
        return si_as_x

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1,))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_nll
        self.lam_nll.set_value(to_fX(new_lam))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.

        lam_kld_p weights KL(p || q), lam_kld_q weights KL(q || p) and
        lam_kld_g weights KL(p || global prior) in the joint cost.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (below 0.5) or 1.0 (otherwise). With the
        switch at 1.0 the zi samples come from the guide policy q, with 0.0
        they come from the primary policy p.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1,))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xi, br):
        """
        Construct the necessary (symbolic) samples for computing through this
        GPSImputer for input (symbolic) matrix xi.

        The samples have shape (imp_steps, xi.shape[0]*br, z_dim).
        """
        zi_zmuv = self.rng.normal(
            size=(self.imp_steps, xi.shape[0]*br, self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_nll_costs(self, si, xo, xm):
        """
        Construct the negative log-likelihood part of free energy.

        si is the imputation state, xo the target observation and xm the
        mask of dims that are kept fixed (and hence not measured).
        """
        xh = self._si_as_x(si)
        xm_inv = 1.0 - xm  # we will measure nll only where xm_inv is 1
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=xm_inv)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar,
                                          mask=xm_inv)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KLd is raised to the power p, summed over axis 1 and
        then summed over the imputation steps. Returns the list
        [KL(p || q), KL(q || p), KL(p || global prior)].
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        for i in range(self.imp_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        return [kld_pi, kld_qi, kld_gi]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization, i.e. the sum
        of squares of all parameters in self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned function takes (XI, XO, XM, sample_count,
        use_guide_policy) and returns [mean_nll, mean_kld], each averaged
        over sample_count samples.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xi, xo, xm],
            outputs=[nll, kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, XM, sample_count=20,
                              use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's imputation policy
                self.set_train_switch(switch_val=0.0)
            try:
                # compute a multi-sample estimate of variational free-energy
                nll_sum = np.zeros((XI.shape[0],))
                kld_sum = np.zeros((XI.shape[0],))
                for i in range(sample_count):
                    result = fe_term_sample(XI, XO, XM)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
                mean_nll = nll_sum / float(sample_count)
                mean_kld = kld_sum / float(sample_count)
            finally:
                # set model back to either training or generation mode, even
                # if sampling failed
                self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.

        The returned function takes (XI, XO, XM) and returns
        [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p,
                          self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=all_step_costs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # make a function for computing single-sample estimates of the costs
        def raw_cost_computer(XI, XO, XM):
            _all_costs = cost_func(to_fX(XI), to_fX(XO), to_fX(XM))
            # mean over axis 1, then sum over the steps (axis 0)
            _kld_q2p = np.sum(
                np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(
                np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(
                np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            # per-step values: sum over axis 2, then mean over axis 1
            _step_klds = np.mean(
                np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray([k for k in _step_klds]))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray([k for k in _step_nlls]))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_compute_per_step_cost(self):
        """
        Construct a theano function for computing the best possible cost
        achieved by sequential imputation.

        The returned function takes (XI, XO, XM, sample_count) and returns
        [mean_step_nll, mean_step_kld], one entry per imputation step, where
        the KLd entries are cumulative over steps.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct symbolic variables for the step-wise cost
        step_mean_nll = T.mean(self.nlli, axis=1).flatten()
        step_lone_kld = T.sum(self.kldi_q2p, axis=2)
        step_cumu_kld = T.extra_ops.cumsum(step_lone_kld, axis=0)
        step_mean_kld = T.mean(step_cumu_kld, axis=1).flatten()
        # compile theano function for computing the step-wise cost
        step_cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=[step_mean_nll, step_mean_kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def best_cost_computer(XI, XO, XM, sample_count=20):
            # compute a multi-sample estimate of variational free-energy
            step_nll_sum = np.zeros((self.imp_steps,))
            step_kld_sum = np.zeros((self.imp_steps,))
            for i in range(sample_count):
                result = step_cost_func(XI, XO, XM)
                step_nll_sum += result[0].ravel()
                step_kld_sum += result[1].ravel()
            mean_step_nll = step_nll_sum / float(sample_count)
            mean_step_kld = step_kld_sum / float(sample_count)
            return [mean_step_nll, mean_step_kld]
        return best_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, xm, br), where each input row
        is repeated br times, and applies self.joint_updates on each call.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        br = T.lscalar()
        zizmuv = self._construct_zi_zmuv(xi, br)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xi, xo, xm, br],
            outputs=outputs,
            givens={self.x_in: xi.repeat(br, axis=0),
                    self.x_out: xo.repeat(br, axis=0),
                    self.x_mask: xm.repeat(br, axis=0),
                    self.zi_zmuv: zizmuv},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sample_imputer(self):
        """
        Construct a function for drawing samples from the distribution
        generated by running this imputer.

        The returned function takes (XI, XO, XM, use_guide_policy) and
        returns (model_samps, masked_samps): the initial state followed by
        the state after each imputation step, without and with the known
        entries of XI re-imposed through the mask XM.
        """
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        oputs = [self.x0] + \
                [self._si_as_x(self.si[i]) for i in range(self.imp_steps)]
        sample_func = theano.function(
            inputs=[xi, xo, xm], outputs=oputs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def imputer_sampler(XI, XO, XM, use_guide_policy=False):
            XI = to_fX(XI)
            XO = to_fX(XO)
            XM = to_fX(XM)
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's imputation policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO, XM)
            finally:
                # set model back to either training or generation mode, even
                # if sampling failed
                self.set_train_switch(switch_val=old_switch)
            # reverse engineer the "masked" samples...
            masked_samps = []
            for xs in model_samps:
                xsm = (XM * XI) + ((1.0 - XM) * xs)
                masked_samps.append(xsm)
            return model_samps, masked_samps
        return imputer_sampler

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled in order: self.params, a numpy copy of
        self.shared_param_dicts, and a dict with the save_to_dict() output of
        each child network.
        """
        assert(not (f_name is None))
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = \
            self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = \
            self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = \
            self.q_zi_given_xi.save_to_dict()
        # Everything to be written is ready before the file is created, and
        # the handle is closed even if one of the dumps fails.
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" python
            # values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
class GPSImputer(object):
    """
    Controller for training a multi-step imputer via guided policy search.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the initial state for imputation
        x_out: the goal state for imputation
        x_mask: mask for state dims to keep fixed during imputation
        p_zi_given_xi: HydraNet for stochastic part of step (2 outputs)
        p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs)
        q_zi_given_xi: HydraNet for the guide policy (2 outputs)
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of inputs to reconstruct
                z_dim: dimension of latent space for policy wobble
                imp_steps: number of reconstruction steps to perform
                step_type: either "add", "jump", "lstm", or "layer"
                x_type: can be "bernoulli" or "gaussian"
        shared_param_dicts: optional dict holding the shared variables
                'x_null', 'grad_null', 's0' and 'obs_logvar'; when None,
                fresh zero-initialized shared variables are created.
    """

    def __init__(self, rng=None,
                 x_in=None, x_mask=None, x_out=None,
                 p_zi_given_xi=None,
                 p_sip1_given_zi=None,
                 q_zi_given_xi=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))
        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1, )))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim, )))
            init_ary = to_fX(np.zeros((self.x_dim, )))
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # tanh squashing keeps the observation log-variance in (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # keep the masked dims from x_out, use current guess elsewhere
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) +
                  ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0)  # KL(p || global prior)
            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                assert False, "Unknown step type!"
            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # the updates produced by scan must be applied by the training
        # function as well
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _si_as_x(self, si):
        """
        Convert from "state" to "observation".
        """
        si_as_x = T.nnet.sigmoid(si)
        return si_as_x

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1, ))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_nll
        self.lam_nll.set_value(to_fX(new_lam))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (if below 0.5) or 1.0 (otherwise)
        before it is stored in the shared variable.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1, ))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xi, br):
        """
        Construct the necessary (symbolic) samples for computing through this
        GPSImputer for input (symbolic) matrix xi.

        The samples have shape (imp_steps, xi.shape[0] * br, z_dim).
        """
        zi_zmuv = self.rng.normal(
            size=(self.imp_steps, xi.shape[0] * br, self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_nll_costs(self, si, xo, xm):
        """
        Construct the negative log-likelihood part of free energy.
        """
        # average log-likelihood over the refinement sequence
        xh = self._si_as_x(si)
        xm_inv = 1.0 - xm  # we will measure nll only where xm_inv is 1
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=xm_inv)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar,
                                          mask=xm_inv)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Returns [kld_p, kld_q, kld_g]: the p2q, q2p and p2g per-step KLds,
        each raised to the power p, summed over axis 1 and over steps.
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        for i in range(self.imp_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        return [kld_pi, kld_qi, kld_gi]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xi, xo, xm],
            outputs=[nll, kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, XM, sample_count=20,
                              use_guide_policy=True):
            # cast inputs once, as the other compiled-function wrappers do
            XI = to_fX(XI)
            XO = to_fX(XO)
            XM = to_fX(XM)
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            try:
                if use_guide_policy:
                    # take samples from guide policies (i.e. variational q)
                    self.set_train_switch(switch_val=1.0)
                else:
                    # take samples from model's imputation policy
                    self.set_train_switch(switch_val=0.0)
                # compute a multi-sample estimate of variational free-energy
                nll_sum = np.zeros((XI.shape[0], ))
                kld_sum = np.zeros((XI.shape[0], ))
                for i in range(sample_count):
                    result = fe_term_sample(XI, XO, XM)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # always set the model back to its previous mode, even when
                # the compiled function raises
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p,
                          self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=all_step_costs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # make a function for computing multi-sample estimates of cost
        def raw_cost_computer(XI, XO, XM):
            _all_costs = cost_func(to_fX(XI), to_fX(XO), to_fX(XM))
            _kld_q2p = np.sum(
                np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(
                np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(
                np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(
                np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray(_step_klds))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray(_step_nlls))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_compute_per_step_cost(self):
        """
        Construct a theano function for computing the best possible cost
        achieved by sequential imputation.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct symbolic variables for the step-wise cost
        step_mean_nll = T.mean(self.nlli, axis=1).flatten()
        step_lone_kld = T.sum(self.kldi_q2p, axis=2)
        step_cumu_kld = T.extra_ops.cumsum(step_lone_kld, axis=0)
        step_mean_kld = T.mean(step_cumu_kld, axis=1).flatten()
        # compile theano function for computing the step-wise cost
        step_cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=[step_mean_nll, step_mean_kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def best_cost_computer(XI, XO, XM, sample_count=20):
            # cast inputs once, as the other compiled-function wrappers do
            XI = to_fX(XI)
            XO = to_fX(XO)
            XM = to_fX(XM)
            # compute a multi-sample estimate of variational free-energy
            step_nll_sum = np.zeros((self.imp_steps, ))
            step_kld_sum = np.zeros((self.imp_steps, ))
            for i in range(sample_count):
                result = step_cost_func(XI, XO, XM)
                step_nll_sum += result[0].ravel()
                step_kld_sum += result[1].ravel()
            mean_step_nll = step_nll_sum / float(sample_count)
            mean_step_kld = step_kld_sum / float(sample_count)
            return [mean_step_nll, mean_step_kld]
        return best_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, xm, br); each input matrix is
        repeated br times along axis 0 before being fed to the model.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        br = T.lscalar()
        zizmuv = self._construct_zi_zmuv(xi, br)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xi, xo, xm, br],
            outputs=outputs,
            givens={self.x_in: xi.repeat(br, axis=0),
                    self.x_out: xo.repeat(br, axis=0),
                    self.x_mask: xm.repeat(br, axis=0),
                    self.zi_zmuv: zizmuv},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sample_imputer(self):
        """
        Construct a function for drawing samples from the distribution
        generated by running this imputer.
        """
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        oputs = [self.x0] + \
                [self._si_as_x(self.si[i]) for i in range(self.imp_steps)]
        sample_func = theano.function(
            inputs=[xi, xo, xm], outputs=oputs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def imputer_sampler(XI, XO, XM, use_guide_policy=False):
            XI = to_fX(XI)
            XO = to_fX(XO)
            XM = to_fX(XM)
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            try:
                if use_guide_policy:
                    # take samples from guide policies (i.e. variational q)
                    self.set_train_switch(switch_val=1.0)
                else:
                    # take samples from model's imputation policy
                    self.set_train_switch(switch_val=0.0)
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO, XM)
            finally:
                # always set the model back to its previous mode, even when
                # the compiled function raises
                self.set_train_switch(switch_val=old_switch)
            # reverse engineer the "masked" samples...
            masked_samps = [(XM * XI) + ((1.0 - XM) * xs)
                            for xs in model_samps]
            return model_samps, masked_samps
        return imputer_sampler

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in order: self.params, a dict
        of numpy copies of self.shared_param_dicts, and a dict of the child
        models' save_to_dict() results.
        """
        assert f_name is not None, "f_name is required"
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = \
            self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = \
            self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = \
            self.q_zi_given_xi.save_to_dict()
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle even if one of the dumps raises
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts to the pickle file
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
class SimpleInfNet(object):
    """
    Linear network producing a mean and a log-variance for its input.

    Weights that are not supplied are drawn with npr.normal and biases
    are initialized to zero; everything is stored in theano shared
    variables and collected in self.mlp_params.
    """

    def __init__(self, rng, in_dim, out_dim,
                 W_mean=None, b_mean=None,
                 W_logvar=None, b_logvar=None,
                 name="", W_scale=1.0):
        # shared random stream for this network, seeded from rng
        self.rng = RandStream(rng.randint(1000000))
        # basic layer properties
        self.in_dim = in_dim
        self.out_dim = out_dim
        float_type = theano.config.floatX
        weight_shape = (self.in_dim, self.out_dim)

        # --- parameters for the mean estimate ---
        if W_mean is None:
            if W_scale > 0.1:
                W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            mean_init = W_scale * npr.normal(0.0, 1.0, weight_shape)
            W_mean = theano.shared(value=mean_init.astype(float_type),
                                   name="{0:s}_W_mean".format(name))
        if b_mean is None:
            W_mean_bias = np.zeros((self.out_dim,), dtype=float_type)
            b_mean = theano.shared(value=W_mean_bias,
                                   name="{0:s}_b_mean".format(name))
        self.W_mean = W_mean
        self.b_mean = b_mean

        # --- parameters for the log-variance estimate ---
        if W_logvar is None:
            # NOTE(review): W_scale is rescaled again here, on top of any
            # rescaling done for W_mean above -- confirm this is intended.
            W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            logvar_init = W_scale * npr.normal(0.0, 1.0, weight_shape)
            W_logvar = theano.shared(value=logvar_init.astype(float_type),
                                     name="{0:s}_W_logvar".format(name))
        if b_logvar is None:
            logvar_bias = np.zeros((self.out_dim,), dtype=float_type)
            b_logvar = theano.shared(value=logvar_bias,
                                     name="{0:s}_b_logvar".format(name))
        self.W_logvar = W_logvar
        self.b_logvar = b_logvar

        # package the layer parameters for easy access
        self.mlp_params = [self.W_mean, self.b_mean,
                           self.W_logvar, self.b_logvar]
        return

    def get_bias(self):
        """
        Return the bias of the mean output (self.b_mean).
        """
        return self.b_mean

    def apply(self, x, do_samples=True):
        """
        Apply this SimpleInfNet to some input.

        Returns [z_mean, z_logvar], with z_samples appended when
        do_samples is true.
        """
        z_mean = T.dot(x, self.W_mean) + self.b_mean
        z_logvar = T.dot(x, self.W_logvar) + self.b_logvar
        # the noise is drawn from self.rng regardless of do_samples
        noise = DCG(self.rng.normal(size=z_mean.shape, avg=0.0, std=1.0,
                                    dtype=theano.config.floatX))
        z_samples = z_mean + ((T.exp(0.5*z_logvar)) * noise)
        if do_samples:
            return [z_mean, z_logvar, z_samples]
        return [z_mean, z_logvar]
class GenFCModule(object):
    """
    Generator module: random values go through one fully connected layer
    and then a linear transform (optionally followed by another relu).
    """

    def __init__(self, rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc'):
        # layer sizes
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        # batch-norm and activation switches
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.final_relu = final_relu
        # naming and randomness
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        # fall back on a small normal initializer when none is given
        self.init_func = (inits.Normal(scale=0.02)
                          if init_func is None else init_func)
        self._init_params()
        return

    def _init_params(self):
        """
        Create the weights (and, where batch norm is enabled, the gains and
        biases) for both layers and collect them in self.params.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # gains and biases exist only for transforms that get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.fc_dim), "{}_g1".format(prefix))
            self.b1 = make_bias((self.fc_dim), "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain((self.out_dim), "{}_g2".format(prefix))
            self.b2 = make_bias((self.out_dim), "{}_b2".format(prefix))
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is None, values of shape (batch_size, rand_dim) are
        drawn from self.rng (normal if rand_type is 'normal', otherwise
        uniform on [-1, 1]).
        """
        assert not ((batch_size is None) and (rand_vals is None)), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape, avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape, low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # random values -> fully connected layer
        hidden = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # fully connected layer -> output
        out = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        if self.final_relu:
            out = relu(out)
        return out
class GenUniModule(object):
    """
    Generator module: a single linear transform, optionally batch normed
    and optionally followed by a relu.
    """

    def __init__(self, rand_dim, out_dim,
                 apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True,
                 mod_name='dm_uni'):
        # layer sizes
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        # batch-norm and activation switches
        self.apply_bn = apply_bn
        self.final_relu = final_relu
        # naming and randomness
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        # fall back on a small normal initializer when none is given
        self.init_func = (inits.Normal(scale=0.02)
                          if init_func is None else init_func)
        self._init_params()
        return

    def _init_params(self):
        """
        Create the weight matrix (and, when batch norm is enabled, its gain
        and bias) and collect them in self.params.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(prefix))
        self.params = [self.w1]
        # gain and bias exist only when the transform gets batch normed
        if self.apply_bn:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.out_dim), "{}_g1".format(prefix))
            self.b1 = make_bias((self.out_dim), "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is None, values of shape (batch_size, rand_dim) are
        drawn from self.rng (normal if rand_type is 'normal', otherwise
        uniform on [-1, 1]).
        """
        assert not ((batch_size is None) and (rand_vals is None)), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape, avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape, low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # transform random values linearly
        out = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        if self.final_relu:
            out = relu(out)
        return out

##############
# EYE BUFFER #
##############
class GenConvModule(object):
    """
    Module of one "fractionally strided" convolution layer followed by one
    regular convolution layer. Inputs to the fractionally strided convolution
    can optionally be augmented with some random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """

    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal'):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        # only the first entry of filt_shape is used (filters are square)
        self.filt_dim = filt_shape[0]
        # channel counts
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        # behaviour switches
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        # naming and randomness
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        # fall back on a small normal initializer when none is given
        self.init_func = (inits.Normal(scale=0.02)
                          if init_func is None else init_func)
        self._init_params()
        return

    def _init_params(self):
        """
        Create the filters for both convolutions (and, where batch norm is
        enabled, the gains and biases) and collect them in self.params.
        """
        prefix = self.mod_name
        fd = self.filt_dim
        # the first conv also sees the random channels when they are
        # stacked on the exogenous input
        if self.use_rand:
            first_in_chans = (self.in_chans + self.rand_chans)
        else:
            first_in_chans = self.in_chans
        self.w1 = self.init_func((self.out_chans, first_in_chans, fd, fd),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.out_chans, self.out_chans, fd, fd),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # gains and biases exist only for transforms that get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.out_chans), "{}_g1".format(prefix))
            self.b1 = make_bias((self.out_chans), "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain((self.out_chans), "{}_g2".format(prefix))
            self.b2 = make_bias((self.out_chans), "{}_b2".format(prefix))
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        When use_rand is set and rand_vals is None, the random channels are
        drawn from self.rng (normal if rand_type is 'normal', otherwise
        uniform on [-1, 1]).
        """
        n_batch = input.shape[0]
        pad = int((self.filt_dim - 1) / 2)  # use "same" mode convolutions
        stride = self.us_stride             # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" the input by repeating along both spatial axes
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # shape for the random values that may augment the input
        rand_shape = (n_batch, self.rand_chans,
                      input.shape[2], input.shape[3])
        if not self.use_rand:
            # don't augment input with random channels
            full_input = input
        else:
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(size=rand_shape, avg=0.0,
                                                std=1.0,
                                                dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(size=rand_shape, low=-1.0,
                                                 high=1.0,
                                                 dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # stack random values on top of input
            full_input = T.concatenate([rand_vals, input], axis=1)
        # first convolution: plain conv after unpooling, otherwise a
        # fractionally strided conv that does the upsampling itself
        if self.use_pooling:
            hidden = dnn_conv(full_input, self.w1, subsample=(1, 1),
                              border_mode=(pad, pad))
        else:
            hidden = deconv(full_input, self.w1, subsample=(stride, stride),
                            border_mode=(pad, pad))
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # second convolution
        out = dnn_conv(hidden, self.w2, subsample=(1, 1),
                       border_mode=(pad, pad))
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        out = relu(out)
        return out
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a theano.tensor.dtensor4 shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like
    (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    Parameters:
        rng: random generator providing `randint` and `normal` (used to seed
             the layer's random stream and to draw the initial filters)
        input: symbolic input to the layer (layout described above)
        filt_def: filter bank shape, see above
        pool_def: pooling size and stride, see above
        activation: callable applied to the pooled, biased pre-activations;
                    defaults to relu_actfun when not given
        drop_rate: probability of dropping input elements (masking noise);
                   only applied when greater than 1e-4
        input_noise: std of gaussian noise added to the input; only applied
                     when greater than 1e-4
        bias_noise: std of gaussian noise added to the convolution output;
                    only applied when greater than 1e-4
        W: optional pre-built filter parameter to reuse; when None a new
           shared variable is created from small gaussian values
        b: optional pre-built bias parameter to reuse; when None a new
           shared variable filled with 0.1 is created
        name: prefix used to name newly created shared variables
        W_scale: multiplier applied to the freshly drawn initial filters
    """

    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, name="", W_scale=1.0):
        # Setup a random stream for this layer, seeded from the given rng.
        # (A commented-out alternative in the original used
        # theano.tensor.shared_randomstreams.RandomStreams.)
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        self.activation = activation if activation else relu_actfun

        # Reuse the caller's filters when given; otherwise initialize the
        # filters with small random values. (Previously the W argument was
        # accepted but ignored, so parameters could never be shared.)
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        if b is None:
            b_init = np.zeros((filt_def[0],),
                              dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add gaussian noise to the convolution output (if desired)
        if bias_noise > 1e-4:
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = (mp_out_bc01 +
                                    self.b.dimshuffle('x', 0, 'x', 'x'))
        # NOTE(review): linear_output is the same expression as the noisy
        # one; no noise-free variant is built here.
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]
        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class GenNet(object):
    """
    A net that transforms a simple distribution so that it matches some
    more complicated distribution, for some definition of match....

    Parameters:
        rng: a numpy.random RandomState object
        Xp: symbolic matrix for inputting latent variable samples
        prior_sigma: standard deviation of isotropic Gaussian prior that this
                     generator will transform to match some other distribution
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on the latent variable space
            hid_drop: drop rate to use on the hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            bias_noise: standard dev for noise on the biases of hidden layers
                -- optional, defaults to 0.0
            out_noise: standard dev for noise on the output of this net
                -- optional, defaults to 0.0
            mlp_config: list of "layer descriptions"
            activation: "function handle" for the desired non-linearity
                -- optional, defaults to relu_actfun
        mlp_param_dicts: parameters for the MLP controlled by this GenNet
    """

    def __init__(self,
                 rng=None,
                 Xp=None,
                 prior_sigma=None,
                 params=None,
                 mlp_param_dicts=None):
        # First, setup a random stream for this net, seeded from the given
        # rng. (A commented-out alternative in the original used
        # theano.tensor.shared_randomstreams.RandomStreams.)
        self.rng = CURAND_RandomStreams(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xp = Xp
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        assert params is not None
        self.params = params
        lam_l2a = self.params['lam_l2a']
        # Drop rate on the latent variables
        self.vis_drop = self.params.get('vis_drop', 0.0)
        # Drop rate on hidden layer activations
        self.hid_drop = self.params.get('hid_drop', 0.0)
        # Noise sigma for hidden layer biases
        self.bias_noise = self.params.get('bias_noise', 0.0)
        # Noise sigma for the output/observable layer
        self.out_noise = self.params.get('out_noise', 0.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of a generative network, with all
        # of the network parameters shared between clones.
        if mlp_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.mlp_param_dicts = []
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. mlp_param_dicts).
            self.mlp_param_dicts = mlp_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.mlp_config = params['mlp_config']
        self.activation = params.get('activation', relu_actfun)
        self.mlp_depth = len(self.mlp_config) - 1
        self.latent_dim = self.mlp_config[0]
        self.data_dim = self.mlp_config[-1]
        ##########################
        # Initialize the network #
        ##########################
        self.clip_params = {}
        self.mlp_layers = []
        # Materialize the pairs: zip() returns an iterator on Python 3, and
        # we need len() of it to recognize the last layer.
        layer_def_pairs = list(zip(self.mlp_config[:-1],
                                   self.mlp_config[1:]))
        last_layer_num = len(layer_def_pairs) - 1
        next_input = self.Xp
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_layer_num)
            l_name = "gn_layer_{0:d}".format(layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            d_rate = self.vis_drop if first_layer else self.hid_drop
            b_noise = self.out_noise if last_layer else self.bias_noise
            # Arguments common to fresh layers and parameter-sharing layers
            layer_kwargs = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=0., bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=l_name, W_scale=1.0)
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(**layer_kwargs)
                self.mlp_layers.append(new_layer)
                self.mlp_param_dicts.append(
                    {'W': new_layer.W, 'b': new_layer.b})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.mlp_param_dicts[layer_num]
                layer_kwargs['W'] = init_params['W']
                layer_kwargs['b'] = init_params['b']
                self.mlp_layers.append(HiddenLayer(**layer_kwargs))
            next_input = self.mlp_layers[-1].output
            # Set the non-bias parameters of this layer to be clipped
            self.clip_params[self.mlp_layers[-1].W] = 1
        # TODO: implement adjustable norm clipping
        self.clip_norms = {}
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.mlp_layers:
            self.mlp_params.extend(layer.params)
        # The output of this generator network is given by the noisy output
        # of its final layer. We will keep a running estimate of the mean and
        # covariance of the distribution induced by combining this network's
        # latent noise source with its deep non-linear transform. These will
        # be used to encourage the induced distribution to match the first and
        # second-order moments of the distribution we are trying to match.
        # (The output is squashed through a sigmoid; a commented-out
        # alternative in the original used the raw noisy_linear value.)
        self.output = T.nnet.sigmoid(self.mlp_layers[-1].noisy_linear)
        self.out_dim = self.mlp_layers[-1].out_dim
        C_init = np.zeros((self.out_dim, self.out_dim)).astype(
            theano.config.floatX)
        m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
        self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
        self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = lam_l2a * self._act_reg_cost()
        # Construct a sampler for drawing independent samples from this model's
        # isotropic Gaussian prior, and a sampler for the model distribution.
        self.sample_from_prior = self._construct_prior_sampler()
        self.sample_from_model = self._construct_model_sampler()
        # Construct a function for passing points from the latent/prior space
        # through the transform induced by the current model parameters.
        self.transform_prior = self._construct_transform_prior()
        return

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in this network.

        Returns the symbolic sum of each layer's `act_l2_sum`.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self.mlp_layers]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_prior_sampler(self):
        """
        Draw independent samples from this model's isotropic Gaussian prior.

        Returns a compiled function taking a sample count and returning
        prior_sigma-scaled normal samples of width latent_dim.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        prior_sampler = theano.function([samp_count], outputs=prior_samples)
        return prior_sampler

    def _construct_model_sampler(self):
        """
        Draw independent samples from this model's distribution.

        Returns a compiled function taking a sample count; it substitutes
        fresh prior samples for Xp and returns the network output.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        model_sampler = theano.function([samp_count], outputs=self.output,
                                        givens={self.Xp: prior_samples})
        return model_sampler

    def _construct_transform_prior(self):
        """
        Apply the tranform induced by the current model parameters to some
        set of points in the latent/prior space.
        """
        feedforward = theano.function([self.Xp], outputs=self.output)
        return feedforward

    def _batch_moments(self):
        """
        Compute covariance and mean of the current sample outputs.

        NOTE(review): sigma here is the un-normalized scatter matrix (no
        division by the batch size), whereas init_moments divides by the
        number of rows -- confirm which scaling callers expect.
        """
        mu = T.mean(self.output, axis=0, keepdims=True)
        sigma = T.dot((self.output.T - mu.T), (self.output - mu))
        return [mu, sigma]

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all hidden layers to some constant.

        Every layer except the last one has its bias overwritten with an
        array of the bias's own shape filled with b_init.
        """
        for layer in self.mlp_layers[:-1]:
            # Build the new value from this layer's own bias shape. Do not
            # rebind b_init: reusing the previous layer's array would break
            # (or mis-shape) layers of a different width.
            new_bias = (0.0 * layer.b.get_value(borrow=False)) + b_init
            layer.b.set_value(new_bias)
        return

    def init_moments(self, X_noise):
        """
        Initialize the running mean and covariance estimates.

        X_noise is a matrix of latent inputs; it is pushed through the
        network and the mean / covariance of the outputs are stored in
        dist_mean / dist_cov.
        """
        X_noise_sym = T.matrix()
        out_func = theano.function(inputs=[X_noise_sym],
                                   outputs=[self.output],
                                   givens={self.Xp: X_noise_sym})
        # Compute outputs for the input latent noise matrix
        X_out = out_func(X_noise.astype(theano.config.floatX))[0]
        # Compute mean and covariance of the outputs
        mu = np.mean(X_out, axis=0)
        X_out_minus_mu = X_out - mu
        sigma = np.dot(X_out_minus_mu.T, X_out_minus_mu) / X_out.shape[0]
        # Initialize the network's running estimates
        self.dist_cov.set_value(sigma.astype(theano.config.floatX))
        self.dist_mean.set_value(mu.astype(theano.config.floatX))
        return

    def shared_param_clone(self, rng=None, Xp=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = GenNet(rng=rng, Xp=Xp,
                           prior_sigma=self.prior_sigma, params=self.params,
                           mlp_param_dicts=self.mlp_param_dicts)
        return clone_net
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: optional dict holding an existing 'obs_logvar'
                            shared variable to reuse instead of creating one

    NOTE(review): this class reads many attributes that are not assigned
    anywhere in the class itself -- e.g. step_type, xi_zmuv, s0, step_scales,
    p_zi_given_xi, p_sip1_given_zi, p_x_given_si, q_zi_given_xi, nlli,
    kldi_p2q / kldi_q2p / kldi_p2g, si, total_steps, train_switch,
    use_rev_masks, rev_masks_p / rev_masks_q, rev_sched, p_masks, q_masks,
    s0_full, m0_full, mi_p and the method _from_si_to_x. Unless they are
    provided elsewhere, __init__ fails on the first such read -- confirm
    where they are meant to come from.
    """

    def __init__(self, rng=None, x_out=None,
                 p_z_given_x=None,
                 p_x_given_z=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # pick the transform applied to predicted observation means
        if 'x_transform' in self.params:
            assert ((self.params['x_transform'] == 'sigmoid') or
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid transform
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): step_type is never assigned in this class (see the
        # class docstring), so this check raises AttributeError as written.
        assert ((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out            # target output for generation
        self.zi_zmuv = T.tensor3()    # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX(np.zeros((1, )))
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            nll_xi_bw = log_prob_gaussian2(
                xi_fw, xi_bw_mean, log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            nll_zi_bw = log_prob_gaussian2(
                zi_fw, zi_bw_mean, log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()
            # NOTE(review): the two values named nll_* are the flattened
            # results of log_prob_gaussian2 without negation -- confirm the
            # intended sign.

            # each loop iteration produces the following values:
            #   xi_fw: xi generated fom zi by forwards walk
            #   zi_fw: zi generated fom xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return (xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar,
                    zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw)

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            forwards_loop,
            outputs_info=outputs_init,
            sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0,
                         lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = \
            self._construct_kld_costs(p=1.0)
        self.kld_costs = ((self.lam_kld_p[0] * self.kld_p) +
                          (self.lam_kld_q[0] * self.kld_q) +
                          (self.lam_kld_g[0] * self.kld_g) +
                          (self.lam_kld_s[0] * self.kld_s))
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold the updates produced by scan into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1, ))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0,
                    lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_s
        self.lam_kld_s.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Values below 0.5 are snapped to 0.0, everything else to 1.0.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1, ))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Construct the necessary ZMUV gaussian samples for generating
        trajectories from this WalkoutModel, for input matrix xo.
        """
        zi_zmuv = self.rng.normal(
            size=(self.total_steps, xo.shape[0], self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_rev_masks(self, xo):
        """
        Compute the sequential revelation masks for the input batch in xo.
        -- We need to construct mask sequences for both p and q.

        Returns [pmasks, qmasks].
        """
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            pmasks = self.rev_masks_p.dimshuffle(0, 'x', 1).repeat(
                xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0, 'x', 1).repeat(
                xo.shape[0], axis=1)
        else:
            pm_list = []
            qm_list = []
            # make a zero mask that does nothing
            zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
            # generate independently sampled masks for each revelation block
            for rb in self.rev_sched:
                # make a random binary mask with ones at rate rb[1]
                rand_vals = self.rng.uniform(
                    size=(1, xo.shape[0], xo.shape[1]),
                    low=0.0, high=1.0, dtype=theano.config.floatX)
                rand_mask = rand_vals < rb[1]
                # append the masks for this revleation block to the mask lists
                #
                # the guide policy (in q) gets to peek at the values that will
                # be revealed to the primary policy (in p) for the entire
                # block. The primary policy only gets to see these values at
                # end of the final step of the block. Within a given step,
                # values are revealed to q at the beginning of the step, and
                # to p at the end.
                #
                # e.g. in a revelation block with only a single step, the
                # guide policy sees the values at the beginning of the step,
                # which allows it to guide the step. the primary policy only
                # gets to see the values at the end of the step.
                #
                # i.e. a standard variational auto-encoder is equivalent to a
                # sequential revelation and refinement model with only one
                # revelation block, which has one step and a reveal rate of
                # 1.0.
                #
                for refine_step in range(rb[0] - 1):
                    pm_list.append(zero_mask)
                    qm_list.append(rand_mask)
                pm_list.append(rand_mask)
                qm_list.append(rand_mask)
            # concatenate each mask list into a 3-tensor
            pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX')
            qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Construct the negative log-likelihood part of free energy.
        -- only check NLL where nll_mask == 1
        """
        xh = self._from_si_to_x(si)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask)
        else:
            ll_costs = log_prob_gaussian2(
                xo, xh, log_vars=self.bounded_logvar, mask=nll_mask)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) -- assuming bernoullish outputs
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        kld_s = ((x_i * (T.log(x_i) - T.log(x_j))) +
                 ((1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))))
        sum_kld = T.sum(kld_s, axis=1)
        return sum_kld

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KL term is raised to the power p before being summed.
        Returns [kld_pi, kld_qi, kld_gi, kld_si], each summed over steps.
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        kld_sis = []
        s0 = 0.0 * self.si[0] + self.s0
        for i in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
            if i == 0:
                kld_sis.append(self._construct_kld_s(self.si[i], s0))
            else:
                kld_sis.append(
                    self._construct_kld_s(self.si[i], self.si[i - 1]))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        kld_si = sum(kld_sis)
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.

        Returns the sum of squared entries over all joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xo],
            outputs=[nll, kld],
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XO.shape[0], ))
            kld_sum = np.zeros((XO.shape[0], ))
            # cast once, like the other compiled-function wrappers do
            XO_fX = to_fX(XO)
            for i in range(sample_count):
                result = fe_term_sample(XO_fX)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p,
                          self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[xo],
            outputs=all_step_costs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # make a function for computing batch-based estimates of costs.
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            _all_costs = cost_func(to_fX(XO))
            _kld_q2p = np.sum(
                np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(
                np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(
                np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(
                np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray([k for k in _step_klds]))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray([k for k in _step_nlls]))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function returns [joint_cost, nll_bound, nll_cost,
        kld_cost, reg_cost, obs_costs] and applies joint_updates.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask trajectories.

        The returned sample_func(XO, use_guide_policy=False) gives
        [xm_seq, xi_seq, mi_seq], each with one entry per step (including
        the initial state).
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i])
                  for i in range(self.total_steps)]
        masks = [self.m0_full] + \
                [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only the scan updates are applied
        # here (as in the other diagnostic functions): using joint_updates
        # would run an optimizer step on the parameters for every batch of
        # samples drawn.
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # get belief states and masks generated by the scan loop
            scan_vals = func(to_fX(XO))
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            # outputs are laid out as all states followed by all masks
            for i in range(step_count):
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # masked positions take the observed value, others the
                # model's current state
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled in order: self.params, a numpy copy of
        self.shared_param_dicts, and a dict of the child models' dicts.
        """
        assert f_name is not None
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = \
            self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = \
            self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['p_x_given_si'] = \
            self.p_x_given_si.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = \
            self.q_zi_given_xi.save_to_dict()
        # open() works on both Python 2 and 3 (file() is Python 2 only), and
        # the with-block closes the handle even if a dump raises
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" python
            # values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    The network has a "shared" stack of layers whose output feeds two
    separate stacks: one producing the mean ("mu") and one producing the
    log-variance ("sigma").

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputs
        params: a dict of parameters describing the desired network:
            build_theano_funcs: whether to compile the sampling functions
                                (optional, default True)
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
            -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
            -- note: input_noise/bias_noise are optional, defaults 0.0/0.0
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
                        (optional, default relu_actfun)
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
                        (optional, default 1.0)
            sigma_init_scale: stored on the instance but not otherwise used
                              by this class (optional, default 1.0)
        shared_param_dicts: parameters for the MLP controlled by this InfNet
    """
    def __init__(self,
                 rng=None,
                 Xd=None,
                 params=None,
                 shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.build_theano_funcs = params.get('build_theano_funcs', True)
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        self.sigma_init_scale = params.get('sigma_init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        # Build the three layer stacks. The order (shared, mu, sigma) matters
        # because every layer construction is handed the same rng.
        # The shared part takes the network input, and only its first layer
        # gets the "visible" drop rate and the input noise.
        self.shared_layers = self._build_layer_group(
            rng, 'shared', "share_layer", self.shared_config, self.Xd,
            noisy_first_layer=True, zero_last_scale=False)
        # The mu part takes input from the output of the shared network
        self.mu_layers = self._build_layer_group(
            rng, 'mu', "mu_layer", self.mu_config,
            self.shared_layers[-1].output,
            noisy_first_layer=False, zero_last_scale=False)
        # The sigma part also takes input from the shared network, and its
        # last layer (the logvar predictions) gets a weight scale of 0
        self.sigma_layers = self._build_layer_group(
            rng, 'sigma', "sigma_layer", self.sigma_config,
            self.shared_layers[-1].output,
            noisy_first_layer=False, zero_last_scale=True)
        # Create a shared parameter for rescaling posterior "sigmas" to allow
        # control over the velocity of the markov chain generated by repeated
        # cycling through the INF -> GEN loop.
        if 'sigma_scale' not in self.shared_param_dicts['sigma'][-1]:
            # we use a hack-ish check to remain compatible with loading models
            # that were saved before the addition of the sigma_scale param.
            zero_ary = to_fX(np.zeros((1,)))
            self.sigma_scale = theano.shared(value=zero_ary)
            new_dict = {'sigma_scale': self.sigma_scale}
            self.shared_param_dicts['sigma'].append(new_dict)
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was made
            # after adding the sigma_scale param, so use its sigma_scale
            self.sigma_scale = \
                self.shared_param_dicts['sigma'][-1]['sigma_scale']
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.shared_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.mu_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.sigma_layers:
            self.mlp_params.extend(layer.params)
        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mean, self.output_logvar, self.output_samples = \
            self.apply(Xd)
        self.output = self.output_samples
        self.out_dim = self.sigma_layers[-1].out_dim
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd],
                                                  outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        ########################################################
        # CONSTRUCT FUNCTIONS FOR RICA PRETRAINING INPUT LAYER #
        ########################################################
        self.rica_func = None
        self.W_rica = self.shared_layers[0].W
        return

    def _build_layer_group(self, rng, group, name_prefix, config,
                           group_input, noisy_first_layer=False,
                           zero_last_scale=False):
        """
        Build one stack of HiddenLayer objects from a list of layer configs.

        Parameters:
            rng: generator handed to every HiddenLayer constructor
            group: key into self.shared_param_dicts ('shared'/'mu'/'sigma')
            name_prefix: layers are named "<name_prefix>_<layer index>"
            config: list of layer descriptions; consecutive entries give the
                    (input, output) description for each layer. An entry is
                    either a dimension or a list/tuple whose element 0 is
                    the dimension and, for outputs, element 1 the pool size.
            group_input: symbolic input for the first layer of the stack
            noisy_first_layer: if True, the first layer uses self.vis_drop
                               and self.input_noise, with no bias noise
            zero_last_scale: if True, the last layer gets W_scale = 0
        Returns:
            the list of constructed layers, in order.
        """
        # materialize the pairs, so len() works even when zip is lazy
        layer_def_pairs = list(zip(config[:-1], config[1:]))
        last_layer_num = len(layer_def_pairs) - 1
        layers = []
        next_input = group_input
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_layer_num)
            l_name = (name_prefix + "_{0:d}").format(layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if noisy_first_layer and first_layer:
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_scale and last_layer:
                # set in-bound weights for logvar predictions to 0
                i_scale = 0.0 * i_scale
            if not self.is_clone:
                # Initialize a layer with new parameters, and record them
                new_layer = HiddenLayer(
                    rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise,
                    in_dim=in_dim, out_dim=out_dim,
                    name=l_name, W_scale=i_scale)
                self.shared_param_dicts[group].append(
                    new_layer.shared_param_dicts)
            else:
                # Initialize a layer with some shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                new_layer = HiddenLayer(
                    rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise,
                    in_dim=in_dim, out_dim=out_dim,
                    W=init_params['W'], b=init_params['b'],
                    b_in=init_params['b_in'], s_in=init_params['s_in'],
                    name=l_name, W_scale=i_scale)
            layers.append(new_layer)
            next_input = new_layer.output
        return layers

    def _apply_branch(self, layers, branch_input):
        """
        Pass branch_input through layers and return the branch output.

        The output is the first value returned by the last layer's apply(),
        rather than the third value that is chained between layers.
        """
        acts = [branch_input]
        for layer in layers:
            r0, r1, layer_acts = layer.apply(acts[-1])
            acts.append(layer_acts)
        # use linear output at last layer; the last layer is deliberately
        # applied a second time, exactly as before, so the constructed graph
        # is unchanged
        layer_acts, r0, r1 = layers[-1].apply(acts[-2])
        return layer_acts

    def apply(self, X, do_samples=True):
        """
        Pass input X through this InfNet and get the resulting Gaussian
        conditional distribution.

        Returns [output_mean, output_logvar], with output_samples appended
        when do_samples is True.
        """
        # pass activations through the shared layers
        shared_out = self.apply_shared(X)
        # pass activations through the mean estimating layers
        output_mean = self._apply_branch(self.mu_layers, shared_out)
        # pass activations through the logvar estimating layers
        output_logvar = self._apply_branch(self.sigma_layers, shared_out)
        # wrap them up for easy returnage
        result = [output_mean, output_logvar]
        if do_samples:
            # mean + (sigma_scale * exp(0.5 * logvar)) * standard normal
            output_samples = output_mean + \
                ((self.sigma_scale[0] * T.exp(0.5*output_logvar)) *
                 self.rng.normal(size=output_mean.shape, avg=0.0, std=1.0,
                                 dtype=theano.config.floatX))
            result.append(output_samples)
        return result

    def apply_shared(self, X):
        """
        Pass input X through this InfNet's shared layers.
        """
        # pass activations through the shared layers
        shared_acts = [X]
        for layer in self.shared_layers:
            r0, r1, layer_acts = layer.apply(shared_acts[-1])
            shared_acts.append(layer_acts)
        result = shared_acts[-1]
        return result

    def train_rica(self, X, lr, lam):
        """
        Run one RICA pretraining update on the first shared layer's weights.

        The theano function is compiled lazily on the first call and cached
        in self.rica_func.

        Parameters:
            X: input matrix
            lr: learning rate for the gradient step on self.W_rica
            lam: weight on the sparsity cost
        Returns:
            [rica_cost, recon_cost, spars_cost] from the compiled function.
        """
        if self.rica_func is None:
            l_rate = T.scalar()
            lam_l1 = T.scalar()
            X_in = T.matrix('in_X_in')
            # perturb the weights with a little gaussian noise
            W_in = self.W_rica + self.rng.normal(size=self.W_rica.shape,
                    avg=0.0, std=0.01, dtype=theano.config.floatX)
            X_enc = X_in
            H_rec = T.dot(X_enc, W_in)
            X_rec = T.dot(H_rec, W_in.T)
            recon_cost = T.sum((X_enc - X_rec)**2.0) / X_enc.shape[0]
            spars_cost = lam_l1 * (T.sum(soft_abs(H_rec)) / H_rec.shape[0])
            rica_cost = recon_cost + spars_cost
            dW = T.grad(rica_cost, self.W_rica)
            rica_updates = {self.W_rica: self.W_rica - (l_rate * dW)}
            rica_outputs = [rica_cost, recon_cost, spars_cost]
            self.rica_func = theano.function([X_in, l_rate, lam_l1],
                                             outputs=rica_outputs,
                                             updates=rica_updates)
        outputs = self.rica_func(X, lr, lam)
        return outputs

    def set_sigma_scale(self, sigma_scale=1.0):
        """
        Set the posterior sigma rescaling shared parameter to some value.
        """
        zero_ary = np.zeros((1,))
        new_scale = zero_ary + sigma_scale
        self.sigma_scale.set_value(to_fX(new_scale))
        return

    def set_bias_noise(self, bias_noise=0.0):
        """
        Set the bias noise in all hidden layers to the given value.
        """
        new_ary = np.zeros((1,)) + bias_noise
        new_bn = to_fX(new_ary)
        for layer_group in (self.shared_layers, self.mu_layers,
                            self.sigma_layers):
            for layer in layer_group:
                layer.bias_noise.set_value(new_bn)
        return

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function([self.Xd],
                                  outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0, b_std=1e-2):
        """
        Initialize the biases in hidden layers to b_init plus gaussian noise
        with standard deviation b_std.

        The last layer of the mu stack and of the sigma stack are skipped.
        """
        # same visiting order as before: shared, then mu, then sigma
        target_layers = list(self.shared_layers) + \
            list(self.mu_layers[:-1]) + list(self.sigma_layers[:-1])
        for layer in target_layers:
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            b_vec = b_vec + (b_std * npr.randn(*b_vec.shape))
            layer.b.set_value(to_fX(b_vec))
        return

    def shared_param_clone(self, rng=None, Xd=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = InfNet(rng=rng, Xd=Xd, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net

    def forked_param_clone(self, rng=None, Xd=None):
        """
        Return a clone of this network, with forked copies of the current
        shared parameters of this InfNet, with different symbolic inputs too.
        """
        new_spds = {}
        # shared param dicts is nested like: dict of list of dicts
        # i.e., spd[k] is a list and spd[k][i] is a dict
        for k1, layer_dicts in self.shared_param_dicts.items():
            new_spds[k1] = []
            for layer_dict in layer_dicts:
                forked_dict = {}
                for k2, old_sp in layer_dict.items():
                    # copy the current value into a new shared variable
                    old_sp_forked = old_sp.get_value(borrow=False)
                    forked_dict[k2] = theano.shared(value=old_sp_forked)
                new_spds[k1].append(forked_dict)
        clone_net = InfNet(rng=rng, Xd=Xd, params=self.params,
                           shared_param_dicts=new_spds)
        return clone_net

    def _numpy_param_dicts(self):
        """
        Return a copy of self.shared_param_dicts, with numpy arrays in place
        of the theano shared variables.
        """
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {}
                for key in shared_dict:
                    numpy_dict[key] = shared_dict[key].get_value(borrow=False)
                numpy_param_dicts[layer_group].append(numpy_dict)
        return numpy_param_dicts

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later. We'll pickle everything required to create a clone of
        this model given the pickle and the rng/Xd params to the cloning
        function: "InfNet.shared_param_clone()".

        Two objects are pickled to f_name, in this order: self.params, then
        the numpy version of self.shared_param_dicts.
        """
        assert(not (f_name is None))
        numpy_param_dicts = self._numpy_param_dicts()
        # open() exists in both Python 2 and 3 (file() is Python 2 only), and
        # the with-block closes the handle even if a dump raises
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        return

    def save_to_dict(self):
        """
        Dump important stuff to a dict that can reboot the model.

        Returns a dict with keys 'params' (self.params itself, not a copy)
        and 'numpy_param_dicts' (numpy copies of the shared parameters).
        """
        model_dict = {}
        # dump the dict self.params, which just holds "simple" python values
        model_dict['params'] = self.params
        # dump the numpy version of self.shared_param_dicts
        model_dict['numpy_param_dicts'] = self._numpy_param_dicts()
        return model_dict
class HiddenLayer(object):
    """
    A single layer: optional input noise and dropout, an affine map,
    additive noise on the pre-activation, then an activation function.

    Parameters:
        rng: generator used to seed this layer's random stream (through
             rng.randint) and to draw initial weights (through
             rng.standard_normal)
        input: symbolic input to this layer
        in_dim: input dimension; the number of rows of W
        out_dim: output dimension of this layer
        activation: non-linearity to apply; when not given, relu_actfun is
                    used if pool_size <= 1 and maxout_actfun otherwise
        pool_size: number of filters per output unit; values <= 1 mean that
                   no pooling is set up
        drop_rate: probability of dropping each input element; only applied
                   when it exceeds 1e-4
        input_noise: scale of gaussian noise added to the input; only
                     applied when it exceeds 1e-4
        bias_noise: scale of gaussian noise added to the pre-activation
        W: shared variable for the weights, or None to create a new one
        b: shared variable for the biases, or None to create a new one
        use_bias: whether b is added to the pre-activation and included in
                  self.params
        name: prefix for the names of newly created shared variables
    """
    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None,
                 use_bias=True, name=""):
        # Setup a shared random generator for this layer
        self.srng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + \
                (input_noise * self.srng.normal(size=input.shape,
                                                dtype=theano.config.floatX))
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps pool_count an int, so that it can be passed
        # to range() below whichever way "/" behaves
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = np.asarray(0.04 * rng.standard_normal(
                    size=(self.in_dim, self.filt_count)),
                    dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                for g_num in range(self.pool_count):
                    # each filter is its group's base filter plus a smaller
                    # perturbation
                    g_filt = 0.01 * rng.standard_normal(size=(self.in_dim, 1))
                    for f_num in range(self.pool_size):
                        f_filt = g_filt + (0.005 * rng.standard_normal(
                            size=(self.in_dim, 1)))
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        if use_bias:
            self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        else:
            self.linear_output = T.dot(self.noisy_input, self.W)

        # Add noise to the pre-activation features; the noise term is always
        # part of the expression, scaled by bias_noise
        self.noisy_linear = self.linear_output + \
            (bias_noise * self.srng.normal(size=self.linear_output.shape,
                                           dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
            self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
            self.output.shape[1]

        # Conveniently package layer parameters
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p: an element
        # is kept when its uniform draw exceeds p
        noise_rnd = self.srng.uniform(input.shape, low=0.0, high=1.0,
                                      dtype=theano.config.floatX)
        drop_mask = noise_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # add zero-mean gaussian noise with standard deviation noise_lvl
        P_nz = P + self.srng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                    dtype=theano.config.floatX)
        return P_nz
class InfNet(object): """ A net that tries to infer an approximate posterior for some observation, given some deep, directed generative model. The output of this network comprises two constructs: an approximate mean vector and an approximate standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior. Parameters: rng: a numpy.random RandomState object Xd: symbolic input matrix for inputs prior_sigma: standard deviation of isotropic Gaussian prior that our inferred posteriors will be penalized for deviating from. params: a dict of parameters describing the desired network: lam_l2a: L2 regularization weight on neuron activations vis_drop: drop rate to use on observable variables hid_drop: drop rate to use on hidden layer activations -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0 input_noise: standard dev for noise on the input of this net bias_noise: standard dev for noise on the biases of hidden layers shared_config: list of "layer descriptions" for shared part mu_config: list of "layer descriptions" for mu part sigma_config: list of "layer descriptions" for sigma part activation: "function handle" for the desired non-linearity init_scale: scaling factor for hidden layer weights (__ * 0.01) encoder: a function that will be applied to inputs prior to passing them through the network. 
this can be used for in-lining, e.g., PCA preprocessing on training data shared_param_dicts: parameters for the MLP controlled by this InfNet """ def __init__(self, \ rng=None, \ Xd=None, \ prior_sigma=None, \ params=None, \ shared_param_dicts=None): # Setup a shared random generator for this network self.rng = RandStream(rng.randint(1000000)) # Grab the symbolic input matrix self.Xd = Xd self.prior_sigma = prior_sigma ##################################################### # Process user-supplied parameters for this network # ##################################################### self.params = params self.lam_l2a = params['lam_l2a'] if 'build_theano_funcs' in params: self.build_theano_funcs = params['build_theano_funcs'] else: self.build_theano_funcs = True if 'vis_drop' in params: self.vis_drop = params['vis_drop'] else: self.vis_drop = 0.0 if 'hid_drop' in params: self.hid_drop = params['hid_drop'] else: self.hid_drop = 0.0 if 'input_noise' in params: self.input_noise = params['input_noise'] else: self.input_noise = 0.0 if 'bias_noise' in params: self.bias_noise = params['bias_noise'] else: self.bias_noise = 0.0 if 'init_scale' in params: self.init_scale = params['init_scale'] else: self.init_scale = 1.0 if 'encoder' in params: self.encoder = params['encoder'] self.decoder = params['decoder'] self.use_encoder = True self.Xd_encoded = self.encoder(self.Xd) else: self.encoder = lambda x: x self.decoder = lambda x: x self.use_encoder = False self.Xd_encoded = self.encoder(self.Xd) if 'kld2_scale' in params: self.kld2_scale = params['kld2_scale'] else: self.kld2_scale = 0.0 if 'sigma_init_scale' in params: self.sigma_init_scale = params['sigma_init_scale'] else: self.sigma_init_scale = 1.0 # Check if the params for this net were given a priori. This option # will be used for creating "clones" of an inference network, with all # of the network parameters shared between clones. 
if shared_param_dicts is None: # This is not a clone, and we will need to make a dict for # referring to the parameters of each network layer self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []} self.is_clone = False else: # This is a clone, and its layer parameters can be found by # referring to the given param dict (i.e. shared_param_dicts). self.shared_param_dicts = shared_param_dicts self.is_clone = True # Get the configuration/prototype for this network. The config is a # list of layer descriptions, including a description for the input # layer, which is typically just the dimension of the inputs. So, the # depth of the mlp is one less than the number of layer configs. self.shared_config = params['shared_config'] self.mu_config = params['mu_config'] self.sigma_config = params['sigma_config'] if 'activation' in params: self.activation = params['activation'] else: self.activation = relu_actfun ######################################### # Initialize the shared part of network # ######################################### self.shared_layers = [] layer_def_pairs = zip(self.shared_config[:-1],self.shared_config[1:]) layer_num = 0 # Construct input to the inference network if self.use_encoder: next_input = self.encoder(self.Xd) else: next_input = self.Xd for in_def, out_def in layer_def_pairs: first_layer = (layer_num == 0) last_layer = (layer_num == (len(layer_def_pairs) - 1)) l_name = "share_layer_{0:d}".format(layer_num) if (type(in_def) is list) or (type(in_def) is tuple): # Receiving input from a poolish layer... in_dim = in_def[0] else: # Receiving input from a normal layer... in_dim = in_def if (type(out_def) is list) or (type(out_def) is tuple): # Applying some sort of pooling in this layer... out_dim = out_def[0] pool_size = out_def[1] else: # Not applying any pooling in this layer... 
out_dim = out_def pool_size = 0 # Select the appropriate noise to add to this layer if first_layer: d_rate = self.vis_drop else: d_rate = self.hid_drop if first_layer: i_noise = self.input_noise b_noise = 0.0 else: i_noise = 0.0 b_noise = self.bias_noise # set in-bound weights to have norm self.init_scale i_scale = self.init_scale if not self.is_clone: ########################################## # Initialize a layer with new parameters # ########################################## new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ name=l_name, W_scale=i_scale) self.shared_layers.append(new_layer) self.shared_param_dicts['shared'].append( \ {'W': new_layer.W, 'b': new_layer.b, \ 'b_in': new_layer.b_in, 's_in': new_layer.s_in}) else: ################################################## # Initialize a layer with some shared parameters # ################################################## init_params = self.shared_param_dicts['shared'][layer_num] if not (('b_in' in init_params) and ('s_in' in init_params)): init_params['b_in'] = None init_params['s_in'] = None new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ W=init_params['W'], b=init_params['b'], \ b_in=init_params['b_in'], s_in=init_params['s_in'], \ name=l_name, W_scale=i_scale) self.shared_layers.append(new_layer) if ((init_params['b_in'] is None) or (init_params['s_in'] is None)): init_params['b_in'] = new_layer.b_in init_params['s_in'] = new_layer.s_in next_input = self.shared_layers[-1].output # Acknowledge layer completion layer_num = layer_num + 1 ##################################### # Initialize the mu part of network # ##################################### self.mu_layers = [] layer_def_pairs = 
zip(self.mu_config[:-1],self.mu_config[1:]) layer_num = 0 # Take input from the output of the shared network next_input = self.shared_layers[-1].output for in_def, out_def in layer_def_pairs: first_layer = (layer_num == 0) last_layer = (layer_num == (len(layer_def_pairs) - 1)) l_name = "mu_layer_{0:d}".format(layer_num) if (type(in_def) is list) or (type(in_def) is tuple): # Receiving input from a poolish layer... in_dim = in_def[0] else: # Receiving input from a normal layer... in_dim = in_def if (type(out_def) is list) or (type(out_def) is tuple): # Applying some sort of pooling in this layer... out_dim = out_def[0] pool_size = out_def[1] else: # Not applying any pooling in this layer... out_dim = out_def pool_size = 0 # Select the appropriate noise to add to this layer d_rate = self.hid_drop i_noise = 0.0 b_noise = self.bias_noise # set in-bound weights to have norm self.init_scale i_scale = self.init_scale if not self.is_clone: ########################################## # Initialize a layer with new parameters # ########################################## new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ name=l_name, W_scale=i_scale) self.mu_layers.append(new_layer) self.shared_param_dicts['mu'].append( \ {'W': new_layer.W, 'b': new_layer.b, \ 'b_in': new_layer.b_in, 's_in': new_layer.s_in}) else: ################################################## # Initialize a layer with some shared parameters # ################################################## init_params = self.shared_param_dicts['mu'][layer_num] if not (('b_in' in init_params) and ('s_in' in init_params)): init_params['b_in'] = None init_params['s_in'] = None new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ 
W=init_params['W'], b=init_params['b'], \ b_in=init_params['b_in'], s_in=init_params['s_in'], \ name=l_name, W_scale=i_scale) self.mu_layers.append(new_layer) if ((init_params['b_in'] is None) or (init_params['s_in'] is None)): init_params['b_in'] = new_layer.b_in init_params['s_in'] = new_layer.s_in next_input = self.mu_layers[-1].output # Acknowledge layer completion layer_num = layer_num + 1 ######################################## # Initialize the sigma part of network # ######################################## self.sigma_layers = [] layer_def_pairs = zip(self.sigma_config[:-1],self.sigma_config[1:]) layer_num = 0 # Take input from the output of the shared network next_input = self.shared_layers[-1].output for in_def, out_def in layer_def_pairs: first_layer = (layer_num == 0) last_layer = (layer_num == (len(layer_def_pairs) - 1)) l_name = "sigma_layer_{0:d}".format(layer_num) if (type(in_def) is list) or (type(in_def) is tuple): # Receiving input from a poolish layer... in_dim = in_def[0] else: # Receiving input from a normal layer... in_dim = in_def if (type(out_def) is list) or (type(out_def) is tuple): # Applying some sort of pooling in this layer... out_dim = out_def[0] pool_size = out_def[1] else: # Not applying any pooling in this layer... 
out_dim = out_def pool_size = 0 # Select the appropriate noise to add to this layer d_rate = self.hid_drop i_noise = 0.0 b_noise = self.bias_noise # set in-bound weights to have norm self.init_scale i_scale = self.init_scale if last_layer: # set in-bound weights for logvar predictions to 0 i_scale = 0.0 * i_scale if not self.is_clone: ########################################## # Initialize a layer with new parameters # ########################################## new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ name=l_name, W_scale=i_scale) self.sigma_layers.append(new_layer) self.shared_param_dicts['sigma'].append( \ {'W': new_layer.W, 'b': new_layer.b, \ 'b_in': new_layer.b_in, 's_in': new_layer.s_in}) else: ################################################## # Initialize a layer with some shared parameters # ################################################## init_params = self.shared_param_dicts['sigma'][layer_num] if not (('b_in' in init_params) and ('s_in' in init_params)): init_params['b_in'] = None init_params['s_in'] = None new_layer = HiddenLayer(rng=rng, input=next_input, \ activation=self.activation, pool_size=pool_size, \ drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \ in_dim=in_dim, out_dim=out_dim, \ W=init_params['W'], b=init_params['b'], \ b_in=init_params['b_in'], s_in=init_params['s_in'], \ name=l_name, W_scale=i_scale) self.sigma_layers.append(new_layer) if ((init_params['b_in'] is None) or (init_params['s_in'] is None)): init_params['b_in'] = new_layer.b_in init_params['s_in'] = new_layer.s_in next_input = self.sigma_layers[-1].output # Acknowledge layer completion layer_num = layer_num + 1 # Create a shared parameter for rescaling posterior "sigmas" to allow # control over the velocity of the markov chain generated by repeated # cycling through the INF -> GEN loop. 
if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]): # we use a hack-ish check to remain compatible with loading models # that were saved before the addition of the sigma_scale param. zero_ary = np.zeros((1,)).astype(theano.config.floatX) self.sigma_scale = theano.shared(value=zero_ary) new_dict = {'sigma_scale': self.sigma_scale} self.shared_param_dicts['sigma'].append(new_dict) self.set_sigma_scale(1.0) else: # this is a clone of some other InfNet, and that InfNet was made # after adding the sigma_scale param, so use its sigma_scale self.sigma_scale = \ self.shared_param_dicts['sigma'][-1]['sigma_scale'] # Create a shared parameter for maintaining an exponentially decaying # estimate of the population mean of posterior KL divergence. if not ('kld_mean' in self.shared_param_dicts['sigma'][-1]): # add a kld_mean if none was already present zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0 self.kld_mean = theano.shared(value=zero_ary) self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean else: # use a kld_mean that's already present self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean'] # Mash all the parameters together, into a list. self.mlp_params = [] for layer in self.shared_layers: self.mlp_params.extend(layer.params) for layer in self.mu_layers: self.mlp_params.extend(layer.params) for layer in self.sigma_layers: self.mlp_params.extend(layer.params) # The output of this inference network is given by the noisy output # of the final layers of its mu and sigma networks. self.output_mean = self.mu_layers[-1].linear_output self.output_logvar = self.sigma_layers[-1].linear_output self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \ T.exp(0.5 * self.output_logvar) # We'll also construct an output containing a single samples from each # of the distributions represented by the rows of self.output_mean and # self.output_sigma. 
self.output = self._construct_post_samples() self.out_dim = self.sigma_layers[-1].out_dim # Get simple regularization penalty to moderate activation dynamics self.act_reg_cost = self.lam_l2a * self._act_reg_cost() # Construct a function for penalizing KL divergence between the # approximate posteriors produced by this model and some isotropic # Gaussian distribution. self.kld_cost = self._construct_kld_cost() self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \ (0.02 * T.mean(self.kld_cost)), 'floatX') # Construct a theano function for sampling from the approximate # posteriors inferred by this model for some collection of points # in the "data space". if self.build_theano_funcs: self.sample_posterior = self._construct_sample_posterior() self.mean_posterior = theano.function([self.Xd], \ outputs=self.output_mean) else: self.sample_posterior = None self.mean_posterior = None return def set_sigma_scale(self, sigma_scale=1.0): """ Set the posterior sigma rescaling shared parameter to some value. """ zero_ary = np.zeros((1,)) new_scale = zero_ary + sigma_scale self.sigma_scale.set_value(new_scale.astype(theano.config.floatX)) return def _act_reg_cost(self): """ Apply L2 regularization to the activations in each net. """ act_sq_sums = [] for layer in self.shared_layers: act_sq_sums.append(layer.act_l2_sum) for layer in self.mu_layers: act_sq_sums.append(layer.act_l2_sum) for layer in self.sigma_layers: act_sq_sums.append(layer.act_l2_sum) full_act_sq_sum = T.sum(act_sq_sums) return full_act_sq_sum def _construct_post_samples(self): """ Draw a single sample from each of the approximate posteriors encoded in self.output_mean and self.output_sigma. 
""" post_samples = self.output_mean + (self.output_sigma * \ self.rng.normal(size=self.output_sigma.shape, avg=0.0, std=1.0, \ dtype=theano.config.floatX)) return post_samples def _construct_kld_cost(self): """ Compute (analytically) the KL divergence between each approximate posterior encoded by self.mu/self.sigma and the isotropic Gaussian distribution with mean 0 and standard deviation self.prior_sigma. """ prior_mu = 0.0 prior_logvar = np.log(self.prior_sigma**2.0) post_klds = gaussian_kld(self.output_mean, self.output_logvar, \ prior_mu, prior_logvar) kld_cost = T.sum(post_klds, axis=1, keepdims=True) return kld_cost def _construct_sample_posterior(self): """ Construct a sampler that draws a single sample from the inferred posterior for some set of inputs. """ psample = theano.function([self.Xd], \ outputs=self.output) return psample def init_biases(self, b_init=0.0, b_std=1e-2): """ Initialize the biases in all hidden layers to some constant. """ for layer in self.shared_layers: b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init b_vec = b_vec + (b_std * npr.randn(*b_vec.shape)) layer.b.set_value(b_vec.astype(theano.config.floatX)) for layer in self.mu_layers[:-1]: b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init b_vec = b_vec + (b_std * npr.randn(*b_vec.shape)) layer.b.set_value(b_vec.astype(theano.config.floatX)) for layer in self.sigma_layers[:-1]: b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init b_vec = b_vec + (b_std * npr.randn(*b_vec.shape)) layer.b.set_value(b_vec.astype(theano.config.floatX)) return def shared_param_clone(self, rng=None, Xd=None, build_funcs=True): """ Return a clone of this network, with shared parameters but with different symbolic input variables. This can be used for "unrolling" a generate->infer->generate->infer... loop. Then, we can do backprop through time for various objectives. 
""" new_params = self.params new_params['build_theano_funcs'] = build_funcs clone_net = InfNet(rng=rng, Xd=Xd, \ prior_sigma=self.prior_sigma, params=self.params, \ shared_param_dicts=self.shared_param_dicts) return clone_net def save_to_file(self, f_name=None): """ Dump important stuff to a Python pickle, so that we can reload this model later. We'll pickle everything required to create a clone of this model given the pickle and the rng/Xd params to the cloning function: "InfNet.shared_param_clone()". """ assert(not (f_name is None)) f_handle = file(f_name, 'wb') # dump the "simple" python value in self.prior_sigma cPickle.dump(self.prior_sigma, f_handle, protocol=-1) # dump the dict self.params, which just holds "simple" python values cPickle.dump(self.params, f_handle, protocol=-1) # make a copy of self.shared_param_dicts, with numpy arrays in place # of the theano shared variables numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []} for layer_group in ['shared', 'mu', 'sigma']: for shared_dict in self.shared_param_dicts[layer_group]: numpy_dict = {} for key in shared_dict: numpy_dict[key] = shared_dict[key].get_value(borrow=False) numpy_param_dicts[layer_group].append(numpy_dict) # dump the numpy version of self.shared_param_dicts cPickle.dump(numpy_param_dicts, f_handle, protocol=-1) f_handle.close() return
class TwoStageModel2(object):
    r"""
    Controller for training a two-step hierarchical generative model.

    x: the "observation" variables
    z: the "prior" latent variables
    h: the "hidden" latent variables

    Generative model is:
        p(x) = \sum_{z,h} p(x|h) p(h|z) p(z)
    Variational model is:
        q(h,z|x) = q(h|x) q(z|h)

    Parameters:
        rng: numpy.random.RandomState (for reproducibility); required,
             since its randint() is used to seed this model's RandStream
        x_in: the input data to encode
        x_out: the target output to decode
        p_h_given_z: InfNet for h given z
        p_x_given_h: InfNet for x given h
        q_h_given_x: InfNet for h given x
        q_z_given_h: InfNet for z given h
        x_dim: dimension of the "observation" space
        z_dim: dimension of the "prior" latent space
        h_dim: dimension of the "hidden" latent space
        params: dict of model options
                x_type: (required) can be "bernoulli" or "gaussian"
                obs_transform: (optional) can be 'none' or 'sigmoid'.
                               Sigmoid is used when the key is absent, and
                               always when x_type is "bernoulli".
        shared_param_dicts: optional dict holding existing shared variables
                under the keys 'p_z_mean', 'p_z_logvar' and 'obs_logvar'.
                When None, fresh shared variables are created and stored
                in self.shared_param_dicts under those keys.

    NOTE: the docstring is a raw string because it contains a backslash
    (in "\sum") that is not a valid string escape.
    """
    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_h_given_z=None,
                 p_x_given_h=None,
                 q_h_given_x=None,
                 q_z_given_h=None,
                 x_dim=None,
                 z_dim=None,
                 h_dim=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this model
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or
                   (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            # bernoulli observations always use the sigmoid transform,
            # overriding any 'obs_transform' setting
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_h_given_x = q_h_given_x
        self.q_z_given_h = q_z_given_h
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this TSM
            init_vec = to_fX(np.zeros((1, self.z_dim)))
            self.p_z_mean = theano.shared(value=init_vec,
                                          name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec,
                                            name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='tsm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared variables that were handed in
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the observation log-variance through a scaled tanh, which
        # keeps it inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "hidden" latent state (from q)
        h_q_mean, h_q_logvar, h_q = \
            self.q_h_given_x.apply(self.x_in, do_samples=True)
        # samples of "prior" latent state (from q)
        z_q_mean, z_q_logvar, z_q = \
            self.q_z_given_h.apply(h_q, do_samples=True)
        # samples of "prior" latent state (from p)
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)
        z_p = (T.exp(0.5*z_p_logvar) * zmuv) + z_p_mean
        # samples from z -- switched between q/p
        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # samples of "hidden" latent state (from p)
        h_p_mean, h_p_logvar, h_p = \
            self.p_h_given_z.apply(self.z, do_samples=True)
        # samples from h -- switched between q/p
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)
        # compute KLds for "prior" and "hidden" latent distributions
        # (both directions; their weighting is set via set_lam_kld)
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)
        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary,
                                         name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary,
                                         name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        # (p_z_mean / p_z_logvar are deliberately not optimized here)
        self_params = [self.obs_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_h_given_x.mlp_params)
        child_params.extend(self.q_z_given_h.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for p, g in zip(self.joint_params, grad_list):
            self.joint_grads[p] = g

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(params=self.joint_params,
                                       grads=self.joint_grads,
                                       alpha=self.lr,
                                       beta1=self.mom_1, beta2=self.mom_2,
                                       mom2_init=1e-3, smoothing=1e-4,
                                       max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.

        The values are written into the shared variables self.lr,
        self.mom_1 and self.mom_2 (each a length-1 array).
        """
        zero_ary = np.zeros((1,))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_nll
        self.lam_nll.set_value(to_fX(new_lam))
        return

    def set_lam_kld(self, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.

        lam_kld_q2p weights the KL(q || p) terms and lam_kld_p2q weights
        the KL(p || q) terms, for both the z and h distributions.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_q2p
        self.lam_kld_q2p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_p2q
        self.lam_kld_p2q.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (below 0.5) or 1.0 (otherwise). With the
        switch at 1.0 the graph uses the samples drawn from q; at 0.0 it
        uses the samples drawn from p.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1,))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_nll_costs(self, xo):
        """
        Construct the negative log-likelihood part of free energy.

        xo is the symbolic target observation. The generator output
        self.x_gen is passed through self.obs_transform and scored with
        log_prob_bernoulli or log_prob_gaussian2 depending on self.x_type.
        Returns the negated log-likelihoods.
        """
        # map the generator output into observation space
        xh = self.obs_transform(self.x_gen)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar)
        nll_costs = -ll_costs
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the posterior KL-divergence part of cost to minimize.

        Each elementwise KLd is raised to the power p and summed over
        axis 1 (keeping dims). Returns the list
        [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q].
        """
        kld_z_q2p = T.sum(self.kld_z_q2p**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(self.kld_z_p2q**p, axis=1, keepdims=True)
        kld_h_q2p = T.sum(self.kld_h_q2p**p, axis=1, keepdims=True)
        kld_h_p2q = T.sum(self.kld_h_p2q**p, axis=1, keepdims=True)
        return [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization.

        This is the sum of squared entries over every parameter in
        self.joint_params (plain l2 regularization on the parameters).
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, br): input matrix, target
        matrix, and an integer repeat count applied to both along axis 0.
        It applies self.joint_updates and returns
        [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        br = T.lscalar()
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(inputs=[xi, xo, br],
                               outputs=outputs,
                               givens={self.x_in: xi.repeat(br, axis=0),
                                       self.x_out: xo.repeat(br, axis=0)},
                               updates=self.joint_updates)
        return func

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns a Python function fe_term_estimator(XI, XO, sample_count)
        that averages sample_count one-sample estimates and returns
        [mean_nll, mean_kld], one entry per row of XI. Only the q-to-p
        KL terms are used.
        """
        # construct values to output
        nll = self._construct_nll_costs(self.x_out)
        kld_z = self.kld_z_q2p
        kld_h = self.kld_h_q2p
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[self.x_in, self.x_out],
                                         outputs=[nll, kld_z, kld_h])

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_z_sum = np.zeros((XI.shape[0],))
            kld_h_sum = np.zeros((XI.shape[0],))
            for _ in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_z_sum += np.sum(result[1], axis=1).ravel()
                kld_h_sum += np.sum(result[2], axis=1).ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = (kld_z_sum + kld_h_sum) / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this TwoStageModel.

        Returns prior_sampler(samp_count), which temporarily puts the
        model in generation mode (train switch 0.0), draws samp_count
        samples, and restores the previous switch value even if sampling
        raises.
        """
        x_sym = T.matrix()
        # x_in / x_out are replaced by zeros; x_sym is only used for its shape
        sample_func = theano.function(
            inputs=[x_sym],
            outputs=self.obs_transform(self.x_gen),
            givens={self.x_in: T.zeros_like(x_sym),
                    self.x_out: T.zeros_like(x_sym)})

        def prior_sampler(samp_count):
            x_samps = to_fX(np.zeros((samp_count, self.x_dim)))
            # train_switch holds a length-1 array; keep its scalar value
            old_switch = float(self.train_switch.get_value(borrow=False)[0])
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                # generate samples from model
                model_samps = sample_func(x_samps)
            finally:
                # set model back to previous mode, even on failure, so a
                # sampling error cannot leave training in generation mode
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler
class HiddenLayer(object):
    """
    A fully-connected layer with optional input noise, dropout on the
    input, noise on the pre-activations, and maxout pooling.

    Parameters:
        rng: random generator providing randint(); used to seed this
             layer's RandStream
        input: symbolic input to the layer
        in_dim: dimension of the input
        out_dim: dimension of the output (after any pooling)
        activation: activation function; relu_actfun when None. Ignored
                    when pool_size > 1, in which case maxout is used.
        pool_size: maxout pool size; values <= 1 mean no pooling
        drop_rate: probability of dropping each input element
        input_noise: scale of Gaussian noise added to the input
        bias_noise: scale of Gaussian noise added to the pre-activation
        W, b, b_in, s_in: optional existing shared variables to reuse;
                          fresh ones are created for any left as None
        name: prefix used when naming the shared variables
        W_scale: gain passed to ortho_matrix when W is initialized here

    Noise and dropout are wired into the graph only if the corresponding
    constructor value exceeds 0.001; changing the shared noise variables
    later rescales existing noise but cannot switch it on.
    """
    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup shared variables controlling the noise/dropout levels
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.input_noise = theano.shared(
            value=(zero_ary+input_noise),
            name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(
            value=(zero_ary+bias_noise),
            name="{0:s}_bias_noise".format(name))
        self.drop_rate = theano.shared(
            value=(zero_ary+drop_rate),
            name="{0:s}_drop_rate".format(name))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # 0.541325 ~= log(e - 1), so softplus(s_in) starts at ~1.
            # NOTE: the softplus rescaling of the input is currently
            # disabled in apply(), so s_in and b_in do not affect output.
            ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps this an integer count on Python 3 as well
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_shape = (self.in_dim, self.filt_count)
            W_init = ortho_matrix(shape=W_shape, gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,),
                              dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Feedforward through the layer; each kind of noise is enabled
        # only when its constructor argument is meaningfully non-zero
        use_in = input_noise > 0.001
        use_bn = bias_noise > 0.001
        use_drop = drop_rate > 0.001
        self.linear_output, self.noisy_linear, self.output = \
            self.apply(input, use_in=use_in, use_bn=use_bn,
                       use_drop=use_drop)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = {
            'W': self.W,
            'b': self.b,
            'b_in': self.b_in,
            's_in': self.s_in}
        # Layer construction complete...
        return

    def apply(self, input, use_in=False, use_bn=False, use_drop=False):
        """
        Apply feedforward to this input, returning several partial results.

        use_in / use_drop / use_bn switch on input noise, input dropout and
        pre-activation noise respectively. Returns the list
        [linear_output, noisy_linear, final_output]. As a side effect the
        (possibly perturbed) input is stored in self.noisy_input.
        """
        # Add gaussian noise to the input (if desired)
        # (disabled) fancy_input = T.nnet.softplus(self.s_in) * (input + self.b_in)
        fancy_input = input
        if use_in:
            fuzzy_input = fancy_input + self.input_noise[0] * \
                self.rng.normal(size=fancy_input.shape, avg=0.0, std=1.0,
                                dtype=theano.config.floatX)
        else:
            fuzzy_input = fancy_input
        # Apply masking noise to the input (if desired)
        if use_drop:
            noisy_input = self._drop_from_input(fuzzy_input,
                                                self.drop_rate[0])
        else:
            noisy_input = fuzzy_input
        self.noisy_input = noisy_input
        # Compute linear "pre-activation" for this layer
        linear_output = T.dot(noisy_input, self.W) + self.b
        # Add noise to the pre-activation features (if desired)
        if use_bn:
            noisy_linear = linear_output + self.bias_noise[0] * \
                self.rng.normal(size=linear_output.shape, avg=0.0,
                                std=1.0, dtype=theano.config.floatX)
        else:
            noisy_linear = linear_output
        # Apply activation function
        final_output = self.activation(noisy_linear)
        # package partial results for easy return
        results = [linear_output, noisy_linear, final_output]
        return results

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
class GenConvModule(object):
    """
    Generator module: one upsampling ("fractionally strided") convolution
    followed by one regular convolution. The input to the first convolution
    can optionally be augmented with extra channels of random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """
    def __init__(self,
                 filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal'):
        filt_dim = filt_shape[0]
        assert ((filt_dim % 2) > 0), "filter dim should be odd (not even)"
        # shape bookkeeping
        self.filt_dim = filt_dim
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        # behaviour flags
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        # random stream for the augmenting channels (fixed seed)
        self.rng = RandStream(123)
        # fall back to a small-scale Gaussian initializer
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()  # initialize parameters

    def _init_params(self):
        """
        Create the filters for both convolutions, plus a gain/bias pair for
        each convolution that is followed by batch normalization. All
        created parameters are collected in self.params.
        """
        k = self.filt_dim
        # the first conv also sees the random channels when they are stacked
        # on the exogenous input
        first_in_chans = self.in_chans
        if self.use_rand:
            first_in_chans = self.in_chans + self.rand_chans
        self.w1 = self.init_func((self.out_chans, first_in_chans, k, k),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.out_chans, self.out_chans, k, k),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # gains and biases for the transforms that will get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(self.out_chans,
                                "{}_g1".format(self.mod_name))
            self.b1 = make_bias(self.out_chans,
                                "{}_b1".format(self.mod_name))
            self.params += [self.g1, self.b1]
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain(self.out_chans,
                                "{}_g2".format(self.mod_name))
            self.b2 = make_bias(self.out_chans,
                                "{}_b2".format(self.mod_name))
            self.params += [self.g2, self.b2]

    def apply(self, input, rand_vals=None):
        """
        Run this module on some input and return the output of the second
        convolution (after optional batch norm and relu).

        rand_vals optionally supplies the random channels; when it is None
        and use_rand is set, they are drawn from this module's rng.
        """
        n_rows = input.shape[0]
        pad = int((self.filt_dim - 1) / 2)  # "same" mode convolutions
        stride = self.us_stride             # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" by repeating entries along axes 2 and 3
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # shape of the random values that may augment the input
        noise_shape = (n_rows, self.rand_chans,
                       input.shape[2], input.shape[3])
        conv_input = input
        if self.use_rand:
            if rand_vals is None:
                # no values supplied -- draw fresh ones
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(
                        size=noise_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(
                        size=noise_shape, low=-1.0, high=1.0,
                        dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(noise_shape)
            # stack random values on top of input (channel axis)
            conv_input = T.concatenate([rand_vals, input], axis=1)
        # first convolution: plain conv after unpooling, otherwise a
        # fractionally strided conv that does the upsampling itself
        if self.use_pooling:
            hid = dnn_conv(conv_input, self.w1, subsample=(1, 1),
                           border_mode=(pad, pad))
        else:
            hid = deconv(conv_input, self.w1, subsample=(stride, stride),
                         border_mode=(pad, pad))
        if self.apply_bn_1:
            hid = batchnorm(hid, g=self.g1, b=self.b1)
        hid = relu(hid)
        # second convolution
        out = dnn_conv(hid, self.w2, subsample=(1, 1),
                       border_mode=(pad, pad))
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        return relu(out)
class DAELayer(object):
    """
    Denoising autoencoder layer.

    The encoder computes ``activation(X . W + b_h)`` and the decoder reuses
    the transposed encoder weights: ``A_h . W.T + b_v``. The encoder input is
    `fuzzy_input` multiplied by a random keep-mask; `clean_input` is the
    reconstruction target.

    Constructor arguments:
        rng: numpy-style generator providing `randint` and `standard_normal`
        clean_input: symbolic reconstruction target
        fuzzy_input: symbolic input that gets masked before encoding
        in_dim, out_dim: visible / hidden dimensions
        activation: callable applied to the hidden pre-activation
        input_noise: probability of dropping an element of `fuzzy_input`
        W, b_h, b_v: optional pre-built shared parameters
        W_scale: multiplier applied to freshly initialised weights
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None, W_scale=1.0):
        # Layer-local symbolic random stream, seeded from the given generator.
        self.rng = RandStream(rng.randint(1000000))
        # Keep the clean signal as the target; the encoder sees a masked copy.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)
        self.activation, self.in_dim, self.out_dim = activation, in_dim, out_dim
        float_type = theano.config.floatX
        # Build whichever parameters the caller did not supply.
        if W is None:
            gauss_draw = rng.standard_normal(size=(in_dim, out_dim))
            W_start = np.asarray(1.0 * DCG(gauss_draw, dtype=float_type))
            W = theano.shared(value=(W_scale * W_start), name='W')
        if b_h is None:
            b_h = theano.shared(value=np.zeros((out_dim,), dtype=float_type),
                                name='b_h')
        if b_v is None:
            b_v = theano.shared(value=np.zeros((in_dim,), dtype=float_type),
                                name='b_v')
        self.W, self.b_h, self.b_v = W, b_h, b_v
        # Everything an optimizer should touch.
        self.params = [self.W, self.b_h, self.b_v]
        return

    def compute_costs(self, lam_l1=None):
        """Return [reconstruction cost, sparsity cost]; `lam_l1[0]` weights the latter."""
        # Only the weights are perturbed; the hidden bias is used as-is.
        jittered_W = self._noisy_params(self.W, 0.01)
        visible, hidden = self._compute_activations(
            self.noisy_input, jittered_W, self.b_h, self.b_v)
        # Squared error, summed over everything and divided by the row count.
        sq_err = (self.clean_input - visible)**2.0
        recon_cost = T.sum(sq_err) / self.clean_input.shape[0]
        # L1 of row-normalised and column-normalised hidden activations.
        per_row = T.sum(abs(row_normalize(hidden))) / hidden.shape[0]
        per_col = T.sum(abs(col_normalize(hidden))) / hidden.shape[1]
        sparse_cost = lam_l1[0] * (per_row + per_col)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Encoder: activation applied to the affine map of X."""
        pre_act = T.dot(X, W) + b_h
        return self.activation(pre_act)

    def _compute_activations(self, X, W, b_h, b_v):
        """Run encoder then decoder; returns [visible, hidden] activations."""
        hidden = self._compute_hidden_acts(X, W, b_h)
        visible = T.dot(hidden, W.T) + b_v
        return [visible, hidden]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return P plus Gaussian noise of std `noise_lvl`, or P itself when the level is <= 1e-3."""
        if not (noise_lvl > 1e-3):
            return P
        jitter = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                 dtype=theano.config.floatX)
        return P + DCG(jitter)

    def _get_noisy_input(self, input, p):
        """Mask `input`, keeping an element when its uniform draw exceeds p."""
        uniform_draw = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                        dtype=theano.config.floatX)
        keep = uniform_draw > p
        # The mask goes through DCG before the multiply, as in the original.
        return input * DCG(keep)
class HiddenLayer(object):
    """
    Fully-connected layer with optional input noise, input dropout,
    pre-activation noise and maxout-style filter pooling.

    Constructor arguments:
        rng: numpy-style generator providing `randint` and `normal`
        input: symbolic input to the layer
        in_dim: input dimension
        out_dim: output dimension (after pooling, when pooling is used)
        activation: optional callable; when falsy, relu is used for
                    pool_size <= 1 and maxout otherwise
        pool_size: number of filters pooled per output unit (<= 1: no pooling)
        drop_rate: probability of dropping an input element
        input_noise: std of Gaussian noise added to the input
        bias_noise: std of Gaussian noise added to the pre-activation
        W, b: optional pre-built shared parameters
        name: prefix used when naming freshly created shared variables
        W_scale: multiplier applied to freshly initialised weights
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, name="", W_scale=1.0):
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape,
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input, drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division: true division yields a float on Python 3, which
        # range() rejects when building pooled filter groups.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                        maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            W_init = self._initial_filters(rng, self.in_dim, self.filt_count,
                                           self.pool_size, theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        if bias_noise > 1e-3:
            self.noisy_linear = self.linear_output + \
                    self.rng.normal(size=self.linear_output.shape,
                    avg=0.0, std=bias_noise, dtype=theano.config.floatX)
        else:
            self.noisy_linear = self.linear_output

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
                self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
                self.output.shape[1]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]
        # Layer construction complete...
        return

    @staticmethod
    def _initial_filters(rng, in_dim, filt_count, pool_size, dtype):
        """
        Draw the initial (unscaled by W_scale) weight matrix with numpy.

        For pool_size <= 1 this is 0.01 * N(0, 1) of shape
        (in_dim, filt_count). Otherwise filters are generated in groups of
        `pool_size`: each group shares a 0.01-scaled base column and each
        member adds its own 0.003-scaled perturbation, so intra-group
        correlations are stronger than inter-group correlations.

        Returns a numpy array of shape (in_dim, filt_count) and type `dtype`.
        """
        if pool_size <= 1:
            # Generate random initial filters in a typical way
            return 0.01 * np.asarray(rng.normal(size=(in_dim, filt_count)),
                                     dtype=dtype)
        columns = []
        f_size = (in_dim, 1)
        # Integer group count; the draw order (base, then members) is fixed.
        for _ in range(filt_count // pool_size):
            g_filt = 0.01 * rng.normal(size=f_size)
            for _ in range(pool_size):
                columns.append(g_filt + 0.003 * rng.normal(size=f_size))
        return np.hstack(columns).astype(dtype)

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                dtype=theano.config.floatX)
        return P_nz
class GenUniModule(object):
    """
    Generator module: random values -> linear map -> optional batch norm
    -> optional relu.

    Constructor arguments:
        rand_dim: number of random values per example
        out_dim: output dimension of the linear map
        apply_bn: whether to batch-normalize the linear output
        init_func: weight initializer; defaults to inits.Normal(scale=0.02)
        rand_type: 'normal' for Gaussian draws, anything else for uniform
        final_relu: whether to apply a relu at the end
        mod_name: prefix for parameter names
    """

    def __init__(self, rand_dim, out_dim,
                 apply_bn=True,
                 init_func=None,
                 rand_type='normal',
                 final_relu=True,
                 mod_name='dm_uni'):
        self.rand_dim, self.out_dim = rand_dim, out_dim
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        self.rng = RandStream(123)
        # Fall back on a small-variance Gaussian initializer.
        self.init_func = inits.Normal(scale=0.02) if init_func is None else init_func
        self._init_params()
        return

    def _init_params(self):
        """
        Create the weight matrix and, when batch norm is enabled, its gain
        and bias; collect them in self.params.
        """
        w_shape = (self.rand_dim, self.out_dim)
        self.w1 = self.init_func(w_shape, "{}_w1".format(self.mod_name))
        self.params = [self.w1]
        if not self.apply_bn:
            return
        # gain starts near one, bias starts at zero
        make_gain = inits.Normal(loc=1., scale=0.02)
        make_bias = inits.Constant(c=0.)
        self.g1 = make_gain((self.out_dim), "{}_g1".format(self.mod_name))
        self.b1 = make_bias((self.out_dim), "{}_b1".format(self.mod_name))
        self.params += [self.g1, self.b1]
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run the module. Supply either `batch_size` (random values are then
        drawn from self.rng) or `rand_vals` (used after reshaping to
        (rand_vals.shape[0], self.rand_dim)).
        """
        assert (batch_size is not None) or (rand_vals is not None), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape, avg=0.0, std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape, low=-1.0, high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # linear map, then the optional normalization / non-linearity
        out = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        if self.final_relu:
            out = relu(out)
        return out

##############
# EYE BUFFER #
##############
class MultiStageModel(object):
    """
    Controller for training a multi-step iterative refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_s0_given_z: InfNet for initializing "canvas" state
        p_hi_given_si: InfNet for hi given si
        p_sip1_given_si_hi: HydraNet for sip1 given si and hi
        q_z_given_x: InfNet for z given x
        q_hi_given_x_si: InfNet for hi given x and si
        obs_dim: dimension of the observations to generate
        z_dim: dimension of the "initial" latent space
        h_dim: dimension of the "primary" latent space
        ir_steps: number of "iterative refinement" steps to perform
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: optional dict with keys 'p_z_mean',
                'p_z_logvar' and 'obs_logvar' holding existing shared
                variables to reuse; when None, fresh ones are created and
                the dict is populated.

    After construction the compiled callables `train_joint`,
    `compute_raw_klds`, `compute_fe_terms`, `sample_from_prior` and
    `sample_from_input` are available as attributes.
    """
    def __init__(self, rng=None,
            x_in=None, x_out=None,
            p_s0_given_z=None,
            p_hi_given_si=None,
            p_sip1_given_si_hi=None,
            q_z_given_x=None,
            q_hi_given_x_si=None,
            obs_dim=None,
            z_dim=None, h_dim=None,
            ir_steps=4, params=None,
            shared_param_dicts=None):
        # setup a rng for this model
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        # bernoulli observations always use the sigmoid transform,
        # overriding any 'obs_transform' setting
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            # obs_logvar is initialised from the shape-(1,) zero array
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction log likelihood
        # (note: both variants already return the NEGATED log-probability)
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_gaussian2(xo, xh, \
                     log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape,
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                      self.p_z_mean, self.p_z_logvar)
        # NOTE(review): log_prob_func is already negated, so this second
        # negation appears to give a log-likelihood rather than an NLL.
        # Left unchanged; confirm intent against users of init_nlls.
        self.init_nlls = -1.0 * \
                self.log_prob_func(self.x_out, self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply(
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply(
                    T.horizontal_stack(grad_ll, sim1_obs),
                    do_samples=False)
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # p_sip1_given_si_hi is conditioned on si and hi.
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)

            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar,
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar,
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        # only the state si is recurrent; the other outputs are per-step
        init_values = [self.s0, None, None, None]

        self.scan_results, self.scan_updates = theano.scan(ir_step_func,
                outputs_info=init_values, sequences=self.hi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final refinement step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                          self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.q_updates = get_adam_updates(params=self.q_params,
                grads=self.joint_grads, alpha=self.lr_1,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(params=self.p_params,
                grads=self.joint_grads, alpha=self.lr_2,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.q_updates:
            self.joint_updates[k] = self.q_updates[k]
        for k in self.p_updates:
            self.joint_updates[k] = self.p_updates[k]
        # add scan updates, which seem to be required
        for k in self.scan_updates:
            self.joint_updates[k] = self.scan_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        return

    @staticmethod
    def _set_shared_scalar(shared_var, value):
        """Write `value` into a shape-(1,) shared variable (cast via to_fX)."""
        shared_var.set_value(to_fX(np.zeros((1,)) + value))
        return

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01,
                       mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        # set learning rates
        self._set_shared_scalar(self.lr_1, lr_1)
        self._set_shared_scalar(self.lr_2, lr_2)
        # set momentums
        self._set_shared_scalar(self.mom_1, mom_1)
        self._set_shared_scalar(self.mom_2, mom_2)
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self._set_shared_scalar(self.lam_nll, lam_nll)
        return

    def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.
        """
        self._set_shared_scalar(self.lam_kld_z, lam_kld_z)
        self._set_shared_scalar(self.lam_kld_q2p, lam_kld_q2p)
        self._set_shared_scalar(self.lam_kld_p2q, lam_kld_p2q)
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self._set_shared_scalar(self.lam_l2w, lam_l2w)
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Values below 0.5 are stored as 0.0 (sample hi from the p networks),
        everything else as 1.0 (sample hi from the q networks).
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        self._set_shared_scalar(self.train_switch, switch_val)
        return

    def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0):
        """
        Set the weight for shaping penalty on conditional priors over zt.
        """
        self._set_shared_scalar(self.lam_kld_l1l2, lam_kld_l1l2)
        return

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the probability with which elements of x_in are dropped
        (masked to zero) before being passed to q_z_given_x.
        """
        self._set_shared_scalar(self.drop_rate, drop_rate)
        return

    def _construct_zmuv_samples(self, xi, br):
        """
        Construct the necessary (symbolic) samples for computing through this
        MultiStageModel for input (symbolic) matrix X.

        Returns (z_zmuv, hi_zmuv) with shapes (rows*br, z_dim) and
        (ir_steps, rows*br, h_dim), where rows is xi.shape[0].
        """
        z_zmuv = self.rng.normal(
                size=(xi.shape[0]*br, self.z_dim),
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        hi_zmuv = self.rng.normal(
                size=(self.ir_steps, xi.shape[0]*br, self.h_dim),
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        return z_zmuv, hi_zmuv

    def _construct_nll_costs(self, si, xo):
        """
        Construct the negative log-likelihood part of free energy.
        """
        # average log-likelihood over the refinement sequence
        xh = self.obs_transform(si)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                    log_vars=self.bounded_logvar)
        nll_costs = -ll_costs
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the posterior KL-divergence part of cost to minimize.

        Each per-dimension KLd is raised to the power p before being summed
        over axis 1. Returns [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q].
        """
        kld_hi_q2ps = []
        kld_hi_p2qs = []
        for i in range(self.ir_steps):
            kld_hi_q2p = self.kldi_q2p[i]
            kld_hi_p2q = self.kldi_p2q[i]
            kld_hi_q2ps.append(T.sum(kld_hi_q2p**p,
                    axis=1, keepdims=True))
            kld_hi_p2qs.append(T.sum(kld_hi_p2q**p,
                    axis=1, keepdims=True))
        # compute the batch-wise costs
        kld_hi_q2p = sum(kld_hi_q2ps)
        kld_hi_p2q = sum(kld_hi_p2qs)
        # construct KLd cost for the distributions over z
        kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                  self.p_z_mean, self.p_z_logvar)
        kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                  self.q_z_mean, self.q_z_logvar)
        kld_z_q2p = T.sum(kld_z_q2ps**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(kld_z_p2qs**p, axis=1, keepdims=True)
        return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, br); each input row is repeated
        br times along axis 0 before being fed to the graph.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        br = T.lscalar()
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        _, hi_zmuv = self._construct_zmuv_samples(xi, br)
        func = theano.function(inputs=[ xi, xo, br ],
                outputs=outputs,
                givens={ self.x_in: xi.repeat(br, axis=0),
                         self.x_out: xo.repeat(br, axis=0),
                         self.hi_zmuv: hi_zmuv },
                updates=self.joint_updates)
        return func

    def _construct_raw_klds(self):
        """
        Construct function for computing KLd per latent dimension.
        """
        # gather step-wise costs into a single list (init costs first)
        all_step_costs = [self.init_klds, self.kldi_q2p, self.kldi_p2q]
        # compile theano function for computing all relevant costs
        inputs = [self.x_in, self.x_out, self.hi_zmuv]
        cost_func = theano.function(inputs=inputs, outputs=all_step_costs,
                updates=self.scan_updates)
        def raw_kld_computer(XI, XO):
            # noise for the refinement steps is drawn on the host here
            hi_zmuv = to_fX( npr.randn(self.ir_steps, XI.shape[0], self.h_dim) )
            _all_costs = cost_func(XI, XO, hi_zmuv)
            _init_klds = _all_costs[0]
            _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            results = [_init_klds, _kld_q2p, _kld_p2q]
            return results
        return raw_kld_computer

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned callable takes (XI, XO, sample_count) and returns
        [mean_nll, mean_kld], each averaged over sample_count passes.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        # construct values to output
        nll = self.nlli[-1]
        kld = self.kld_z.flatten() + self.kld_hi_q2p.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xi, xo ],
                outputs=[nll, kld],
                givens={self.x_in: xi,
                        self.x_out: xo,
                        self.hi_zmuv: hi_zmuv},
                updates=self.scan_updates)
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_sum = np.zeros((XI.shape[0],))
            for i in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this MultiStageModel. This function returns
        the full sequence of "partially completed" examples.
        """
        z_sym = T.matrix()
        x_sym = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
        sample_func = theano.function(inputs=[z_sym, x_sym], outputs=oputs,
                givens={ self.z: z_sym,
                         self.x_in: T.zeros_like(x_sym),
                         self.x_out: T.zeros_like(x_sym),
                         self.hi_zmuv: hi_zmuv },
                updates=self.scan_updates)
        def prior_sampler(samp_count):
            x_samps = to_fX( np.zeros((samp_count, self.obs_dim)) )
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                z_samps = to_fX( npr.randn(samp_count, self.z_dim) )
                model_samps = sample_func(z_samps, x_samps)
            finally:
                # always put the model back in its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler

    def _construct_sample_from_input(self):
        """
        Construct a function for drawing samples from the distribution
        generated by this MultiStageModel, conditioned on some inputs to the
        initial encoder stage (i.e. self.q_z_given_x). This returns the full
        sequence of "partially completed" examples.
        """
        xi = T.matrix()
        xo = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        sample_func = theano.function(inputs=[xi, xo], outputs=oputs,
                givens={ self.x_in: xi,
                         self.x_out: xo,
                         self.hi_zmuv: hi_zmuv },
                updates=self.scan_updates)
        def conditional_sampler(XI, XO=None, guided_decoding=False):
            XI = to_fX( XI )
            if XO is None:
                XO = XI
            XO = to_fX( XO )
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if guided_decoding:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's generative policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO)
            finally:
                # always put the model back in its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return conditional_sampler
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a 4D tensor shaped like
    (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    Other constructor arguments:
        rng: numpy-style generator providing `randint` and `normal`
        activation: optional callable; relu is used when falsy
        drop_rate: probability of dropping an input element
        input_noise: std of Gaussian noise added to the input
        bias_noise: std of Gaussian noise added to the convolution output
        W, b: optional pre-built shared filters / biases; when None, fresh
              ones are created (filters ~ 0.01 * W_scale * N(0, 1),
              biases = 0.1)
        name: prefix used when naming freshly created shared variables
        W_scale: multiplier applied to freshly initialised filters
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
            activation=None, drop_rate=0., input_noise=0., bias_noise=0.,
            W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape,
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input, drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        if activation:
            self.activation = activation
        else:
            self.activation = lambda x: relu_actfun(x)

        # Use the caller's filters when given (previously the W argument was
        # accepted but silently ignored); otherwise draw random ones.
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(
                    size=filt_def), dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale*W_init),
                    name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        # (same rule as for W: honour a caller-supplied value)
        if b is None:
            b_init = np.zeros((filt_def[0],), dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0) # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0) # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        if (bias_noise > 1e-4):
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                    size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                    dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2) # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + self.b.dimshuffle(
                'x', 0, 'x', 'x')
        # NOTE: linear_output is the same (possibly noisy) tensor
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]
        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                dtype=theano.config.floatX)
        return P_nz
class DAELayer(object):
    """
    Denoising autoencoder layer.

    The encoder maps its input through W and b_h followed by the activation;
    the decoder reuses the transpose of W together with b_v.

    Parameters:
        rng: random generator providing randint() and standard_normal()
        clean_input: symbolic target for the reconstruction cost
        fuzzy_input: symbolic input that gets masked to form the encoder input
        in_dim: size of the visible layer
        out_dim: size of the hidden layer
        activation: callable applied to the hidden pre-activations
        input_noise: probability of dropping each element of fuzzy_input
        W: optional shared variable with the weights (created if None)
        b_h: optional shared variable with hidden biases (created if None)
        b_v: optional shared variable with visible biases (created if None)
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None):
        # Symbolic random stream owned by this layer, seeded from rng.
        self.rng = CURAND_RandomStreams(rng.randint(1000000))
        float_type = theano.config.floatX

        # Keep the clean target and build the corrupted encoder input.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # Basic layer properties.
        self.activation, self.in_dim, self.out_dim = \
            activation, in_dim, out_dim

        # Create whichever parameters the caller did not supply.
        if W is None:
            raw_weights = 0.01 * rng.standard_normal(size=(in_dim, out_dim))
            W = theano.shared(value=np.asarray(raw_weights, dtype=float_type),
                              name='W')
        if b_h is None:
            b_h = theano.shared(value=np.zeros((out_dim, ), dtype=float_type),
                                name='b_h')
        if b_v is None:
            b_v = theano.shared(value=np.zeros((in_dim, ), dtype=float_type),
                                name='b_v')
        self.W, self.b_h, self.b_v = W, b_h, b_v

        # Learnable/optimizable parameters of the layer.
        self.params = [self.W, self.b_h, self.b_v]

    def compute_costs(self, lam_l1=None):
        """
        Build the reconstruction and activation sparsity costs.

        lam_l1 is indexed at position 0 to weight the sparsity penalty, so a
        subscriptable value must be passed despite the None default.
        Returns [recon_cost, sparse_cost].
        """
        # Only the weights are perturbed; the hidden bias is used as-is.
        perturbed_W = self._noisy_params(self.W, 0.01)
        visible_acts, hidden_acts = self._compute_activations(
            self.noisy_input, perturbed_W, self.b_h, self.b_v)

        # Squared reconstruction error, divided by the first input dimension.
        squared_err = (self.clean_input - visible_acts)**2.0
        recon_cost = T.sum(squared_err) / self.clean_input.shape[0]

        # L1 penalties on row- and column-normalized hidden activations.
        row_l1_sum = T.sum(abs(row_normalize(hidden_acts))) / \
            hidden_acts.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(hidden_acts))) / \
            hidden_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Compute activations of encoder (at hidden layer)."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Compute decoder (visible) and encoder (hidden) activations."""
        hidden = self._compute_hidden_acts(X, W, b_h)
        visible = T.dot(hidden, W.T) + b_v
        return [visible, hidden]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return P plus gaussian noise, or P itself at low noise levels."""
        if noise_lvl > 1e-3:
            return P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                       dtype=theano.config.floatX)
        # Noise level at or below the threshold: leave P untouched.
        return P

    def _get_noisy_input(self, input, p):
        """p is the probability of dropping elements of input."""
        uniform_draws = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                         dtype=theano.config.floatX)
        # Elements whose draw exceeds p are kept. The comparison result is
        # multiplied in directly; no explicit cast or rescaling is applied.
        keep_mask = uniform_draws > p
        return input * keep_mask
class SRRModel(object):
    """
    Controller for training a sequential revelation and refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for iterative refinement
        p_zi_given_xi: InfNet for stochastic part of step
        p_sip1_given_zi: HydraNet for deterministic part of step
        p_x_given_si: HydraNet for transform from s-space to x-space
        q_zi_given_xi: InfNet for the guide policy
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                s_dim: dimension of space in which to perform construction
                use_p_x_given_si: boolean for whether to use p_x_given_si
                rev_sched: list of "revelation" blocks. each block is
                           described by the number of steps prior to
                           revelation, and the percentage of remaining
                           pixels to reveal.
                rev_masks: matrix of revelation masks. the row i provides
                           the mask for iteration i of the srr loop. when
                           this argument is passed, rev_sched is ignored
                           and the revelation schedule is determined by
                           rev_masks. (The constructor reads rev_masks[0]
                           for the primary policy and rev_masks[1] for the
                           guide policy.)
                step_type: either "add" or "jump"
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid' (optional;
                               "bernoulli" x_type always uses sigmoid)
        shared_param_dicts: optional dict with keys "s0", "obs_logvar" and
                            "step_scales" holding existing shared variables.
                            When None, fresh ones are created and stored in
                            self.shared_param_dicts.
    """

    def __init__(
        self,
        rng=None,
        x_out=None,
        p_zi_given_xi=None,
        p_sip1_given_zi=None,
        p_x_given_si=None,
        q_zi_given_xi=None,
        params=None,
        shared_param_dicts=None,
    ):
        # setup a rng for this SRRModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params["x_dim"]
        self.z_dim = self.params["z_dim"]
        self.s_dim = self.params["s_dim"]
        self.use_p_x_given_si = self.params["use_p_x_given_si"]
        self.step_type = self.params["step_type"]
        self.x_type = self.params["x_type"]
        if self.use_p_x_given_si:
            print("Constructing hypotheses indirectly in s-space...")
        else:
            print("Constructing hypotheses directly in x-space...")
            # without a transform network, s-space must coincide with x-space
            assert self.s_dim == self.x_dim
        if "obs_transform" in self.params:
            assert (self.params["obs_transform"] == "sigmoid") or (
                self.params["obs_transform"] == "none"
            )
            if self.params["obs_transform"] == "sigmoid":
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == "bernoulli":
            # bernoulli observations always go through a sigmoid
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # Deal with revelation scheduling
        if ("rev_masks" in self.params) and (self.params["rev_masks"] is not None):
            rmp = self.params["rev_masks"][0].astype(theano.config.floatX)
            rmq = self.params["rev_masks"][1].astype(theano.config.floatX)
            self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p")
            self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q")
            self.rev_sched = None
            self.use_rev_masks = True
        else:
            self.rev_sched = self.params["rev_sched"]
            self.rev_masks_p = None
            self.rev_masks_q = None
            self.use_rev_masks = False
            nice_nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            # "validate" the set of revelation block descriptions
            for rev_block in self.rev_sched:
                assert rev_block[0] in nice_nums
                assert (rev_block[1] >= 0.0) and (rev_block[1] <= 1.01)
        assert (self.x_type == "bernoulli") or (self.x_type == "gaussian")
        assert (self.step_type == "add") or (self.step_type == "jump")

        # grab handles to the relevant networks
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.p_x_given_si = p_x_given_si
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this SRRModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for policy wobble
        self.p_masks = T.tensor3()  # revelation masks for primary policy
        self.q_masks = T.tensor3()  # revelation masks for guide policy
        if self.use_rev_masks:
            self.total_steps = self.params["rev_masks"][0].shape[0]
        else:
            self.total_steps = sum([rb[0] for rb in self.rev_sched])

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch")
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.s_dim,)))
            ss_init = to_fX(0.5 * np.ones((self.total_steps,)))
            self.s0 = theano.shared(value=s0_init, name="srrm_s0")
            self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar")
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales")
            self.shared_param_dicts = {}
            self.shared_param_dicts["s0"] = self.s0
            self.shared_param_dicts["obs_logvar"] = self.obs_logvar
            self.shared_param_dicts["step_scales"] = self.step_scales
        else:
            # grab the parameters required by this model from a given dict
            self.s0 = self.shared_param_dicts["s0"]
            self.obs_logvar = self.shared_param_dicts["obs_logvar"]
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = self.shared_param_dicts["step_scales"]

        ##################################################################
        # Setup the sequential revelation and refinement loop using scan #
        ##################################################################
        # ss: This is a sequence of scalars that will be used to rescale the
        #     "gradient" input to the primary and guide policies.
        #
        # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be
        #          reparametrized to sample actions from the policies.
        #
        # p_masks: This is a sequence of "unmasking" masks. When one of these
        #          masking variables is 1, the corresponding value in
        #          self.x_out will be "revealed" to the primary policy.
        #          Prediction error is measured for a value only the first
        #          time it is revealed. Once revealed, a value remains
        #          "visible" to the policy. The final step should reveal all
        #          values.
        #
        # q_masks: This is a sequence of "unmasking" masks. These are similar
        #          to p_masks, but control which values are revealed to the
        #          guide policy. The guide policy masking sequence should be
        #          constructed to stay "ahead of" the primary policy's
        #          masking sequence. The guide policy needs to know which
        #          values will be revealed to the primary policy so that it
        #          can focus its reconstruction efforts on those values.
        #          Otherwise, the guide policy will immediately reconstruct
        #          the entire target.
        #
        # si: This is the current "belief state" for each trial in the
        #     training batch. The belief state is updated in each iteration,
        #     and passed forward through the recurrence.
        #
        # mi_p: This is the current revelation mask for the primary policy.
        #
        # mi_q: This is the current revelation mask for the guide policy.
        #
        def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            # transform the current belief state into an observation
            si_as_x = self._from_si_to_x(si)
            full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)

            # get the masked belief state and gradient for primary policy
            xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
            grad_for_p = mi_p * full_grad

            # update the guide policy's revelation mask
            new_to_q = (1.0 - mi_q) * q_masks
            mip1_q = mi_q + new_to_q

            # get the masked belief state and gradient for guide policy.
            # The guide sees the same masked state as the primary policy
            # (not one masked by mip1_q); only its gradient uses mip1_q.
            xi_for_q = xi_for_p
            grad_for_q = mip1_q * full_grad

            # get samples of next zi, according to the primary policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
                T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False
            )
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)

            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False
            )
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)  # KL(p || N(0, I))

            # compute next si, given sampled zi (i.e. update the belief state)
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == "jump":
                # jump steps always do a full swap of belief state
                sip1 = si_step
            else:
                # additive steps adjust the belief state like an LSTM
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)

            # update the primary policy's revelation mask
            new_to_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + new_to_p

            # compute NLL only for the newly revealed values
            nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)

            # each loop iteration produces the following values:
            #   sip1: belief state at end of current step
            #   mip1_p: revealed values mask to use in next step (primary)
            #   mip1_q: revealed values mask to use in next step (guide)
            #   nlli: NLL for values revealed at end of current step
            #   kldi_q2p: KL(q || p) for the current step
            #   kldi_p2q: KL(p || q) for the current step
            #   kldi_p2g: KL(p || N(0,I)) for the current step
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # initialize belief state to self.s0
        self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0
        # initialize revelation masks to 0 for all values in all trials
        self.m0_full = T.zeros_like(self.x_out)
        # setup initial values to pass to scan op
        outputs_init = [self.s0_full, self.m0_full, self.m0_full, None, None, None, None]
        sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            srr_step_func, outputs_info=outputs_init, sequences=sequences_init
        )

        # grab results of the scan op. all values are computed for each step
        self.si = self.scan_results[0]  # belief states
        self.mi_p = self.scan_results[1]  # primary revelation masks
        self.mi_q = self.scan_results[2]  # guide revelation masks
        self.nlli = self.scan_results[3]  # NLL on newly revealed values
        self.kldi_q2p = self.scan_results[4]  # KL(q || p)
        self.kldi_p2q = self.scan_results[5]  # KL(p || q)
        self.kldi_p2g = self.scan_results[6]  # KL(p || N(0,I))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name="srr_lr")
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1")
        self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2")
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p")
        self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q")
        self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g")
        self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s")
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w")
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (
            (self.lam_kld_p[0] * self.kld_p)
            + (self.lam_kld_q[0] * self.kld_q)
            + (self.lam_kld_g[0] * self.kld_g)
            + (self.lam_kld_s[0] * self.kld_s)
        )
        self.kld_cost = T.mean(self.kld_costs)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)

        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, grad_list):
            self.joint_grads[param] = grad

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads,
            alpha=self.lr,
            beta1=self.mom_1,
            beta2=self.mom_2,
            mom2_init=1e-3,
            smoothing=1e-5,
            max_grad_norm=10.0,
        )
        # the training updates must also carry the updates produced by scan
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()

    def _from_si_to_x(self, si):
        """
        Convert the given si from s-space to x-space.

        When use_p_x_given_si is set, si is first passed through
        p_x_given_si (only the first returned value is used); the result is
        then passed through self.obs_transform.
        """
        if self.use_p_x_given_si:
            x_pre_trans, _ = self.p_x_given_si.apply(si)
        else:
            x_pre_trans = si
        x_post_trans = self.obs_transform(x_pre_trans)
        return x_post_trans

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1,))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_s
        self.lam_kld_s.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        The value is snapped to 0.0 (below 0.5: primary policy) or 1.0
        (otherwise: guide policy) before being stored.
        """
        if switch_val < 0.5:
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1,))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Construct the necessary ZMUV gaussian samples for generating
        trajectories from this SRRModel, for input matrix xo.
        """
        zi_zmuv = self.rng.normal(
            size=(self.total_steps, xo.shape[0], self.z_dim),
            avg=0.0,
            std=1.0,
            dtype=theano.config.floatX,
        )
        return zi_zmuv

    def _construct_rev_masks(self, xo):
        """
        Compute the sequential revelation masks for the input batch in xo.
        -- We need to construct mask sequences for both p and q.

        Returns [pmasks, qmasks].
        """
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            pmasks = self.rev_masks_p.dimshuffle(0, "x", 1).repeat(xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0, "x", 1).repeat(xo.shape[0], axis=1)
        else:
            pm_list = []
            qm_list = []
            # make a zero mask that does nothing
            zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
            # generate independently sampled masks for each revelation block
            for rb in self.rev_sched:
                # make a random binary mask with ones at rate rb[1]
                rand_vals = self.rng.uniform(
                    size=(1, xo.shape[0], xo.shape[1]),
                    low=0.0,
                    high=1.0,
                    dtype=theano.config.floatX,
                )
                rand_mask = rand_vals < rb[1]
                # append the masks for this revelation block to the mask
                # lists
                #
                # the guide policy (in q) gets to peek at the values that
                # will be revealed to the primary policy (in p) for the
                # entire block. The primary policy only gets to see these
                # values at end of the final step of the block. Within a
                # given step, values are revealed to q at the beginning of
                # the step, and to p at the end.
                #
                # e.g. in a revelation block with only a single step, the
                # guide policy sees the values at the beginning of the step,
                # which allows it to guide the step. the primary policy only
                # gets to see the values at the end of the step.
                #
                # i.e. a standard variational auto-encoder is equivalent to
                # a sequential revelation and refinement model with only one
                # revelation block, which has one step and a reveal rate of
                # 1.0.
                #
                for refine_step in range(rb[0] - 1):
                    pm_list.append(zero_mask)
                    qm_list.append(rand_mask)
                pm_list.append(rand_mask)
                qm_list.append(rand_mask)
            # concatenate each mask list into a 3-tensor
            pmasks = T.cast(T.concatenate(pm_list, axis=0), "floatX")
            qmasks = T.cast(T.concatenate(qm_list, axis=0), "floatX")
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Construct the negative log-likelihood part of free energy.
        -- only check NLL where nll_mask == 1
        """
        xh = self._from_si_to_x(si)
        if self.x_type == "bernoulli":
            ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask)
        else:
            ll_costs = log_prob_gaussian2(xo, xh, log_vars=self.bounded_logvar, mask=nll_mask)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) -- assuming bernoullish outputs
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + (
            (1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))
        )
        sum_kld = T.sum(kld_s, axis=1)
        return sum_kld

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KL term is raised to the power p, summed over axis 1,
        and then summed over steps. Returns [kld_pi, kld_qi, kld_gi, kld_si].
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        kld_sis = []
        # "0.0 * self.si[0] +" gives self.s0 the same shape as a step's state
        s0 = 0.0 * self.si[0] + self.s0
        for i in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i] ** p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i] ** p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i] ** p, axis=1))
            if i == 0:
                kld_sis.append(self._construct_kld_s(self.si[i], s0))
            else:
                kld_sis.append(self._construct_kld_s(self.si[i], self.si[i - 1]))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        kld_si = sum(kld_sis)
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization: the sum of
        squared values over every entry of self.joint_params.
        """
        param_reg_cost = sum([T.sum(p ** 2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xo],
            outputs=[nll, kld],
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            try:
                # same input conversion the other compiled-function
                # wrappers in this class apply
                XO_fx = to_fX(XO)
                # compute a multi-sample estimate of variational free-energy
                nll_sum = np.zeros((XO.shape[0],))
                kld_sum = np.zeros((XO.shape[0],))
                for _ in range(sample_count):
                    result = fe_term_sample(XO_fx)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
                mean_nll = nll_sum / float(sample_count)
                mean_kld = kld_sum / float(sample_count)
            finally:
                # always put the model back in its previous mode, even if
                # the compiled function raised
                self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[xo],
            outputs=all_step_costs,
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        # make a function for computing batch-based estimates of costs.
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            _all_costs = cost_func(to_fX(XO))
            _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray([k for k in _step_klds]))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray([k for k in _step_nlls]))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results

        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        Each call of the returned function applies self.joint_updates and
        returns [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost,
        obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost, self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.joint_updates,
            on_unused_input="ignore",
        )
        return func

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling belief-state/mask trajectories.

        The returned sample_func(XO, use_guide_policy=False) gives
        [xm_seq, xi_seq, mi_seq], each with total_steps + 1 entries along the
        first axis: the belief states mixed with the revealed values of XO,
        the raw belief states (in x-space), and the primary revelation masks.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + [
            self._from_si_to_x(self.si[i]) for i in range(self.total_steps)
        ]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only the scan updates are applied
        # here (as in the other diagnostic functions): using
        # self.joint_updates would make every sampling call also perform a
        # training step on the model parameters.
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # always put the model back in its previous mode
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            for i in range(step_count):
                # outputs are laid out as [states..., masks...]
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            return [xm_seq, xi_seq, mi_seq]

        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order: self.params, a
        dict with numpy copies of the values in self.shared_param_dicts, and
        a dict with the save_to_dict() output of each child network.
        """
        assert f_name is not None
        # Gather everything before touching the file, so that a failure
        # while collecting values does not leave a truncated pickle behind.
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts["p_zi_given_xi"] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts["p_sip1_given_zi"] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts["p_x_given_si"] = self.p_x_given_si.save_to_dict()
        child_model_dicts["q_zi_given_xi"] = self.q_zi_given_xi.save_to_dict()
        # open() replaces the Python-2-only file() builtin, and the with
        # block closes the handle even if one of the dumps raises
        with open(f_name, "wb") as f_handle:
            # dump the dict self.params, which just holds "simple" python
            # values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return