def compare_speed():
    """Run MRG and CURAND uniform/normal samplers side by side with profiling.

    To run this speed comparison, change into the directory of this file and
    execute::

        THEANO_FLAGS=device=gpu python -c 'import test_rng_curand; test_rng_curand.compare_speed()'

    Four Theano functions are compiled.  Each one overwrites the same shared
    vector of 100000 ``floatX`` values with fresh draws and carries its own
    ``profile`` label.  Their graphs are printed with ``debugprint`` and then
    every function is called 100 times.
    """
    n_draws = 1000 * 100
    draw_shape = (n_draws,)

    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)
    dest = theano.shared(numpy.zeros(n_draws, dtype=theano.config.floatX))

    def compile_sampler(draws, label):
        # No inputs, no outputs: the only effect is refreshing `dest`.
        return theano.function([], [], updates={dest: draws}, profile=label)

    samplers = (
        compile_sampler(mrg.uniform(draw_shape), 'mrg uniform'),
        compile_sampler(crn.uniform(draw_shape), 'crn uniform'),
        compile_sampler(mrg.normal(draw_shape), 'mrg normal'),
        compile_sampler(crn.normal(draw_shape), 'crn normal'),
    )

    # Dump the compiled graph of every sampler before running anything.
    for sampler in samplers:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(sampler)

    for round_no in range(100):
        for sampler in samplers:
            # don't time the first call, it has some startup cost
            sampler.fn.time_thunks = (round_no > 0)
            sampler()
def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
             batch_size=1000):
    """Create the shared embedding matrix and the sampling helpers.

    Parameters
    ----------
    rng : object exposing ``uniform(low=, high=, size=)``
        Only consulted when `W` is None.
    W : array-like or None
        Initial embedding values.  When None, a ``(shape[0], shape[1])``
        matrix is drawn uniformly from
        ``[-sqrt(6 / (shape[0] + shape[1])), +sqrt(6 / (shape[0] + shape[1]))]``
        and cast to ``theano.config.floatX``.
    m : float
        Stored as ``self.m``.
    n_samples : int
        Number of rows of the random mask whose argsort ``self.rfun`` returns.
    shape : sequence or None
        Required (at least two entries) when `W` is None.
    batch_size : int
        Stored as ``self.batch_size``.
    """
    if W is None:
        n_rows, n_cols = shape[0], shape[1]
        bound = numpy.sqrt(6. / (n_rows + n_cols))
        W = numpy.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_rows, n_cols)),
            dtype=theano.config.floatX)

    self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
    self.weights = [self.W]
    self.biases = []

    self.batch_size = batch_size
    self.m = m
    self.n_samples = n_samples
    self.n_ht = W.shape[0]

    # Compiled function returning the argsort of a fresh (n_samples, 1)
    # column of uniform draws every time it is called.
    self.csrng = CURAND_RandomStreams(123)
    mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0, high=1.0,
                              dtype=theano.config.floatX)
    self.rfun = theano.function([], mask.argsort(axis=0))

    # Constant vector 1/1, 1/2, ..., 1/n_ht.
    ranks = numpy.arange(start=1, stop=self.n_ht + 1, step=1)
    self.alpha = T.constant(1.0 / ranks)
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    streams = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # The whole shape is a TensorConstant holding (10, 10).
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # Only one dimension is symbolic, the other is a plain int.
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    first_draw = streams.uniform(shape)
    second_draw = streams.uniform(shape)
    sample_first = theano.function([], first_draw, mode=mode_with_gpu)
    sample_second = theano.function([], second_draw, mode=mode_with_gpu)

    first_runs = [sample_first() for _ in range(3)]
    second_runs = [sample_second() for _ in range(3)]

    # Successive calls, and the two independent graphs, must all disagree
    # element-wise.
    assert numpy.all(first_runs[0] != first_runs[1])
    assert numpy.all(second_runs[0] != second_runs[1])
    assert numpy.all(first_runs[0] != second_runs[0])

    for variates in first_runs:
        lowest = variates.min()
        highest = variates.max()
        assert variates.shape == (10, 10)
        assert lowest >= 0
        assert highest <= 1
        assert lowest < highest
        assert .25 <= variates.mean() <= .75
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    streams = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # The whole shape is a TensorConstant holding (10, 10).
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # Only one dimension is symbolic, the other is a plain int.
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    first_draw = streams.uniform(shape)
    second_draw = streams.uniform(shape)
    sample_first = theano.function([], first_draw, mode=mode_with_gpu)
    sample_second = theano.function([], second_draw, mode=mode_with_gpu)

    first_runs = [sample_first() for _ in range(3)]
    second_runs = [sample_second() for _ in range(3)]

    # Successive calls, and the two independent graphs, must all disagree
    # element-wise.
    assert numpy.all(first_runs[0] != first_runs[1])
    assert numpy.all(second_runs[0] != second_runs[1])
    assert numpy.all(first_runs[0] != second_runs[0])

    for variates in first_runs:
        lowest = variates.min()
        highest = variates.max()
        assert variates.shape == (10, 10)
        assert lowest >= 0
        assert highest <= 1
        assert lowest < highest
        assert .25 <= variates.mean() <= .75
def sampler(self, mu, log_sigma):
    """Return ``mu + (exp(0.5 * log_sigma) - 1) * eps * 0.5`` with normal ``eps``.

    ``eps`` has the same shape as `mu`.  It is drawn from a
    ``CURAND_RandomStreams`` when the configured Theano device name contains
    ``"gpu"``, and from ``T.shared_randomstreams.RandomStreams`` otherwise.

    NOTE(review): `seed` is not defined inside this function; it is expected
    to be resolvable from an enclosing or module scope -- confirm.
    """
    on_gpu = "gpu" in theano.config.device
    if not on_gpu:
        stream_cls = T.shared_randomstreams.RandomStreams
    else:
        # Imported lazily so the CUDA sandbox is only touched on GPU devices.
        from theano.sandbox.cuda.rng_curand import (
            CURAND_RandomStreams as stream_cls)

    noise = stream_cls(seed=seed).normal(mu.shape)

    # Reparametrize
    spread = T.exp(0.5 * log_sigma) - 1
    return mu + spread * noise * 5e-1
def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
             use_rand=True, apply_bn_1=True, apply_bn_2=True,
             us_stride=2, use_pooling=True,
             init_func=None, mod_name='gm_conv',
             rand_type='normal'):
    """Record the module configuration and initialize its parameters.

    Only ``filt_shape[0]`` is read from `filt_shape`; it must be odd and is
    stored as ``self.filt_dim``.  Every other argument is stored under an
    attribute of the same name.  When `init_func` is None,
    ``inits.Normal(scale=0.02)`` is used instead.  ``self._init_params()``
    is called last, after all attributes have been set.
    """
    filt_dim = filt_shape[0]
    assert ((filt_dim % 2) > 0), "filter dim should be odd (not even)"
    self.filt_dim = filt_dim

    # channel layout
    self.in_chans = in_chans
    self.out_chans = out_chans
    self.rand_chans = rand_chans

    # behaviour switches
    self.use_rand = use_rand
    self.apply_bn_1 = apply_bn_1
    self.apply_bn_2 = apply_bn_2
    self.us_stride = us_stride
    self.use_pooling = use_pooling
    self.rand_type = rand_type
    self.mod_name = mod_name

    self.rng = RandStream(123)
    self.init_func = (inits.Normal(scale=0.02) if init_func is None
                      else init_func)

    self._init_params()  # initialize parameters
def __init__(self, rand_dim, out_dim, fc_dim,
             apply_bn_1=True, apply_bn_2=True,
             init_func=None, rand_type='normal',
             final_relu=True, mod_name='dm_fc'):
    """Record the module configuration and initialize its parameters.

    Every argument is stored under an attribute of the same name.  When
    `init_func` is None, ``inits.Normal(scale=0.02)`` is used instead.
    ``self._init_params()`` is called last, after all attributes have been
    set.
    """
    # dimensions
    self.rand_dim = rand_dim
    self.out_dim = out_dim
    self.fc_dim = fc_dim

    # behaviour switches
    self.apply_bn_1 = apply_bn_1
    self.apply_bn_2 = apply_bn_2
    self.final_relu = final_relu
    self.rand_type = rand_type
    self.mod_name = mod_name

    self.rng = RandStream(123)
    self.init_func = (inits.Normal(scale=0.02) if init_func is None
                      else init_func)

    self._init_params()  # initialize parameters
def __init__(self, rng, input, in_dim, W=None, b=None, W_scale=1.0):
    """Build a one-output linear layer passed through a rescaled tanh.

    Parameters
    ----------
    rng : object exposing ``randint`` and ``normal``
        Seeds this layer's ``RandStream`` and draws the initial weights.
    input : symbolic expression fed to the layer.
    in_dim : int
        Number of rows of the ``(in_dim, 1)`` weight matrix.
    W, b : shared variables or None
        Used as given; when None, `W` is initialized to `W_scale` times
        normal draws and `b` to a single zero, both as ``floatX``.
    W_scale : float
        Multiplier applied to the freshly drawn weights only.
    """
    float_x = theano.config.floatX

    # Setup a shared random generator for this layer (drawn before the
    # weights so the consumption order of `rng` is unchanged).
    self.rng = RandStream(rng.randint(1000000))
    self.input = input
    self.in_dim = in_dim

    if W is None:
        drawn = np.asarray(rng.normal(size=(self.in_dim, 1)), dtype=float_x)
        W = theano.shared(value=(W_scale * (1.0 * drawn)))
    if b is None:
        b = theano.shared(value=np.zeros((1,), dtype=float_x))
    self.W = W
    self.b = b
    self.params = [self.W, self.b]

    # Pre-activation squashed by 20 * tanh(x / 20); no further activation
    # is applied, so it doubles as the layer output.
    pre_act = T.dot(self.input, self.W) + self.b
    self.linear_output = 20.0 * T.tanh(pre_act / 20.0)
    self.output = self.linear_output

    # Squared sum of outputs divided by the first output dimension, for
    # regularization.
    self.act_l2_sum = T.sum(self.output**2.0) / self.output.shape[0]
class Training(Layer):
    """Embedding layer with a margin-based ranking cost built on ``self.f``.

    ``output_func`` must be called before any of the cost methods, because
    they all read the score matrix ``self.f`` that it stores.
    """

    def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
                 batch_size=1000):
        """Create the shared embedding matrix and the sampling helpers.

        When `W` is None, a ``(shape[0], shape[1])`` matrix is drawn with
        ``rng.uniform`` from ``+/- sqrt(6 / (shape[0] + shape[1]))`` and cast
        to ``floatX``.  `m`, `n_samples` and `batch_size` are stored under
        attributes of the same name.
        """
        if W is None:
            n_rows, n_cols = shape[0], shape[1]
            bound = numpy.sqrt(6. / (n_rows + n_cols))
            W = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_rows, n_cols)),
                dtype=theano.config.floatX)

        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.weights = [self.W]
        self.biases = []

        self.batch_size = batch_size
        self.m = m
        self.n_samples = n_samples
        self.n_ht = W.shape[0]

        # Compiled function returning the argsort of a fresh (n_samples, 1)
        # column of uniform draws every time it is called.
        self.csrng = CURAND_RandomStreams(123)
        mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0,
                                  high=1.0, dtype=theano.config.floatX)
        self.rfun = theano.function([], mask.argsort(axis=0))

        # Constant vector 1/1, 1/2, ..., 1/n_ht.
        ranks = numpy.arange(start=1, stop=self.n_ht + 1, step=1)
        self.alpha = T.constant(1.0 / ranks)

    def __repr__(self):
        template = "{}: W_shape: {}, m={}, n_samples={}, n_ht={}"
        return template.format(self.__class__.__name__, self.W.shape.eval(),
                               self.m, self.n_samples, self.n_ht)

    def output_func(self, input):
        """Store the score matrix in ``self.f`` and return its argmax on axis 0.

        The original author labels the tensordot "cosine sim"; no
        normalization is applied in this method.
        """
        lhs = input.dimshuffle(0, 'x', 1)
        rhs = self.W.dimshuffle('x', 0, 1)
        self.f = T.tensordot(lhs, rhs, axes=[[1, 2], [0, 2]])
        self.y_pred = T.argmax(self.f, axis=0)
        return self.y_pred

    def get_tag_neg(self, f, f_y):
        """Return the mean of the first ``rnk`` entries of ``self.alpha``.

        ``rnk`` is the number of entries of `f` greater than ``f_y - m``,
        minus one.

        NOTE(review): when ``rnk`` is a symbolic Theano variable, the
        ``rnk == 0`` test below is evaluated by Python while the graph is
        being built, not per input value -- confirm that is intended.
        """
        violators = f[(f > f_y - self.m).nonzero()]
        rnk = violators.shape[0] - 1  # due to i != y
        if rnk == 0:
            return 0
        weight_sum = T.sum(self.alpha[T.arange(rnk)])
        return weight_sum / rnk

    def _warp_loss_cost(self, y, i):
        """Return ``max(0, m - f[row, y] + f[row, i])`` for every row."""
        score_true = self.f[T.arange(y.shape[0]), y]
        score_other = self.f[T.arange(i.shape[0]), i]
        return T.maximum(0.0, self.m - score_true + score_other)

    def warp_loss_cost(self, y, idx):
        """Return the per-row hinge value against an internally chosen column.

        NOTE(review): the `idx` argument is overwritten before it is read;
        the column actually used is the first entry of the per-row argsort
        of ``max(0, f_y - f + m)``.
        """
        score_true = self.f[T.arange(y.shape[0]), y]
        tiled_true = T.repeat(score_true.dimshuffle(0, 'x'), self.f.shape[1],
                              axis=1)
        gaps = T.maximum(0.0, tiled_true - self.f + self.m)
        idx = gaps.argsort(axis=1)[:, 0]
        score_other = self.f[T.arange(idx.shape[0]), idx]
        return T.maximum(0.0, self.m - score_true + score_other)

    def training_cost(self, y, i):
        """Return the mean of ``warp_loss_cost(y, i)``."""
        per_row = self.warp_loss_cost(y, i)
        return T.mean(per_row)
def __init__(self, rng, in_dim, out_dim,
             W_mean=None, b_mean=None,
             W_logvar=None, b_logvar=None,
             name="", W_scale=1.0):
    """Create (or adopt) the mean and log-variance weights and biases.

    Parameters
    ----------
    rng : object exposing ``randint``
        Only used to seed this layer's ``RandStream``; the initial weights
        themselves are drawn with the module-level ``npr.normal``.
    in_dim, out_dim : int
        Weight matrices are ``(in_dim, out_dim)``, biases ``(out_dim,)``.
    W_mean, b_mean, W_logvar, b_logvar : shared variables or None
        Used as given; any that is None is freshly initialized (weights
        from scaled normal draws, biases as zeros) in ``floatX``.
    name : str
        Prefix of the names given to freshly created shared variables.
    W_scale : float
        Scale of the normal draws; see the notes in the body for how it is
        rescaled.
    """
    float_x = theano.config.floatX
    weight_shape = (in_dim, out_dim)

    # setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    self.in_dim = in_dim
    self.out_dim = out_dim

    def zero_bias(label):
        zeros = np.zeros((self.out_dim,), dtype=float_x)
        return theano.shared(value=zeros, name=label)

    # ---- mean estimate ----
    if W_mean is None:
        if W_scale > 0.1:
            W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
        draws = W_scale * npr.normal(0.0, 1.0, weight_shape)
        W_mean = theano.shared(value=draws.astype(float_x),
                               name="{0:s}_W_mean".format(name))
    if b_mean is None:
        b_mean = zero_bias("{0:s}_b_mean".format(name))
    self.W_mean = W_mean
    self.b_mean = b_mean

    # ---- log-variance estimate ----
    if W_logvar is None:
        # NOTE(review): W_scale is divided by sqrt(in_dim) here
        # unconditionally, on top of any rescaling already done in the
        # mean branch above -- confirm the compounding is intended.
        W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
        draws = W_scale * npr.normal(0.0, 1.0, weight_shape)
        W_logvar = theano.shared(value=draws.astype(float_x),
                                 name="{0:s}_W_logvar".format(name))
    if b_logvar is None:
        b_logvar = zero_bias("{0:s}_b_logvar".format(name))
    self.W_logvar = W_logvar
    self.b_logvar = b_logvar

    # Conveniently package layer parameters
    self.mlp_params = [self.W_mean, self.b_mean,
                       self.W_logvar, self.b_logvar]
def __init__(self):
    """Create the random streams and set the three rate attributes.

    ``self.fast_rng`` is a ``CURAND_RandomStreams`` seeded with 42 when the
    configured Theano device name contains ``'gpu'``, and None otherwise.
    """
    self.rng = RandomStreams()

    on_gpu = 'gpu' in theano.config.device
    self.fast_rng = CURAND_RandomStreams(seed=42) if on_gpu else None

    self.entity_mutate_rate = 0.003
    self.bit_mutate_rate = 0.05
    self.crossover_rate = 0.7
def __init__(self, rng, clean_input=None, fuzzy_input=None,
             in_dim=0, out_dim=0, activation=None, input_noise=0.,
             W=None, b_h=None, b_v=None):
    """Set up the layer's inputs, random stream and parameters.

    Parameters
    ----------
    rng : object exposing ``randint`` and ``standard_normal``
        Seeds the layer's ``CURAND_RandomStreams`` and draws initial weights.
    clean_input : stored unchanged as ``self.clean_input``.
    fuzzy_input, input_noise : forwarded to ``self._get_noisy_input``; the
        result is stored as ``self.noisy_input``.
    in_dim, out_dim : int
        `W` is ``(in_dim, out_dim)``, `b_h` is ``(out_dim,)`` and `b_v` is
        ``(in_dim,)``.
    activation : stored as ``self.activation``.
    W, b_h, b_v : shared variables or None
        Used as given; when None, `W` is initialized to 0.01 times standard
        normal draws and the biases to zeros, all as ``floatX``.
    """
    float_x = theano.config.floatX

    # Setup a shared random generator for this layer
    self.rng = CURAND_RandomStreams(rng.randint(1000000))

    # Grab the layer input and perturb it with some sort of noise.  This
    # must stay ahead of the attribute assignments below to keep the
    # original call order.
    self.clean_input = clean_input
    self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

    self.activation = activation
    self.in_dim = in_dim
    self.out_dim = out_dim

    # Fill in whichever parameters the caller did not provide.
    if W is None:
        drawn = 0.01 * rng.standard_normal(size=(in_dim, out_dim))
        W = theano.shared(value=np.asarray(drawn, dtype=float_x), name='W')
    if b_h is None:
        b_h = theano.shared(value=np.zeros((out_dim,), dtype=float_x),
                            name='b_h')
    if b_v is None:
        b_v = theano.shared(value=np.zeros((in_dim,), dtype=float_x),
                            name='b_v')

    self.W, self.b_h, self.b_v = W, b_h, b_v

    # Put the learnable/optimizable parameters into a list
    self.params = [self.W, self.b_h, self.b_v]
def __init__(self, rand_dim, out_dim,
             apply_bn=True, init_func=None,
             rand_type='normal', final_relu=True,
             mod_name='dm_uni'):
    """Record the module configuration and initialize its parameters.

    Every argument is stored under an attribute of the same name.  When
    `init_func` is None, ``inits.Normal(scale=0.02)`` is used instead.
    ``self._init_params()`` is called last, after all attributes have been
    set.
    """
    # dimensions
    self.rand_dim = rand_dim
    self.out_dim = out_dim

    # behaviour switches
    self.apply_bn = apply_bn
    self.final_relu = final_relu
    self.rand_type = rand_type
    self.mod_name = mod_name

    self.rng = RandStream(123)
    self.init_func = (inits.Normal(scale=0.02) if init_func is None
                      else init_func)

    self._init_params()  # initialize parameters
def __init__(self, rng, layer_description, W=None, b=None,
             b_in=None, s_in=None, name="", W_scale=1.0):
    """Configure the layer from `layer_description` and create its parameters.

    Parameters
    ----------
    rng : object exposing ``randint``
        Seeds this layer's ``RandStream``.
    layer_description : dict
        Must contain ``'layer_type'`` (``'fc'`` or ``'conv'``),
        ``'in_chans'``, ``'out_chans'`` and ``'activation'``.  Optional
        keys and their defaults: ``'filt_dim'`` (None), ``'conv_stride'``
        (None), ``'apply_bn'`` (False), ``'drop_rate'`` (0.0),
        ``'shape_func_in'`` (None), ``'shape_func_out'`` (None).
    W, b, b_in, s_in : forwarded to ``_init_fc_params`` or
        ``_init_conv_params``, whose four return values become the layer
        parameters.
    name, W_scale : stored as ``self.name`` and ``self.W_scale``.
    """
    # parse options from layer_description
    assert 'layer_type' in layer_description, \
        "layer_description must provide layer_type"
    layer_type = layer_description['layer_type']
    assert layer_type in ('fc', 'conv'), \
        "layer_type must be fc or conv"
    self.layer_description = layer_description
    self.layer_type = layer_type

    for key in ('in_chans', 'out_chans', 'activation'):
        setattr(self, key, layer_description[key])

    optional = (
        ('filt_dim', None),
        ('conv_stride', None),
        ('apply_bn', False),
        ('drop_rate', 0.0),
        ('shape_func_in', None),
        ('shape_func_out', None),
    )
    for key, fallback in optional:
        setattr(self, key, layer_description.get(key, fallback))

    # setup additional params
    self.rng = RandStream(rng.randint(1000000))
    self.W_scale = W_scale
    self.name = name

    make_params = (self._init_fc_params if self.layer_type == 'fc'
                   else self._init_conv_params)
    self.W, self.b, self.b_in, self.s_in = \
        make_params(W=W, b=b, b_in=b_in, s_in=s_in)

    # Conveniently package layer parameters
    self.params = [self.W, self.b, self.b_in, self.s_in]
    self.shared_param_dicts = {
        'W': self.W,
        'b': self.b,
        'b_in': self.b_in,
        's_in': self.s_in,
    }
class DiscLayer(object):
    """One-output linear layer whose pre-activation passes through a rescaled tanh."""

    def __init__(self, rng, input, in_dim, W=None, b=None):
        """Create (or adopt) the layer parameters and build its output.

        Parameters
        ----------
        rng : object exposing ``randint`` and ``normal``
            Seeds this layer's ``RandStream`` and draws the initial weights.
        input : symbolic expression fed to the layer.
        in_dim : int
            Number of rows of the ``(in_dim, 1)`` weight matrix.
        W, b : shared variables or None
            Used as given; when None, `W` is initialized to 0.01 times
            normal draws and `b` to a single zero, both as ``floatX``.
        """
        float_x = theano.config.floatX

        # Setup a shared random generator for this layer (drawn before the
        # weights so the consumption order of `rng` is unchanged).
        self.rng = RandStream(rng.randint(1000000))
        self.input = input
        self.in_dim = in_dim

        if W is None:
            drawn = np.asarray(rng.normal(size=(self.in_dim, 1)),
                               dtype=float_x)
            W = theano.shared(value=0.01 * drawn)
        if b is None:
            b = theano.shared(value=np.zeros((1,), dtype=float_x))
        self.W = W
        self.b = b
        self.params = [self.W, self.b]

        # Pre-activation squashed by 20 * tanh(x / 20); no further
        # activation is applied, so it doubles as the layer output.
        pre_act = T.dot(self.input, self.W) + self.b
        self.linear_output = 20.0 * T.tanh(pre_act / 20.0)
        self.output = self.linear_output

        # Squared sum of outputs divided by the first output dimension,
        # for regularization.
        self.act_l2_sum = T.sum(self.output**2.0) / self.output.shape[0]

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        jitter = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                 dtype=theano.config.floatX)
        return P + jitter
def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
             batch_size=1000):
    """Create the shared embedding matrix and the sampling helpers.

    Parameters
    ----------
    rng : object exposing ``uniform(low=, high=, size=)``
        Only consulted when `W` is None.
    W : array-like or None
        Initial embedding values.  When None, a ``(shape[0], shape[1])``
        matrix is drawn uniformly from
        ``[-sqrt(6 / (shape[0] + shape[1])), +sqrt(6 / (shape[0] + shape[1]))]``
        and cast to ``theano.config.floatX``.
    m : float
        Stored as ``self.m``.
    n_samples : int
        Number of rows of the random mask whose argsort ``self.rfun`` returns.
    shape : sequence or None
        Required (at least two entries) when `W` is None.
    batch_size : int
        Stored as ``self.batch_size``.
    """
    if W is None:
        n_rows, n_cols = shape[0], shape[1]
        bound = numpy.sqrt(6. / (n_rows + n_cols))
        W = numpy.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_rows, n_cols)),
            dtype=theano.config.floatX)

    self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
    self.weights = [self.W]
    self.biases = []

    self.batch_size = batch_size
    self.m = m
    self.n_samples = n_samples
    self.n_ht = W.shape[0]

    # Compiled function returning the argsort of a fresh (n_samples, 1)
    # column of uniform draws every time it is called.
    self.csrng = CURAND_RandomStreams(123)
    mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0, high=1.0,
                              dtype=theano.config.floatX)
    self.rfun = theano.function([], mask.argsort(axis=0))

    # Constant vector 1/1, 1/2, ..., 1/n_ht.
    ranks = numpy.arange(start=1, stop=self.n_ht + 1, step=1)
    self.alpha = T.constant(1.0 / ranks)
def __init__(self, rng, clean_input=None, fuzzy_input=None,
             in_dim=0, out_dim=0, activation=None, input_noise=0.,
             W=None, b_h=None, b_v=None):
    """Set up the layer's inputs, random stream and parameters.

    Parameters
    ----------
    rng : object exposing ``randint`` and ``standard_normal``
        Seeds the layer's ``CURAND_RandomStreams`` and draws initial weights.
    clean_input : stored unchanged as ``self.clean_input``.
    fuzzy_input, input_noise : forwarded to ``self._get_noisy_input``; the
        result is stored as ``self.noisy_input``.
    in_dim, out_dim : int
        `W` is ``(in_dim, out_dim)``, `b_h` is ``(out_dim,)`` and `b_v` is
        ``(in_dim,)``.
    activation : stored as ``self.activation``.
    W, b_h, b_v : shared variables or None
        Used as given; when None, `W` is initialized to 0.01 times standard
        normal draws and the biases to zeros, all as ``floatX``.
    """
    float_x = theano.config.floatX

    # Setup a shared random generator for this layer
    self.rng = CURAND_RandomStreams(rng.randint(1000000))

    # Grab the layer input and perturb it with some sort of noise.  This
    # must stay ahead of the attribute assignments below to keep the
    # original call order.
    self.clean_input = clean_input
    self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

    self.activation = activation
    self.in_dim = in_dim
    self.out_dim = out_dim

    # Fill in whichever parameters the caller did not provide.
    if W is None:
        drawn = 0.01 * rng.standard_normal(size=(in_dim, out_dim))
        W = theano.shared(value=np.asarray(drawn, dtype=float_x), name='W')
    if b_h is None:
        b_h = theano.shared(value=np.zeros((out_dim,), dtype=float_x),
                            name='b_h')
    if b_v is None:
        b_v = theano.shared(value=np.zeros((in_dim,), dtype=float_x),
                            name='b_v')

    self.W, self.b_h, self.b_v = W, b_h, b_v

    # Put the learnable/optimizable parameters into a list
    self.params = [self.W, self.b_h, self.b_v]
def __init__(self,
             rng=None,
             Xd=None,
             params=None,
             shared_param_dicts=None):
    """Build the shared, mu and sigma layer stacks of the inference network.

    Parameters
    ----------
    rng : object exposing ``randint``
        Seeds this network's ``RandStream`` and is handed to every
        ``HiddenLayer`` that is created.
    Xd : symbolic input matrix
        Input of the first shared layer.
    params : dict
        Must contain ``'shared_config'``, ``'mu_config'`` and
        ``'sigma_config'``.  Each is a list of layer descriptions, including
        one for the input layer, so a stack has one layer fewer than its
        config has entries.  A description is either a dimension or a
        list/tuple ``(dim, pool_size)``.  Optional keys and their defaults:
        ``'build_theano_funcs'`` (True), ``'vis_drop'`` (0.0),
        ``'hid_drop'`` (0.0), ``'input_noise'`` (0.0), ``'bias_noise'``
        (0.0), ``'init_scale'`` (1.0), ``'sigma_init_scale'`` (1.0),
        ``'activation'`` (``relu_actfun``).
    shared_param_dicts : dict or None
        None creates fresh parameters.  Otherwise this network is a clone
        and takes every layer's ``W``, ``b``, ``b_in`` and ``s_in`` from the
        given dict (keys ``'shared'``, ``'mu'``, ``'sigma'``).
    """
    # Setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xd = Xd

    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    optional_settings = (
        ('build_theano_funcs', True),
        ('vis_drop', 0.0),
        ('hid_drop', 0.0),
        ('input_noise', 0.0),
        ('bias_noise', 0.0),
        ('init_scale', 1.0),
        ('sigma_init_scale', 1.0),
    )
    for key, fallback in optional_settings:
        setattr(self, key, params[key] if key in params else fallback)

    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of an inference network, with all
    # of the network parameters shared between clones.
    self.is_clone = shared_param_dicts is not None
    if self.is_clone:
        self.shared_param_dicts = shared_param_dicts
    else:
        self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}

    self.shared_config = params['shared_config']
    self.mu_config = params['mu_config']
    self.sigma_config = params['sigma_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun

    def build_stack(layers, config, group, name_fmt, next_input,
                    noisy_first=False, zero_last_scale=False):
        """Append one HiddenLayer per consecutive pair of `config` entries."""
        # Materialise the pairs: on Python 3 `zip` returns an iterator,
        # which has no len().
        layer_defs = list(zip(config[:-1], config[1:]))
        last_num = len(layer_defs) - 1
        for layer_num, (in_def, out_def) in enumerate(layer_defs):
            # A list/tuple description is (dim, pool_size); anything else
            # is a bare dimension with no pooling.
            if type(in_def) in (list, tuple):
                in_dim = in_def[0]
            else:
                in_dim = in_def
            if type(out_def) in (list, tuple):
                out_dim, pool_size = out_def[0], out_def[1]
            else:
                out_dim, pool_size = out_def, 0

            # Select the appropriate noise to add to this layer
            if noisy_first and layer_num == 0:
                d_rate, i_noise, b_noise = \
                    self.vis_drop, self.input_noise, 0.0
            else:
                d_rate, i_noise, b_noise = \
                    self.hid_drop, 0.0, self.bias_noise

            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_scale and layer_num == last_num:
                # set in-bound weights for logvar predictions to 0
                i_scale = 0.0 * i_scale

            layer_args = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=name_fmt.format(layer_num), W_scale=i_scale)
            if self.is_clone:
                # Initialize the layer with the shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                layer_args.update(
                    W=init_params['W'], b=init_params['b'],
                    b_in=init_params['b_in'], s_in=init_params['s_in'])

            new_layer = HiddenLayer(**layer_args)
            layers.append(new_layer)
            if not self.is_clone:
                # Record the new parameters so clones can reuse them
                self.shared_param_dicts[group].append(
                    new_layer.shared_param_dicts)
            next_input = new_layer.output

    #########################################
    # Initialize the shared part of network #
    #########################################
    self.shared_layers = []
    build_stack(self.shared_layers, self.shared_config, 'shared',
                "share_layer_{0:d}", self.Xd, noisy_first=True)

    #####################################
    # Initialize the mu part of network #
    #####################################
    self.mu_layers = []
    build_stack(self.mu_layers, self.mu_config, 'mu',
                "mu_layer_{0:d}", self.shared_layers[-1].output)

    ########################################
    # Initialize the sigma part of network #
    ########################################
    self.sigma_layers = []
    build_stack(self.sigma_layers, self.sigma_config, 'sigma',
                "sigma_layer_{0:d}", self.shared_layers[-1].output,
                zero_last_scale=True)

    # Create a shared parameter for rescaling posterior "sigmas" to allow
    # control over the velocity of the markov chain generated by repeated
    # cycling through the INF -> GEN loop.
    if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
        # we use a hack-ish check to remain compatible with loading models
        # that were saved before the addition of the sigma_scale param.
        zero_ary = to_fX(np.zeros((1,)))
        self.sigma_scale = theano.shared(value=zero_ary)
        new_dict = {'sigma_scale': self.sigma_scale}
        self.shared_param_dicts['sigma'].append(new_dict)
        self.set_sigma_scale(1.0)
    else:
        # this is a clone of some other InfNet, and that InfNet was made
        # after adding the sigma_scale param, so use its sigma_scale
        self.sigma_scale = \
            self.shared_param_dicts['sigma'][-1]['sigma_scale']

    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for stack in (self.shared_layers, self.mu_layers, self.sigma_layers):
        for layer in stack:
            self.mlp_params.extend(layer.params)

    # The output of this inference network is given by the noisy output
    # of the final layers of its mu and sigma networks.
    self.output_mean, self.output_logvar, self.output_samples = \
        self.apply(Xd)
    self.output = self.output_samples
    self.out_dim = self.sigma_layers[-1].out_dim

    # Construct a theano function for sampling from the approximate
    # posteriors inferred by this model for some collection of points
    # in the "data space".
    if self.build_theano_funcs:
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function([self.Xd],
                                              outputs=self.output_mean)
    else:
        self.sample_posterior = None
        self.mean_posterior = None

    ########################################################
    # CONSTRUCT FUNCTIONS FOR RICA PRETRAINING INPUT LAYER #
    ########################################################
    self.rica_func = None
    self.W_rica = self.shared_layers[0].W
def __init__(self, rng, input, in_dim, out_dim,
             activation=None, pool_size=0,
             drop_rate=0., input_noise=0., bias_noise=0.,
             W=None, b=None,
             use_bias=True, name=""):
    """Build a (possibly noisy, possibly max-pooled) hidden layer.

    Parameters
    ----------
    rng : object exposing ``randint`` and ``standard_normal``
        Seeds the layer's ``CURAND_RandomStreams`` and draws initial weights.
    input : symbolic expression fed to the layer.
    in_dim, out_dim : int
        Input width and number of outputs.  The weight matrix has
        ``out_dim`` columns when ``pool_size <= 1`` and
        ``out_dim * pool_size`` columns otherwise.
    activation : callable or None
        When falsy, ``relu_actfun`` is used for ``pool_size <= 1`` and
        ``maxout_actfun`` otherwise.
    pool_size : int
        Number of filters per pooling group; values <= 1 disable pooling.
    drop_rate : float
        When above 1e-4, the input goes through ``self._drop_from_input``.
    input_noise : float
        When above 1e-4, normal noise scaled by this value is added to the
        input.
    bias_noise : float
        Scale of the normal noise added to the pre-activation (always
        added to the graph, even when the scale is 0).
    W, b : shared variables or None
        Used as given; freshly initialized as ``floatX`` when None.
    use_bias : bool
        When False, `b` is neither added to the output nor listed in
        ``self.params``.
    name : str
        Prefix for the names of freshly created shared variables.
    """
    float_x = theano.config.floatX

    # Setup a shared random generator for this layer
    self.srng = CURAND_RandomStreams(rng.randint(1000000))

    self.clean_input = input

    # Add gaussian noise to the input (if desired)
    if input_noise > 1e-4:
        self.fuzzy_input = input + \
            (input_noise * self.srng.normal(size=input.shape,
                                            dtype=float_x))
    else:
        self.fuzzy_input = input

    # Apply masking noise to the input (if desired)
    if drop_rate > 1e-4:
        self.noisy_input = self._drop_from_input(self.fuzzy_input, drop_rate)
    else:
        self.noisy_input = self.fuzzy_input

    # Set some basic layer properties
    self.pool_size = pool_size
    self.in_dim = in_dim
    self.out_dim = out_dim
    if self.pool_size <= 1:
        self.filt_count = self.out_dim
    else:
        self.filt_count = self.out_dim * self.pool_size
    # Floor division keeps pool_count an int on Python 3 as well; a float
    # here would make range(self.pool_count) below raise TypeError.
    self.pool_count = self.filt_count // max(self.pool_size, 1)

    if activation:
        self.activation = activation
    elif self.pool_size <= 1:
        self.activation = lambda x: relu_actfun(x)
    else:
        self.activation = lambda x: \
            maxout_actfun(x, self.pool_size, self.filt_count)

    # Get some random initial weights and biases, if not given
    if W is None:
        if self.pool_size <= 1:
            # Generate random initial filters in a typical way
            W_init = np.asarray(
                0.04 * rng.standard_normal(
                    size=(self.in_dim, self.filt_count)),
                dtype=float_x)
        else:
            # Generate groups of random filters to pool over such that
            # intra-group correlations are stronger than inter-group
            # correlations, to encourage pooling over similar filters...
            filters = []
            for _ in range(self.pool_count):
                g_filt = 0.01 * rng.standard_normal(size=(self.in_dim, 1))
                for _ in range(self.pool_size):
                    f_filt = g_filt + (0.005 * rng.standard_normal(
                        size=(self.in_dim, 1)))
                    filters.append(f_filt)
            W_init = np.hstack(filters).astype(float_x)
        W = theano.shared(value=W_init, name="{0:s}_W".format(name))
    if b is None:
        b_init = np.zeros((self.filt_count,), dtype=float_x)
        b = theano.shared(value=b_init, name="{0:s}_b".format(name))

    # Set layer weights and biases
    self.W = W
    self.b = b

    # Compute linear "pre-activation" for this layer
    if use_bias:
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b
    else:
        self.linear_output = T.dot(self.noisy_input, self.W)

    # Add noise to the pre-activation features (if desired)
    self.noisy_linear = self.linear_output + \
        (bias_noise * self.srng.normal(size=self.linear_output.shape,
                                       dtype=float_x))

    # Apply activation function
    self.output = self.activation(self.noisy_linear)

    # Compute some properties of the activations, probably to regularize
    self.act_l2_sum = T.sum(self.output**2.) / self.output.size
    self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
        self.output.shape[0]
    self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
        self.output.shape[1]

    # Conveniently package layer parameters
    if use_bias:
        self.params = [self.W, self.b]
    else:
        self.params = [self.W]
class HiddenLayer(object):
    """Dense layer with optional input noise, dropout and filter pooling.

    The constructor builds the symbolic expressions and stores them as
    attributes: ``clean_input``, ``fuzzy_input``, ``noisy_input``,
    ``linear_output``, ``noisy_linear``, ``output``, the activation
    statistics ``act_l2_sum`` / ``row_l1_sum`` / ``col_l1_sum``, and the
    parameter list ``params``.
    """

    def __init__(self, rng, input, in_dim, out_dim,
                 activation=None, pool_size=0,
                 drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None,
                 use_bias=True, name=""):
        """Build the symbolic graph for this layer.

        Parameters
        ----------
        rng
            Random source. Its ``randint`` seeds the layer's CURAND stream
            and its ``standard_normal`` draws the initial weights.
            Presumably a ``numpy.random.RandomState`` -- confirm against
            callers.
        input
            Symbolic input; it is multiplied by ``W`` via
            ``T.dot(input, W)``.
        in_dim, out_dim
            Number of input features and number of output units. Created
            weights have shape ``(in_dim, filt_count)``.
        activation
            Callable applied to the noisy pre-activation. When falsy,
            ``relu_actfun`` is used for ``pool_size <= 1`` and
            ``maxout_actfun(x, pool_size, filt_count)`` otherwise.
        pool_size
            Filters per output unit. Values ``<= 1`` disable pooling,
            otherwise ``out_dim * pool_size`` filters are created.
        drop_rate
            Drop probability handed to ``_drop_from_input``; only applied
            when it exceeds ``1e-4``.
        input_noise
            Scale of the gaussian noise added to ``input``; only applied
            when it exceeds ``1e-4``.
        bias_noise
            Scale of the gaussian noise added to the linear pre-activation.
        W, b
            Optional pre-built weight / bias variables. Whichever is
            ``None`` is created as a Theano shared variable named
            ``"<name>_W"`` / ``"<name>_b"``.
        use_bias
            Whether ``b`` is added to the pre-activation and listed in
            ``self.params`` (``self.b`` is stored either way).
        name
            Prefix used for the names of the created shared variables.
        """
        # Setup a shared random generator for this layer
        self.srng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + (
                input_noise * self.srng.normal(size=input.shape,
                                               dtype=theano.config.floatX))
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division keeps pool_count an integer under true division
        # too; it is passed to range() below, which rejects floats.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = np.asarray(
                    0.04 * rng.standard_normal(
                        size=(self.in_dim, self.filt_count)),
                    dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar
                # filters... The order of the rng draws is kept as-is so
                # that a given seed still produces the same weights.
                filters = []
                for g_num in range(self.pool_count):
                    g_filt = 0.01 * rng.standard_normal(
                        size=(self.in_dim, 1))
                    for f_num in range(self.pool_size):
                        f_filt = g_filt + (0.005 * rng.standard_normal(
                            size=(self.in_dim, 1)))
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,),
                              dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        if use_bias:
            self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        else:
            self.linear_output = T.dot(self.noisy_input, self.W)

        # Add noise to the pre-activation features (if desired).
        # Unlike the input noise, this term is always part of the graph,
        # even when bias_noise is 0.
        self.noisy_linear = self.linear_output + (
            bias_noise * self.srng.normal(size=self.linear_output.shape,
                                          dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
            self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
            self.output.shape[1]

        # Conveniently package layer parameters
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]

    def _drop_from_input(self, input, p):
        """Return ``input`` with elements dropped with probability ``p``.

        Kept elements are rescaled by ``1 / (1 - p)`` so the expected value
        of the result matches ``input``.
        """
        # Build the keep-mask by thresholding a uniform [0, 1) draw from the
        # layer's random stream: an element is kept when its draw exceeds p.
        noise_rnd = self.srng.uniform(input.shape, low=0.0, high=1.0,
                                      dtype=theano.config.floatX)
        drop_mask = noise_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian.

        Returns ``P`` plus zero-mean gaussian noise with standard deviation
        ``noise_lvl`` drawn from the layer's random stream.
        """
        P_nz = P + self.srng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                    dtype=theano.config.floatX)
        return P_nz
def __init__(self, rng=None, x_out=None, \
        p_z_given_x=None, \
        p_x_given_z=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build the symbolic walk-out graph, its costs, the parameter updates,
    and compile the helper functions stored on this model.

    rng: object whose randint() seeds this model's RandStream
    x_out: symbolic variable used as the initial x state of the scan
    p_z_given_x, p_x_given_z: networks whose apply(..., do_samples=False)
                              returns a (mean, logvar) pair
    params: dict; the keys 'x_dim', 'z_dim', 'walkout_steps' and 'x_type'
            are read unconditionally, 'x_transform' is optional
    shared_param_dicts: None, or a dict providing 'obs_logvar'
    """
    # setup a rng for this WalkoutModel
    self.rng = RandStream(rng.randint(100000))

    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.walkout_steps = self.params['walkout_steps']
    self.x_type = self.params['x_type']
    self.shared_param_dicts = shared_param_dicts
    # pick the output transform: sigmoid unless 'x_transform' is 'none'
    if 'x_transform' in self.params:
        assert((self.params['x_transform'] == 'sigmoid') or \
                (self.params['x_transform'] == 'none'))
        if self.params['x_transform'] == 'sigmoid':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.x_transform = lambda x: x
    else:
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    # bernoulli outputs always use the sigmoid, overriding 'x_transform'
    if self.x_type == 'bernoulli':
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # NOTE(review): self.step_type is not assigned earlier in this
    # constructor; unless it is provided elsewhere (e.g. as a class
    # attribute) this line raises AttributeError -- confirm.
    assert ((self.step_type == 'add') or (self.step_type == 'jump'))

    # grab handles to the relevant networks
    self.p_z_given_x = p_z_given_x
    self.p_x_given_z = p_x_given_z

    # record the symbolic variables that will provide inputs to the
    # computation graph created for this WalkoutModel
    self.x_out = x_out  # target output for generation
    self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for walk-out wobble

    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        zero_ary = to_fX(np.zeros((1, )))
        self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
        # tanh squashing keeps the log-variance within (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh(
            (1.0 / 8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh(
            (1.0 / 8.0) * self.obs_logvar[0])

    ###############################################################
    # Setup the forwards (i.e. training) walk-out loop using scan #
    ###############################################################
    def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
        # get samples of next zi, according to the forwards model
        zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
        # check reverse direction probability p(xi_fw | zi_fw)
        xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_bw_mean = self.x_transform(xi_bw_mean)
        # NOTE(review): the helper's name suggests it returns a
        # log-probability, yet the result is stored under an nll_* name
        # without negation -- confirm the intended sign.
        nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                log_vars=xi_bw_logvar, mask=None)
        nll_xi_bw = nll_xi_bw.flatten()
        # get samples of next xi, according to the forwards model
        xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_fw_mean = self.x_transform(xi_fw_mean)
        xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
        # check reverse direction probability p(zi_fw | xi_fw)
        zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                log_vars=zi_bw_logvar, mask=None)
        nll_zi_bw = nll_zi_bw.flatten()
        # each loop iteration produces the following values:
        #   xi_fw: xi generated from zi by forwards walk
        #   zi_fw: zi generated from xi by forwards walk
        #   xi_fw_mean: ----
        #   xi_fw_logvar: ----
        #   zi_fw_mean: ----
        #   zi_fw_logvar: ----
        #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
        #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
        return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

    # initialize states for x/z
    self.x0 = self.x_out
    self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
    # setup initial values to pass to scan op
    # (only the first two outputs, xi_fw and zi_fw, are fed back)
    outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
    # NOTE(review): self.xi_zmuv is not assigned in this constructor (only
    # self.zi_zmuv is) -- confirm where it is expected to come from.
    sequences_init = [self.xi_zmuv, self.zi_zmuv]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
            outputs_info=outputs_init, \
            sequences=sequences_init)
    # grab results of the scan op. all values are computed for each step
    self.xi = self.scan_results[0]
    self.zi = self.scan_results[1]
    self.xi_fw_mean = self.scan_results[2]
    self.xi_fw_logvar = self.scan_results[3]
    self.zi_fw_mean = self.scan_results[4]
    self.zi_fw_logvar = self.scan_results[5]
    self.nll_xi_bw = self.scan_results[6]
    self.nll_zi_bw = self.scan_results[7]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='srr_lr')
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
    # init parameters for controlling learning dynamics
    # (the setter writes into the shared variables created just above)
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
    self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    # NOTE(review): self.s0, self.step_scales and the p_zi_given_xi /
    # p_sip1_given_zi / p_x_given_si / q_zi_given_xi networks are not
    # assigned in this constructor (it only stores p_z_given_x and
    # p_x_given_z) -- confirm.
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.p_x_given_si.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(
        p=1.0)
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
            (self.lam_kld_q[0] * self.kld_q) + \
            (self.lam_kld_g[0] * self.kld_g) + \
            (self.lam_kld_s[0] * self.kld_s)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NOTE(review): self.nlli is not assigned in this constructor -- confirm.
    self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # merge in the updates returned by scan so that compiled functions
    # using joint_updates apply them too
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: None, or a dict providing 'obs_logvar'
    """
    # NOTE(review): the methods below read several attributes that are never
    # assigned anywhere in this class body as shown: step_type, xi_zmuv, s0,
    # step_scales, nlli, si, kldi_p2q, kldi_q2p, kldi_p2g, total_steps,
    # train_switch, use_rev_masks, rev_masks_p, rev_masks_q, rev_sched,
    # p_masks, q_masks, s0_full, m0_full, mi_p, the p_zi_given_xi /
    # p_sip1_given_zi / p_x_given_si / q_zi_given_xi networks, and the
    # method _from_si_to_x. Confirm where they are expected to come from.

    def __init__(self, rng=None, x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic walk-out graph, its costs, the parameter updates,
        and compile the helper functions stored on this model.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # pick the output transform: sigmoid unless 'x_transform' is 'none'
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli outputs always use the sigmoid, overriding 'x_transform'
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned earlier in this
        # constructor -- this line raises AttributeError unless it is
        # provided elsewhere. Confirm.
        assert ((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX(np.zeros((1, )))
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # tanh squashing keeps the log-variance within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                    do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                    do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            # NOTE(review): _construct_nll_costs negates the result of this
            # helper, but it is stored here under an nll_* name without
            # negation -- confirm the intended sign.
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                    log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()
            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                    do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                    do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                    log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()
            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op
        # (only the first two outputs, xi_fw and zi_fw, are fed back)
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned in this constructor
        # (only self.zi_zmuv is) -- confirm.
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                outputs_info=outputs_init, \
                sequences=sequences_init)
        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        # (the setter writes into the shared variables created just above)
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): none of the attributes read on the next five lines
        # (other than obs_logvar) are assigned in this constructor -- confirm.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(
            p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                (self.lam_kld_q[0] * self.kld_q) + \
                (self.lam_kld_g[0] * self.kld_g) + \
                (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge in the updates returned by scan so that compiled functions
        # using joint_updates apply them too
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.

        Each value is written into its shared variable as a 1-element
        array converted with to_fX.
        """
        zero_ary = np.zeros((1, ))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.

        The four values are the weights applied to kld_p, kld_q, kld_g and
        kld_s when __init__ forms self.kld_costs.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_s
        self.lam_kld_s.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1, ))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        switch_val is snapped to 0.0 (if below 0.5) or 1.0 before it is
        stored in self.train_switch.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1, ))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Construct the necessary ZMUV gaussian samples for generating
        trajectories from this WalkoutModel, for input matrix xo.

        The samples have shape (self.total_steps, xo.shape[0], self.z_dim).
        """
        zi_zmuv = self.rng.normal( \
                size=(self.total_steps, xo.shape[0], self.z_dim), \
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_rev_masks(self, xo):
        """
        Compute the sequential revelation masks for the input batch in xo.
        -- We need to construct mask sequences for both p and q.

        Returns the list [pmasks, qmasks].
        """
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            pmasks = self.rev_masks_p.dimshuffle(0, 'x', 1).repeat(xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0, 'x', 1).repeat(xo.shape[0], axis=1)
        else:
            pm_list = []
            qm_list = []
            # make a zero mask that does nothing
            zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
            # generate independently sampled masks for each revelation block
            # (rb[0] is used as the block's step count, rb[1] as its rate)
            for rb in self.rev_sched:
                # make a random binary mask with ones at rate rb[1]
                rand_vals = self.rng.uniform( \
                        size=(1, xo.shape[0], xo.shape[1]), \
                        low=0.0, high=1.0, dtype=theano.config.floatX)
                rand_mask = rand_vals < rb[1]
                # append the masks for this revelation block to the mask lists
                #
                # the guide policy (in q) gets to peek at the values that will be
                # revealed to the primary policy (in p) for the entire block. The
                # primary policy only gets to see these values at end of the final
                # step of the block. Within a given step, values are revealed to q
                # at the beginning of the step, and to p at the end.
                #
                # e.g. in a revelation block with only a single step, the guide
                # policy sees the values at the beginning of the step, which allows
                # it to guide the step. the primary policy only gets to see the
                # values at the end of the step.
                #
                # i.e. a standard variational auto-encoder is equivalent to a
                # sequential revelation and refinement model with only one
                # revelation block, which has one step and a reveal rate of 1.0.
                #
                for refine_step in range(rb[0] - 1):
                    pm_list.append(zero_mask)
                    qm_list.append(rand_mask)
                pm_list.append(rand_mask)
                qm_list.append(rand_mask)
            # concatenate each mask list into a 3-tensor
            pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX')
            qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Construct the negative log-likelihood part of free energy.
        -- only check NLL where nll_mask == 1
        """
        xh = self._from_si_to_x(si)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask)
        else:
            ll_costs = log_prob_gaussian2(xo, xh, \
                    log_vars=self.bounded_logvar, mask=nll_mask)
        # negate the log-likelihood to get a cost to minimize
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) -- assuming bernoullish outputs
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        # elementwise bernoulli KL, then summed over axis 1
        kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + \
                ((1.0 - x_i) * (T.log(1.0-x_i) - T.log(1.0-x_j)))
        sum_kld = T.sum(kld_s, axis=1)
        return sum_kld

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KLd term is raised to the power p before summing.
        Returns the list [kld_pi, kld_qi, kld_gi, kld_si].
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        kld_sis = []
        # adding 0.0 * self.si[0] presumably broadcasts self.s0 to the shape
        # of a per-step state -- TODO confirm
        s0 = 0.0 * self.si[0] + self.s0
        for i in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
            # compare each state against its predecessor (s0 for step 0)
            if i == 0:
                kld_sis.append(self._construct_kld_s(self.si[i], s0))
            else:
                kld_sis.append(
                    self._construct_kld_s(self.si[i], self.si[i - 1]))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        kld_si = sum(kld_sis)
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.

        As written, this is the sum of squared entries over every variable
        in self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns fe_term_estimator(XO, sample_count=20, use_guide_policy=True),
        which averages sample_count one-sample estimates and returns
        [mean_nll, mean_kld].
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xo ], \
                outputs=[nll, kld], \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XO.shape[0], ))
            kld_sum = np.zeros((XO.shape[0], ))
            for i in range(sample_count):
                result = fe_term_sample(XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.

        Returns raw_cost_computer(XO), which returns
        [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g].
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        all_step_costs = [
            self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g
        ]
        cost_func = theano.function(inputs=[ xo ], \
                outputs=all_step_costs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')
        # make a function for computing batch-based estimates of costs.
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            _all_costs = cost_func(to_fX(XO))
            _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray([k for k in _step_klds]))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray([k for k in _step_nlls]))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes a matrix xo, applies self.joint_updates
        and returns [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost,
        obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost, \
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(inputs=[ xo ], \
                outputs=outputs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.joint_updates, \
                on_unused_input='ignore')
        return func

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask sequences.

        Returns sample_func(XO, use_guide_policy=False), which returns
        [xm_seq, xi_seq, mi_seq], each with shape
        (self.total_steps + 1, XO.shape[0], XO.shape[1]).
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full ] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function
        # NOTE(review): this passes self.joint_updates (built in __init__
        # from get_adam_updates plus the scan updates), whereas the other
        # non-training functions in this class pass self.scan_updates.
        # Confirm that applying the training updates while sampling is
        # intended.
        func = theano.function(inputs=[ xo ], \
                outputs=outputs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.joint_updates, \
                on_unused_input='ignore')
        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            # get belief states and masks generated by the scan loop
            scan_vals = func(to_fX(XO))
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            # scan_vals holds the step_count states followed by the
            # step_count masks (outputs = states + masks above)
            for i in range(step_count):
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # take XO where the mask is 1 and the model state elsewhere
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            # set model back to either training or generation mode
            self.set_train_switch(switch_val=old_switch)
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name in order: self.params, a dict of
        numpy copies of self.shared_param_dicts, and a dict of the child
        models' save_to_dict() results.
        """
        assert (not (f_name is None))
        # NOTE(review): file() and cPickle exist only on Python 2, and the
        # handle is not closed if one of the dumps below raises.
        f_handle = file(f_name, 'wb')
        # dump the dict self.params, which just holds "simple" python values
        cPickle.dump(self.params, f_handle, protocol=-1)
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # dump the numpy version of self.shared_param_dicts to pickle file
        cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts[
            'p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        # dump the numpy child model dicts to the pickle file
        cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        f_handle.close()
        return
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a 4D theano tensor laid out
    as (batch_size, chan_count, im_dim_1, im_dim_2), i.e. "bc01".

    filt_def should be a 4-tuple like
    (filt_count, in_chans, filt_def_1, filt_def_2).

    pool_def should be a 2-tuple like (pool_dim, pool_stride).
    """

    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0., bias_noise=0.,
                 W=None, b=None, name="", W_scale=1.0):
        """
        Build the symbolic graph for one conv --> max-pool layer.

        Parameters
        ----------
        rng : numpy-style random generator; used to seed the layer's
            CURAND stream and to draw the initial filter weights.
        input : symbolic 4D tensor in bc01 layout.
        filt_def : 4-tuple describing the filter bank (see class docstring).
        pool_def : (pool_dim, pool_stride) passed to MaxPool.
        activation : callable applied to the biased, pooled output; defaults
            to relu_actfun when falsy.
        drop_rate : input dropout probability (ignored when <= 1e-4).
        input_noise : std of Gaussian noise added to the input
            (ignored when <= 1e-4).
        bias_noise : std of Gaussian noise added to the convolution output
            (ignored when <= 1e-4).
        W, b : optional pre-existing parameters to use instead of creating
            new ones. Presumably theano shared variables, as they are
            dimshuffled directly -- TODO confirm against callers.
        name : prefix for the names of newly created shared variables.
        W_scale : multiplier applied to freshly initialized filters only.
        """
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        self.activation = activation if activation else relu_actfun

        # Use the caller's filters when given; otherwise draw small random
        # ones. (Previously W was accepted but silently ignored.)
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        # The bias is a 1D tensor -- one bias per output feature map.
        # (Previously b was accepted but silently ignored.)
        if b is None:
            b_init = np.zeros((filt_def[0],),
                              dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.W = W
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Optionally perturb the pre-pooling activations
        if bias_noise > 1e-4:
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + \
            self.b.dimshuffle('x', 0, 'x', 'x')
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_h_given_z=None,
             p_x_given_h=None,
             q_z_given_x=None,
             q_h_given_z_x=None,
             x_dim=None,
             z_dim=None,
             h_dim=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the symbolic graph, costs, updates and compiled functions for a
    two-stage (z -> h -> x) model.

    Parameters
    ----------
    rng : generator exposing randint(); only used to seed this model's
        RandStream.
    x_in : symbolic input fed to q_z_given_x.
    x_out : symbolic target; fed to q_h_given_z_x and the NLL cost.
    p_h_given_z, p_x_given_h, q_z_given_x, q_h_given_z_x : networks exposing
        apply(...) and an mlp_params list.
    x_dim, z_dim, h_dim : dimensions recorded on the model; z_dim also sizes
        the prior mean/logvar parameters.
    params : dict; must contain 'x_type' ('bernoulli' or 'gaussian') and may
        contain 'obs_transform' ('sigmoid' or 'none').
    shared_param_dicts : None to create fresh 'p_z_mean', 'p_z_logvar' and
        'obs_logvar' shared variables, or a dict already holding them.
    """
    def _squash(x):
        return T.nnet.sigmoid(x)

    def _passthrough(x):
        return x

    # private random stream for this model
    self.rng = RandStream(rng.randint(100000))

    # user-provided configuration
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')
    if 'obs_transform' in self.params:
        obs_kind = self.params['obs_transform']
        assert obs_kind in ('sigmoid', 'none')
    else:
        obs_kind = 'sigmoid'
    # bernoulli observations always go through the sigmoid
    if (obs_kind == 'sigmoid') or (self.x_type == 'bernoulli'):
        self.obs_transform = _squash
    else:
        self.obs_transform = _passthrough
    self.shared_param_dicts = shared_param_dicts

    # dimensions of the spaces this model works with
    self.x_dim = x_dim
    self.z_dim = z_dim
    self.h_dim = h_dim

    # handles to the underlying networks
    self.q_z_given_x = q_z_given_x
    self.q_h_given_z_x = q_h_given_z_x
    self.p_h_given_z = p_h_given_z
    self.p_x_given_h = p_x_given_h

    # symbolic variables that feed the computation graph
    self.x_in = x_in
    self.x_out = x_out

    # switch for moving between training (1.0) and sampling (0.0)
    zeros_1 = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=zeros_1,
                                      name='tsm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is not None:
        # reuse parameters handed in by the caller
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
    else:
        # create fresh "optimizable" parameters owned by this model
        prior_init = to_fX(np.zeros((1, self.z_dim)))
        self.p_z_mean = theano.shared(value=prior_init,
                                      name='tsm_p_z_mean')
        self.p_z_logvar = theano.shared(value=prior_init,
                                        name='tsm_p_z_logvar')
        self.obs_logvar = theano.shared(value=zeros_1,
                                        name='tsm_obs_logvar')
        self.shared_param_dicts = {
            'p_z_mean': self.p_z_mean,
            'p_z_logvar': self.p_z_logvar,
            'obs_logvar': self.obs_logvar,
        }
    # tanh keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

    ##############################################
    # Setup the TwoStageModels main computation. #
    ##############################################
    print("Building TSM...")
    use_q = self.train_switch[0]
    # first-stage latent state z, from both the posterior q and the prior p
    z_q_mean, z_q_logvar, z_q = self.q_z_given_x.apply(self.x_in,
                                                       do_samples=True)
    row_count = z_q.shape[0]
    z_p_mean = self.p_z_mean.repeat(row_count, axis=0)
    z_p_logvar = self.p_z_logvar.repeat(row_count, axis=0)
    zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0,
                           dtype=theano.config.floatX)
    z_p = (T.exp(0.5 * z_p_logvar) * zmuv) + z_p_mean
    self.z = (use_q * z_q) + ((1.0 - use_q) * z_p)
    # KLds between q and p for z, in both directions
    self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                  z_p_mean, z_p_logvar)
    self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                  z_q_mean, z_q_logvar)
    # second-stage latent state h, from both p and q
    h_p_mean, h_p_logvar, h_p = self.p_h_given_z.apply(self.z)
    q_h_input = T.horizontal_stack(h_p_mean, h_p_logvar, self.x_out)
    h_q_mean, h_q_logvar, h_q = self.q_h_given_z_x.apply(q_h_input)
    self.h = (use_q * h_q) + ((1.0 - use_q) * h_p)
    # KLds between q and p for h, in both directions
    self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                  h_p_mean, h_p_logvar)
    self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                  h_q_mean, h_q_logvar)
    # p_x_given_h generates an observation x conditioned on h
    self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    zeros_1 = to_fX(np.zeros((1,)))
    # learning rate and momentum terms, then their default values
    self.lr = theano.shared(value=zeros_1, name='tsm_lr')
    self.mom_1 = theano.shared(value=zeros_1, name='tsm_mom_1')
    self.mom_2 = theano.shared(value=zeros_1, name='tsm_mom_2')
    self.set_sgd_params()
    # weight on the NLL of data given a posterior sample
    self.lam_nll = theano.shared(value=zeros_1, name='tsm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KLd directions
    self.lam_kld_q2p = theano.shared(value=zeros_1,
                                     name='tsm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zeros_1,
                                     name='tsm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    # weight on l2 regularization of the parameters
    self.lam_l2w = theano.shared(value=zeros_1, name='tsm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # parameters owned directly by this model; the prior mean/logvar are
    # deliberately left out of the optimized set
    own_params = [self.obs_logvar]
    # parameters owned by the underlying networks
    net_params = []
    for net in (self.q_z_given_x, self.q_h_given_z_x,
                self.p_h_given_z, self.p_x_given_h):
        net_params.extend(net.mlp_params)
    self.joint_params = own_params + net_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    w_q2p = self.lam_kld_q2p[0]
    w_p2q = self.lam_kld_p2q[0]
    self.kld_z = (w_q2p * self.kld_z_q2p) + (w_p2q * self.kld_z_p2q)
    self.kld_h = (w_q2p * self.kld_h_q2p) + (w_p2q * self.kld_h_p2q)
    self.kld_costs = T.sum(self.kld_z, axis=1) + T.sum(self.kld_h, axis=1)
    # "mean" (rather than per-input) cost
    self.kld_cost = T.mean(self.kld_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    self.nll_costs = self._construct_nll_costs(self.x_out)
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # gradient of the joint cost w.r.t. every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # updates for the generator and inferencer networks
    all_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
    self.joint_updates = OrderedDict(
        (key, all_updates[key]) for key in all_updates)

    # compile the functions exposed by this model
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
def __init__(self,
             rng=None,
             Xp=None,
             prior_sigma=None,
             params=None,
             shared_param_dicts=None):
    """
    Build a generator MLP that maps latent inputs Xp to outputs.

    Parameters
    ----------
    rng : generator exposing randint(); seeds this network's RandStream and
        is forwarded to every HiddenLayer.
    Xp : symbolic input matrix fed to the first layer.
    prior_sigma : stored on the instance; not otherwise used here.
    params : dict of settings. Required keys: 'lam_l2a', 'mlp_config'.
        Optional keys: 'vis_drop', 'hid_drop', 'bias_noise', 'init_scale',
        'out_type' ('bernoulli' or 'gaussian'), 'activation'.
    shared_param_dicts : None to create new parameters, or the
        shared_param_dicts list of an existing network to build a clone
        that shares all of its parameters.
    """
    # First, setup a shared random number generator for this layer
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xp = Xp
    self.prior_sigma = prior_sigma

    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    assert params is not None
    self.params = params
    lam_l2a = self.params['lam_l2a']
    # Drop rate on the latent variables
    self.vis_drop = params['vis_drop'] if 'vis_drop' in params else 0.0
    # Drop rate on hidden layer activations
    self.hid_drop = params['hid_drop'] if 'hid_drop' in params else 0.0
    # Noise sigma for hidden layer biases
    self.bias_noise = \
        params['bias_noise'] if 'bias_noise' in params else 0.0
    self.init_scale = \
        params['init_scale'] if 'init_scale' in params else 1.0
    if 'out_type' in params:
        # check which type of output distribution to generate
        self.out_type = params['out_type']
        assert self.out_type in ('bernoulli', 'gaussian')
    else:
        # default to bernoulli-valued outputs
        self.out_type = 'bernoulli'

    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of a generative network, with all
    # of the network parameters shared between clones.
    self.is_clone = shared_param_dicts is not None
    if self.is_clone:
        # layer parameters are looked up in the given list of dicts
        self.shared_param_dicts = shared_param_dicts
    else:
        # not a clone: collect one dict per layer as layers are created
        self.shared_param_dicts = []

    # Get the configuration/prototype for this network. The config is a
    # list of layer descriptions, including a description for the input
    # layer, which is typically just the dimension of the inputs. So, the
    # depth of the mlp is one less than the number of layer configs.
    self.mlp_config = params['mlp_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun
    self.mlp_depth = len(self.mlp_config) - 1
    self.latent_dim = self.mlp_config[0]
    self.data_dim = self.mlp_config[-1]

    ##########################
    # Initialize the network #
    ##########################
    self.mlp_layers = []
    self.logvar_layer = None
    # Materialize the pairs: zip() is a lazy iterator on Python 3, so
    # calling len() on it directly raises TypeError.
    layer_def_pairs = list(zip(self.mlp_config[:-1], self.mlp_config[1:]))
    last_layer_num = len(layer_def_pairs) - 1
    next_input = self.Xp
    for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
        first_layer = (layer_num == 0)
        last_layer = (layer_num == last_layer_num)
        needs_logvar = last_layer and (self.out_type == 'gaussian')
        l_name = "gn_layer_{0:d}".format(layer_num)
        # A list/tuple layer description means (dim, pool_size)
        if isinstance(in_def, (list, tuple)):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if isinstance(out_def, (list, tuple)):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0
        # Select the appropriate noise to add to this layer
        d_rate = self.vis_drop if first_layer else self.hid_drop
        b_noise = self.bias_noise
        # Settings shared by every HiddenLayer built for this stage
        layer_kwargs = dict(
            rng=rng, input=next_input,
            activation=self.activation, pool_size=pool_size,
            drop_rate=d_rate, input_noise=0., bias_noise=b_noise,
            in_dim=in_dim, out_dim=out_dim,
            W_scale=self.init_scale)
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(name=l_name, **layer_kwargs)
            self.mlp_layers.append(new_layer)
            self.shared_param_dicts.append(
                {'W': new_layer.W, 'b': new_layer.b})
            if needs_logvar:
                # add an extra layer/transform for encoding log-variance
                lv_layer = HiddenLayer(name=l_name + '_logvar',
                                       **layer_kwargs)
                self.logvar_layer = lv_layer
                self.mlp_layers.append(lv_layer)
                self.shared_param_dicts.append(
                    {'W': lv_layer.W, 'b': lv_layer.b})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts[layer_num]
            self.mlp_layers.append(HiddenLayer(
                W=init_params['W'], b=init_params['b'],
                name=l_name, **layer_kwargs))
            if needs_logvar:
                # the extra layer's params sit right after the last layer's
                init_params = self.shared_param_dicts[layer_num + 1]
                self.mlp_layers.append(HiddenLayer(
                    W=init_params['W'], b=init_params['b'],
                    name=l_name, **layer_kwargs))
        next_input = self.mlp_layers[-1].output

    # construct a mask for deciding which output dimensions to keep/ignore
    if self.is_clone:
        self.output_mask = self.shared_param_dicts[-1]['output_mask']
        self.output_bias = self.shared_param_dicts[-1]['output_bias']
    else:
        row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
        self.output_mask = theano.shared(value=row_mask,
                                         name='gn_output_mask')
        row_mask = 0.0 * row_mask
        self.output_bias = theano.shared(value=row_mask,
                                         name='gn_output_bias')
        op_dict = {'output_mask': self.output_mask,
                   'output_bias': self.output_bias}
        self.shared_param_dicts.append(op_dict)

    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for layer in self.mlp_layers:
        self.mlp_params.extend(layer.params)
    # add the output bias vector to the param list
    self.mlp_params.append(self.output_bias)

    # The output of this generator network is given by the noisy output
    # of its final layer. We will keep a running estimate of the mean and
    # covariance of the distribution induced by combining this network's
    # latent noise source with its deep non-linear transform. These will
    # be used to encourage the induced distribution to match the first and
    # second-order moments of the distribution we are trying to match.
    if self.out_type == 'bernoulli':
        self.output = (T.nnet.sigmoid(self.mlp_layers[-1].linear_output +
                                      self.output_bias) *
                       self.output_mask)
        self.output_mu = self.output
        self.output_logvar = self.output
        self.output_sigma = self.output
    else:
        # NOTE(review): mlp_layers[-1] is the layer appended last (the
        # one named "..._logvar" when not a clone), yet it supplies the
        # mean here while mlp_layers[-2] supplies the log-variance. The
        # two layers are built identically, so this looks like a naming
        # swap only -- confirm before relying on logvar_layer.
        self.output_mu = self.mlp_layers[-1].linear_output + \
            self.output_bias
        self.output_logvar = self.mlp_layers[-2].linear_output
        self.output_sigma = T.sqrt(T.exp(self.output_logvar))
        self.output = self._construct_post_samples() * self.output_mask
    self.out_dim = self.mlp_layers[-1].out_dim
    C_init = np.zeros((self.out_dim, self.out_dim)).astype(
        theano.config.floatX)
    m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
    self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
    self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
    # Get simple regularization penalty to moderate activation dynamics
    self.act_reg_cost = lam_l2a * self._act_reg_cost()
    # Construct a sampler for drawing independent samples from this model's
    # isotropic Gaussian prior, and a sampler for the model distribution.
    self.sample_from_prior = self._construct_prior_sampler()
    self.sample_from_model = self._construct_model_sampler()
    # Construct a function for passing points from the latent/prior space
    # through the transform induced by the current model parameters.
    self.transform_prior = self._construct_transform_prior()
def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
             activation=None, drop_rate=0., input_noise=0., bias_noise=0.,
             W=None, b=None, name="", W_scale=1.0):
    """
    Build the symbolic graph for one convolution --> max-pooling layer.

    Parameters
    ----------
    rng : numpy-style random generator; used to seed the layer's CURAND
        stream and to draw the initial filter weights.
    input : symbolic 4D tensor in bc01 layout.
    filt_def : shape of the filter bank; filt_def[0] is the number of
        output feature maps (one bias each).
    pool_def : (pool_dim, pool_stride) passed to MaxPool.
    activation : callable applied to the biased, pooled output; defaults
        to relu_actfun when falsy.
    drop_rate : input dropout probability (ignored when <= 1e-4).
    input_noise : std of Gaussian noise added to the input
        (ignored when <= 1e-4).
    bias_noise : std of Gaussian noise added to the convolution output
        (ignored when <= 1e-4).
    W, b : optional pre-existing parameters to use instead of creating new
        ones. Presumably theano shared variables, as they are dimshuffled
        directly -- TODO confirm against callers.
    name : prefix for the names of newly created shared variables.
    W_scale : multiplier applied to freshly initialized filters only.
    """
    # Setup a shared random generator for this layer
    self.rng = CURAND_RandomStreams(rng.randint(1000000))

    self.clean_input = input

    # Add gaussian noise to the input (if desired)
    if input_noise > 1e-4:
        self.fuzzy_input = input + self.rng.normal(
            size=input.shape, avg=0.0, std=input_noise,
            dtype=theano.config.floatX)
    else:
        self.fuzzy_input = input

    # Apply masking noise to the input (if desired)
    if drop_rate > 1e-4:
        self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                 drop_rate)
    else:
        self.noisy_input = self.fuzzy_input

    # Set the activation function for the conv filters
    self.activation = activation if activation else relu_actfun

    # Use the caller's filters when given; otherwise draw small random
    # ones. (Previously W was accepted but silently ignored.)
    if W is None:
        W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                   dtype=theano.config.floatX)
        W = theano.shared(value=(W_scale * W_init),
                          name="{0:s}_W".format(name))
    # The bias is a 1D tensor -- one bias per output feature map.
    # (Previously b was accepted but silently ignored.)
    if b is None:
        b_init = np.zeros((filt_def[0],), dtype=theano.config.floatX) + 0.1
        b = theano.shared(value=b_init, name="{0:s}_b".format(name))
    self.W = W
    self.b = b

    # convolve input feature maps with filters
    input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    conv_op = FilterActs(stride=1, partial_sum=1)
    contig_input = gpu_contiguous(input_c01b)
    contig_filters = gpu_contiguous(filters_c01b)
    conv_out_c01b = conv_op(contig_input, contig_filters)

    # Optionally perturb the pre-pooling activations
    if bias_noise > 1e-4:
        noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
            size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
            dtype=theano.config.floatX)
    else:
        noisy_conv_out_c01b = conv_out_c01b

    # downsample each feature map individually, using maxpooling
    pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
    mp_out_c01b = pool_op(noisy_conv_out_c01b)
    mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
    # thus be broadcasted across mini-batches and feature map
    # width & height
    self.noisy_linear_output = mp_out_bc01 + \
        self.b.dimshuffle('x', 0, 'x', 'x')
    self.linear_output = self.noisy_linear_output
    self.output = self.activation(self.noisy_linear_output)

    # store parameters of this layer
    self.params = [self.W, self.b]
def __init__(self, rng=None,
             x_in=None, x_out=None,
             p_s0_given_z=None,
             p_hi_given_si=None,
             p_sip1_given_si_hi=None,
             q_z_given_x=None,
             q_hi_given_x_si=None,
             obs_dim=None,
             z_dim=None, h_dim=None,
             ir_steps=4, params=None,
             shared_param_dicts=None):
    """
    Build the symbolic graph, costs, updates and compiled functions for a
    multi-stage model that refines its state with a scan loop.

    Parameters
    ----------
    rng : generator exposing randint(); only used to seed this model's
        RandStream.
    x_in : symbolic input; (dropout-masked) input to q_z_given_x.
    x_out : symbolic target used by the likelihood terms.
    p_s0_given_z, p_hi_given_si, p_sip1_given_si_hi, q_z_given_x,
    q_hi_given_x_si : networks exposing apply(...) and mlp_params.
    obs_dim, z_dim, h_dim : dimensions recorded on the model; z_dim also
        sizes the prior mean/logvar parameters.
    ir_steps : stored on the instance; the scan length itself follows the
        hi_zmuv sequence.
    params : dict; must contain 'x_type' ('bernoulli' or 'gaussian') and may
        contain 'obs_transform' ('sigmoid' or 'none').
    shared_param_dicts : None to create fresh 'p_z_mean', 'p_z_logvar' and
        'obs_logvar' shared variables, or a dict already holding them.
    """
    def _soft_squash(x):
        # sigmoid of a tanh-limited input
        return T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))

    def _passthrough(x):
        return x

    # private random stream for this model
    self.rng = RandStream(rng.randint(100000))

    # user-provided configuration
    self.params = params
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')
    if 'obs_transform' in self.params:
        obs_kind = self.params['obs_transform']
        assert obs_kind in ('sigmoid', 'none')
    else:
        obs_kind = 'sigmoid'
    # bernoulli observations always go through the squashing transform
    if (obs_kind == 'sigmoid') or (self.x_type == 'bernoulli'):
        self.obs_transform = _soft_squash
    else:
        self.obs_transform = _passthrough
    self.shared_param_dicts = shared_param_dicts

    # dimensions of the spaces this model works with
    self.obs_dim = obs_dim
    self.z_dim = z_dim
    self.h_dim = h_dim
    self.ir_steps = ir_steps

    # handles to the underlying networks
    self.q_z_given_x = q_z_given_x
    self.q_hi_given_x_si = q_hi_given_x_si
    self.p_s0_given_z = p_s0_given_z
    self.p_hi_given_si = p_hi_given_si
    self.p_sip1_given_si_hi = p_sip1_given_si_hi

    # symbolic variables that feed the computation graph
    self.x_in = x_in
    self.x_out = x_out
    self.hi_zmuv = T.tensor3()  # for ZMUV Gaussian samples to use in scan

    zeros_1 = to_fX(np.zeros((1,)))
    # switch for moving between training (1.0) and sampling (0.0)
    self.train_switch = theano.shared(value=zeros_1,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)
    # variable for controlling dropout noise
    self.drop_rate = theano.shared(value=zeros_1, name='msm_drop_rate')
    self.set_drop_rate(0.0)
    # this weight balances l1 vs. l2 penalty on posterior KLds
    self.lam_kld_l1l2 = theano.shared(value=zeros_1,
                                      name='msm_lam_kld_l1l2')
    self.set_lam_kld_l1l2(1.0)

    if self.shared_param_dicts is not None:
        # reuse parameters handed in by the caller
        self.p_z_mean = self.shared_param_dicts['p_z_mean']
        self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
    else:
        # create fresh "optimizable" parameters owned by this model
        init_vec = to_fX(np.zeros((self.z_dim,)))
        self.p_z_mean = theano.shared(value=init_vec,
                                      name='msm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec,
                                        name='msm_p_z_logvar')
        # NOTE(review): this obs_dim-sized vector is never used; the
        # log-variance below is initialized from the 1-element array.
        init_vec = to_fX(np.zeros((self.obs_dim,)))
        self.obs_logvar = theano.shared(value=zeros_1,
                                        name='msm_obs_logvar')
        self.shared_param_dicts = {
            'p_z_mean': self.p_z_mean,
            'p_z_logvar': self.p_z_logvar,
            'obs_logvar': self.obs_logvar,
        }
    # tanh keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

    # function returning the negated log-probability of xo under xh
    if self.x_type == 'bernoulli':
        def _neg_log_prob(xo, xh):
            return (-1.0 * log_prob_bernoulli(xo, xh))
    else:
        def _neg_log_prob(xo, xh):
            return (-1.0 * log_prob_gaussian2(
                xo, xh, log_vars=self.bounded_logvar))
    self.log_prob_func = _neg_log_prob

    # rescaled mask that drops inputs with probability self.drop_rate
    drop_scale = 1. / (1. - self.drop_rate[0])
    drop_rnd = self.rng.uniform(size=self.x_out.shape,
                                low=0.0, high=1.0,
                                dtype=theano.config.floatX)
    drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

    #############################
    # Setup self.z and self.s0. #
    #############################
    print("Building MSM step 0...")
    drop_x = drop_mask * self.x_in
    self.q_z_mean, self.q_z_logvar, self.z = \
        self.q_z_given_x.apply(drop_x, do_samples=True)
    # get initial observation state
    self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)
    # gather KLd and NLL for the initialization step
    self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                                  self.p_z_mean, self.p_z_logvar)
    # NOTE(review): log_prob_func already negates the log-probability, so
    # this second negation yields a log-likelihood despite the name --
    # confirm whether that is intended.
    s0_obs = self.obs_transform(self.s0)
    self.init_nlls = -1.0 * self.log_prob_func(self.x_out, s0_obs)

    ##################################################
    # Setup the iterative generation loop using scan #
    ##################################################
    def ir_step_func(hi_zmuv, sim1):
        # one refinement step: state sim1 -> si
        sim1_obs = self.obs_transform(sim1)  # transform state -> obs
        grad_ll = self.x_out - sim1_obs
        # prior over the next hi, conditioned on the current state
        hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply(
            sim1_obs, do_samples=False)
        # variational posterior over hi, conditioned on state and target
        hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply(
            T.horizontal_stack(grad_ll, sim1_obs), do_samples=False)
        # reparameterized samples sharing the same ZMUV noise
        hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
        hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean
        # make hi samples that can be switched between hi_p and hi_q
        use_q = self.train_switch[0]
        hi = (use_q * hi_q) + ((1.0 - use_q) * hi_p)
        # gate and input values for an LSTM-style state update
        ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
        i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
        f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
        si = (in_vals * i_gate) + (sim1 * f_gate)
        # compute generator NLL for this step
        nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar,
                                hi_p_mean, hi_p_logvar)
        kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar,
                                hi_q_mean, hi_q_logvar)
        return si, nlli, kldi_q2p, kldi_p2q

    # only the state is recurrent; the other outputs are just collected
    init_values = [self.s0, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(
        ir_step_func, outputs_info=init_values, sequences=self.hi_zmuv)
    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    zeros_1 = to_fX(np.zeros((1,)))
    # learning rates and momentum terms, then their default values
    self.lr_1 = theano.shared(value=zeros_1, name='msm_lr_1')
    self.lr_2 = theano.shared(value=zeros_1, name='msm_lr_2')
    self.mom_1 = theano.shared(value=zeros_1, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zeros_1, name='msm_mom_2')
    self.set_sgd_params()
    # weight on the NLL of data given a posterior sample
    self.lam_nll = theano.shared(value=zeros_1, name='msm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the KLd terms
    self.lam_kld_z = theano.shared(value=zeros_1, name='msm_lam_kld_z')
    self.lam_kld_q2p = theano.shared(value=zeros_1,
                                     name='msm_lam_kld_q2p')
    self.lam_kld_p2q = theano.shared(value=zeros_1,
                                     name='msm_lam_kld_p2q')
    self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
    # weight on l2 regularization of the parameters
    self.lam_l2w = theano.shared(value=zeros_1, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # "group 1": parameters of the inference (q) networks
    self.q_params = []
    for q_net in (self.q_z_given_x, self.q_hi_given_x_si):
        self.q_params.extend(q_net.mlp_params)
    # "group 2": prior parameters plus the generator (p) networks
    self.p_params = [self.p_z_mean, self.p_z_logvar]
    for p_net in (self.p_hi_given_si, self.p_sip1_given_si_hi,
                  self.p_s0_given_z):
        self.p_params.extend(p_net.mlp_params)
    # joint list of parameters from groups 1 and 2
    self.joint_params = self.q_params + self.p_params

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    w_q2p = self.lam_kld_q2p[0]
    w_p2q = self.lam_kld_p2q[0]
    w_z = self.lam_kld_z[0]
    self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
        self._construct_kld_costs(p=1.0)
    self.kld_z = (w_q2p * self.kld_z_q2p) + (w_p2q * self.kld_z_p2q)
    self.kld_hi = (w_q2p * self.kld_hi_q2p) + (w_p2q * self.kld_hi_p2q)
    self.kld_costs = (w_z * self.kld_z) + self.kld_hi
    # now do l2 KLd costs
    self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
        self._construct_kld_costs(p=2.0)
    self.kl2_z = (w_q2p * self.kl2_z_q2p) + (w_p2q * self.kl2_z_p2q)
    self.kl2_hi = (w_q2p * self.kl2_hi_q2p) + (w_p2q * self.kl2_hi_p2q)
    self.kl2_costs = (w_z * self.kl2_z) + self.kl2_hi
    # compute joint l1/l2 KLd cost
    w_l1 = self.lam_kld_l1l2[0]
    self.kld_l1l2_costs = (w_l1 * self.kld_costs) + \
        ((1.0 - w_l1) * self.kl2_costs)
    # compute "mean" (rather than per-input) costs
    self.kld_cost = T.mean(self.kld_costs)
    self.kl2_cost = T.mean(self.kl2_costs)
    self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)

    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the final refinement step contributes to the NLL
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)

    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost

    ##############################
    # CONSTRUCT A PER-INPUT COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_l1l2_costs

    # gradient of the joint cost w.r.t. every optimizable parameter
    print("Computing gradients of self.joint_cost...")
    grad_list = T.grad(self.joint_cost, self.joint_params)
    self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

    # separate Adam updates for the q and p groups (own learning rates)
    self.q_updates = get_adam_updates(
        params=self.q_params, grads=self.joint_grads, alpha=self.lr_1,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    self.p_updates = get_adam_updates(
        params=self.p_params, grads=self.joint_grads, alpha=self.lr_2,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # merge q, p and scan updates (the latter seem to be required)
    self.joint_updates = OrderedDict()
    for update_group in (self.q_updates, self.p_updates,
                         self.scan_updates):
        for key in update_group:
            self.joint_updates[key] = update_group[key]

    # compile the functions exposed by this model
    print("Compiling cost computer...")
    self.compute_raw_klds = self._construct_raw_klds()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling open-loop model sampler...")
    self.sample_from_prior = self._construct_sample_from_prior()
    print("Compiling data-guided model sampler...")
    self.sample_from_input = self._construct_sample_from_input()
class GenFCModule(object):
    """
    Generator module: random values -> fully connected layer -> linear map.

    The hidden layer is always passed through a relu; the output layer is
    passed through a relu only when final_relu is set. Either stage can
    optionally be batch normalized.

    Parameters:
        rand_dim: number of columns in the random input matrix
        out_dim: number of columns in this module's output
        fc_dim: number of columns in the hidden fully connected layer
        apply_bn_1: whether to batch normalize the hidden layer
        apply_bn_2: whether to batch normalize the output layer
        init_func: callable invoked as init_func(shape, name) to create each
                   weight matrix; defaults to inits.Normal(scale=0.02)
        rand_type: 'normal' draws from rng.normal(avg=0, std=1); any other
                   value draws from rng.uniform(low=-1, high=1)
        final_relu: whether to apply a relu to the output layer
        mod_name: prefix used when naming this module's parameters
    """

    def __init__(self, rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc'):
        # layer sizes
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        # behavioural switches
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream used when apply() has to sample its own inputs
        self.rng = RandStream(123)
        # fall back on a small-variance Gaussian initializer
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()
        return

    def _init_params(self):
        """
        Create the weights (and, when batch norm is enabled, the gains and
        biases) of both layers and collect them in self.params.
        """
        def tagged(suffix):
            # parameter names look like "<mod_name>_<suffix>"
            return "{}_{}".format(self.mod_name, suffix)

        self.w1 = self.init_func((self.rand_dim, self.fc_dim), tagged("w1"))
        self.w2 = self.init_func((self.fc_dim, self.out_dim), tagged("w2"))
        self.params = [self.w1, self.w2]
        # gains/biases only exist for the transforms that get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.fc_dim), tagged("g1"))
            self.b1 = make_bias((self.fc_dim), tagged("b1"))
            self.params += [self.g1, self.b1]
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain((self.out_dim), tagged("g2"))
            self.b2 = make_bias((self.out_dim), tagged("b2"))
            self.params += [self.g2, self.b2]
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run this module and return its output.

        Pass _either_ batch_size (random inputs are then sampled from
        self.rng) _or_ rand_vals (used as the random inputs after being
        reshaped to (rand_vals.shape[0], rand_dim)). When both are given,
        rand_vals wins. Raises AssertionError when neither is given.
        """
        assert (batch_size is not None) or (rand_vals is not None), \
            "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape, avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape, low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # random values -> hidden fc layer (relu is always applied here)
        hidden = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # hidden fc layer -> output (relu is optional here)
        output = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            output = batchnorm(output, g=self.g2, b=self.b2)
        if self.final_relu:
            output = relu(output)
        return output
class DAELayer(object):
    """
    One denoising autoencoder layer whose decoder reuses the transposed
    encoder weights (A_v = A_h . W^T + b_v).

    Parameters:
        rng: object providing randint() and standard_normal(); used to seed
             this layer's random stream and to draw the initial weights
        clean_input: symbolic input that the reconstruction is compared to
        fuzzy_input: symbolic input that gets masked and then encoded
        in_dim: dimension of the visible layer
        out_dim: dimension of the hidden layer
        activation: callable applied to the hidden pre-activations
        input_noise: probability of dropping each element of fuzzy_input
        W, b_h, b_v: optional pre-built shared parameters
        W_scale: multiplier applied to the freshly drawn initial weights
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None, W_scale=1.0):
        # Each layer owns a random stream seeded from the caller's rng.
        self.rng = RandStream(rng.randint(1000000))
        # Keep the clean target and build the corrupted encoder input.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)
        # Basic layer properties.
        self.activation = activation
        self.in_dim, self.out_dim = in_dim, out_dim
        # Create whichever parameters the caller did not supply.
        if W is None:
            # NOTE(review): dtype is handed to DCG(...), not np.asarray(...)
            # -- kept exactly as written; confirm that this is intended.
            gauss = DCG(rng.standard_normal(size=(in_dim, out_dim)),
                        dtype=theano.config.floatX)
            W_init = np.asarray(1.0 * gauss)
            W = theano.shared(value=(W_scale * W_init), name='W')
        if b_h is None:
            hid_zeros = np.zeros((out_dim,), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_zeros, name='b_h')
        if b_v is None:
            vis_zeros = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_zeros, name='b_v')
        # Expose the parameters individually and as one optimizable list.
        self.W, self.b_h, self.b_v = W, b_h, b_v
        self.params = [self.W, self.b_h, self.b_v]
        return

    def compute_costs(self, lam_l1=None):
        """
        Return [recon_cost, sparse_cost] for this layer.

        recon_cost is the squared reconstruction error summed over all
        entries and divided by the number of rows of clean_input.
        sparse_cost is lam_l1[0] times the sum of the row-normalized and
        column-normalized L1 activation penalties.
        """
        # Only the weights are perturbed; the hidden bias is used as is.
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        vis_acts, hid_acts = self._compute_activations(
            self.noisy_input, noisy_W, hid_bias, self.b_v)
        # Reconstruction error, averaged over rows.
        sq_err = (self.clean_input - vis_acts)**2.0
        recon_cost = T.sum(sq_err) / self.clean_input.shape[0]
        # Sparsity penalty (over both population and lifetime).
        row_penalty = T.sum(abs(row_normalize(hid_acts))) / hid_acts.shape[0]
        col_penalty = T.sum(abs(col_normalize(hid_acts))) / hid_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_penalty + col_penalty)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Return the encoder (hidden layer) activations for input X."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Return [A_v, A_h]: decoder (visible) and encoder activations."""
        hid_acts = self._compute_hidden_acts(X, W, b_h)
        vis_acts = T.dot(hid_acts, W.T) + b_v
        return [vis_acts, hid_acts]

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus Gaussian noise with std noise_lvl, or P itself when
        noise_lvl does not exceed 1e-3.
        """
        if noise_lvl > 1e-3:
            return P + DCG(self.rng.normal(size=P.shape, avg=0.0,
                                           std=noise_lvl,
                                           dtype=theano.config.floatX))
        return P

    def _get_noisy_input(self, input, p):
        """Mask input; p is the probability of dropping each element."""
        uniform_draw = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                        dtype=theano.config.floatX)
        # An element survives when its uniform draw exceeds p.
        keep_mask = uniform_draw > p
        # The mask goes through DCG before the multiply (presumably a cast
        # that keeps the computation on the GPU -- confirm against DCG).
        return input * DCG(keep_mask)
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None, \
        p_zi_given_xi=None, \
        p_sip1_given_zi=None, \
        q_zi_given_xi=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build the symbolic imputation loop, its costs, its parameter updates
    and the compiled helper functions for this model.

    Parameters:
        rng: object providing randint(); used to seed this model's
             RandStream
        x_in: symbolic input; combined with x_mask to form self.x0
        x_mask: symbolic mask; entries multiplied by x_mask keep the given
                value, entries multiplied by (1 - x_mask) take the model's
                current guess
        x_out: symbolic target passed to the NLL cost and to the guide
               policy
        p_zi_given_xi: network whose apply() returns (mean, logvar) of the
                       next latent step given the masked input
        p_sip1_given_zi: network whose apply() returns an indexable result;
                         entry 0 is the state step, entries 1 and 2 are
                         used as gates for the 'lstm'/'layer' step types
        q_zi_given_xi: network whose apply() returns (mean, logvar) given
                       the masked and unmasked inputs concatenated on axis 1
        params: dict with keys 'x_dim', 'z_dim', 'imp_steps', 'step_type'
                ('jump', 'add', 'lstm' or 'layer') and 'x_type'
                ('bernoulli' or 'gaussian')
        shared_param_dicts: None to create fresh shared parameters, or a
                            dict with keys 'x_null', 'grad_null', 's0' and
                            'obs_logvar' to reuse existing ones
    """
    # set up a random stream for this model, seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.imp_steps = self.params['imp_steps']
    self.step_type = self.params['step_type']
    self.x_type = self.params['x_type']
    assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    self.shared_param_dicts = shared_param_dicts
    # grab handles to the relevant InfNets
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.q_zi_given_xi = q_zi_given_xi
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this model
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    # zi_zmuv is the sequence that scan iterates over below; it scales
    # exp(0.5 * logvar) when sampling zi (presumably zero-mean,
    # unit-variance noise -- confirm against the callers that feed it)
    self.zi_zmuv = T.tensor3()
    # setup switching variable for changing between sampling/training:
    # 1.0 selects the guide (q) samples, 0.0 selects the policy (p) samples
    zero_ary = to_fX(np.zeros((1, )))
    self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
    self.set_train_switch(1.0)
    if self.shared_param_dicts is None:
        # initialize parameters "owned" by this model
        s0_init = to_fX(np.zeros((self.x_dim, )))
        init_ary = to_fX(np.zeros((self.x_dim, )))
        self.x_null = theano.shared(value=init_ary, name='gpis_xn')
        self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
        self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
        self.obs_logvar = theano.shared(value=zero_ary, name='gpsi_obs_logvar')
        # tanh squashing keeps the observation log-variance in (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0])
        # publish the fresh parameters so that other instances can share them
        self.shared_param_dicts = {}
        self.shared_param_dicts['x_null'] = self.x_null
        self.shared_param_dicts['grad_null'] = self.grad_null
        self.shared_param_dicts['s0'] = self.s0
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.x_null = self.shared_param_dicts['x_null']
        self.grad_null = self.shared_param_dicts['grad_null']
        self.s0 = self.shared_param_dicts['s0']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # same (-8, 8) squashing as in the fresh-parameter branch
        self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0])
    ##################################################
    # Setup the iterative imputation loop using scan #
    ##################################################
    self.ones_mask = T.ones_like(self.x_mask)
    def imp_step_func(zi_zmuv, si):
        # One imputation step: zi_zmuv is this step's slice of the noise
        # sequence and si is the state returned by the previous step.
        si_as_x = self._si_as_x(si)
        xi_unmasked = self.x_out
        # keep x_out where x_mask is set, use the current guess elsewhere
        xi_masked = (self.x_mask * xi_unmasked) + \
                    ((1.0 - self.x_mask) * si_as_x)
        # NOTE: grad_unmasked and grad_masked are computed here but are not
        # referenced again in this function
        grad_unmasked = self.x_out - si_as_x
        grad_masked = self.x_mask * grad_unmasked
        # get samples of next zi, according to the global policy
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
        # get samples of next zi, according to the guide policy
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
        # make zi samples that can be switched between zi_p and zi_q
        zi = ((self.train_switch[0] * zi_q) + \
              ((1.0 - self.train_switch[0]) * zi_p))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean,
                                zi_p_logvar)  # KL(q || p)
        kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean,
                                zi_q_logvar)  # KL(p || q)
        kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0,
                                0.0)  # KL(p || global prior)
        # compute the next si, given the sampled zi
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if (self.step_type == 'jump'):
            # jump steps always completely overwrite the current guesses
            sip1 = si_step
        elif (self.step_type == 'add'):
            # add steps just update the guesses additively
            sip1 = si + si_step
        elif (self.step_type == 'lstm'):
            # LSTM-style updates with write and erase gates
            write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
            erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        elif (self.step_type == 'layer'):
            # convex combination of the old state and the proposed step
            alpha_gate = T.nnet.sigmoid(hydra_out[1])
            sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
        else:
            assert False, "Unknown step type!"
        # compute NLL for the current imputation
        nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
        return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
    # apply scan op for the sequential imputation loop
    # s0_full adds s0 to a zero matrix with one row per row of x_in
    self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
    # only the first output (sip1) has an initial value, so it is the only
    # one passed back into imp_step_func (as si) on the next step
    init_vals = [self.s0_full, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(imp_step_func, \
            outputs_info=init_vals, sequences=self.zi_zmuv)
    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]
    self.kldi_p2g = self.scan_results[4]
    # get the initial imputation state
    self.x0 = (self.x_mask * self.x_in) + \
              ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))
    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################
    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1, )))
    self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
    self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)
    # Grab all of the "optimizable" parameters: the parameters owned by
    # this model plus the mlp_params of the three networks
    self.joint_params = [self.s0, self.obs_logvar]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)
    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
    # weighted sum of the three KLd terms; its mean is the scalar KLd cost
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                     (self.lam_kld_q[0] * self.kld_q) + \
                     (self.lam_kld_g[0] * self.kld_g)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL of the final imputation step enters the cost
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    # unweighted NLL plus the unweighted kld_q term
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs
    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]
    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # merge in the updates returned by scan so they are applied as well
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v
    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
class DAELayer(object):
    """
    One denoising autoencoder layer whose decoder reuses the transposed
    encoder weights (A_v = A_h . W^T + b_v).

    Parameters:
        rng: object providing randint() and standard_normal(); used to seed
             this layer's random stream and to draw the initial weights
        clean_input: symbolic input that the reconstruction is compared to
        fuzzy_input: symbolic input that gets masked and then encoded
        in_dim: dimension of the visible layer
        out_dim: dimension of the hidden layer
        activation: callable applied to the hidden pre-activations
        input_noise: probability of dropping each element of fuzzy_input
        W, b_h, b_v: optional pre-built shared parameters
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None):
        # Each layer owns a CURAND stream seeded from the caller's rng.
        self.rng = CURAND_RandomStreams(rng.randint(1000000))
        # Keep the clean target and build the corrupted encoder input.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)
        # Basic layer properties.
        self.activation = activation
        self.in_dim, self.out_dim = in_dim, out_dim
        # Create whichever parameters the caller did not supply.
        if W is None:
            gauss = rng.standard_normal(size=(in_dim, out_dim))
            W_init = np.asarray(0.01 * gauss, dtype=theano.config.floatX)
            W = theano.shared(value=W_init, name='W')
        if b_h is None:
            hid_zeros = np.zeros((out_dim, ), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_zeros, name='b_h')
        if b_v is None:
            vis_zeros = np.zeros((in_dim, ), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_zeros, name='b_v')
        # Expose the parameters individually and as one optimizable list.
        self.W, self.b_h, self.b_v = W, b_h, b_v
        self.params = [self.W, self.b_h, self.b_v]
        return

    def compute_costs(self, lam_l1=None):
        """
        Return [recon_cost, sparse_cost] for this layer.

        recon_cost is the squared reconstruction error summed over all
        entries and divided by the number of rows of clean_input.
        sparse_cost is lam_l1[0] times the sum of the row-normalized and
        column-normalized L1 activation penalties.
        """
        # Only the weights are perturbed; the hidden bias is used as is.
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        vis_acts, hid_acts = self._compute_activations(
            self.noisy_input, noisy_W, hid_bias, self.b_v)
        # Reconstruction error, averaged over rows.
        sq_err = (self.clean_input - vis_acts)**2.0
        recon_cost = T.sum(sq_err) / self.clean_input.shape[0]
        # Sparsity penalty (over both population and lifetime).
        row_penalty = T.sum(abs(row_normalize(hid_acts))) / hid_acts.shape[0]
        col_penalty = T.sum(abs(col_normalize(hid_acts))) / hid_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_penalty + col_penalty)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Return the encoder (hidden layer) activations for input X."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Return [A_v, A_h]: decoder (visible) and encoder activations."""
        hid_acts = self._compute_hidden_acts(X, W, b_h)
        vis_acts = T.dot(hid_acts, W.T) + b_v
        return [vis_acts, hid_acts]

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus Gaussian noise with std noise_lvl, or P itself when
        noise_lvl does not exceed 1e-3.
        """
        if noise_lvl > 1e-3:
            return P + self.rng.normal(size=P.shape, avg=0.0,
                                       std=noise_lvl,
                                       dtype=theano.config.floatX)
        return P

    def _get_noisy_input(self, input, p):
        """Mask input; p is the probability of dropping each element."""
        uniform_draw = self.rng.uniform(input.shape, low=0.0, high=1.0,
                                        dtype=theano.config.floatX)
        # An element survives when its uniform draw exceeds p; the
        # comparison result multiplies the input directly, with no
        # explicit cast.
        keep_mask = uniform_draw > p
        return input * keep_mask
class GenNet(object):
    """
    A net that transforms a simple distribution so that it matches some
    more complicated distribution, for some definition of match....

    Parameters:
        rng: a numpy.random RandomState object
        Xp: symbolic matrix for inputting latent variable samples
        prior_sigma: standard deviation of isotropic Gaussian prior that
                     this generator will transform to match some other
                     distribution
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on the latent variable space
            hid_drop: drop rate to use on the hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults
                   0.0/0.0
            bias_noise: standard dev for noise on the biases of hidden
                        layers (optional, defaults to 0.0)
            mlp_config: list of "layer descriptions"; each entry is either
                        a dimension or a (dimension, pool_size) list/tuple
            out_type: set this to "bernoulli" for generating outputs to
                      match bernoulli-valued observations and set it to
                      "gaussian" to match general real-valued observations
                      (optional, defaults to "bernoulli")
            activation: "function handle" for the desired non-linearity
                        (optional, defaults to relu_actfun)
            init_scale: scaling factor for hidden layer weights
                        (optional, defaults to 1.0)
        shared_param_dicts: parameters for the MLP controlled by this
                            GenNet; pass None to create new parameters
    """

    def __init__(self,
                 rng=None,
                 Xp=None,
                 prior_sigma=None,
                 params=None,
                 shared_param_dicts=None):
        # First, setup a shared random number generator for this layer
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xp = Xp
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        assert params is not None
        self.params = params
        lam_l2a = self.params['lam_l2a']
        # Drop rates / noise levels are optional and default to "off"
        self.vis_drop = self.params.get('vis_drop', 0.0)
        self.hid_drop = self.params.get('hid_drop', 0.0)
        self.bias_noise = self.params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        # default to bernoulli-valued outputs
        self.out_type = params.get('out_type', 'bernoulli')
        assert self.out_type in ('bernoulli', 'gaussian')
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of a generative network, with
        # all of the network parameters shared between clones.
        self.is_clone = shared_param_dicts is not None
        if self.is_clone:
            # Layer parameters are looked up in the given list of dicts.
            self.shared_param_dicts = shared_param_dicts
        else:
            # Not a clone: collect one param dict per layer as we go.
            self.shared_param_dicts = []
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So,
        # the depth of the mlp is one less than the number of layer configs.
        self.mlp_config = params['mlp_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        self.mlp_depth = len(self.mlp_config) - 1
        self.latent_dim = self.mlp_config[0]
        self.data_dim = self.mlp_config[-1]
        ##########################
        # Initialize the network #
        ##########################
        self.mlp_layers = []
        self.logvar_layer = None
        # Materialize the pairs: on Python 3 zip() returns an iterator, and
        # len() of an iterator raises TypeError.
        layer_def_pairs = list(zip(self.mlp_config[:-1],
                                   self.mlp_config[1:]))
        last_layer_num = len(layer_def_pairs) - 1
        next_input = self.Xp
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_layer_num)
            l_name = "gn_layer_{0:d}".format(layer_num)
            # A list/tuple description means (dim, pool_size); a bare value
            # is just the dimension.
            if isinstance(in_def, (list, tuple)):
                in_dim = in_def[0]
            else:
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                out_dim, pool_size = out_def[0], out_def[1]
            else:
                out_dim, pool_size = out_def, 0
            # Select the appropriate noise to add to this layer
            d_rate = self.vis_drop if first_layer else self.hid_drop
            # Arguments shared by every HiddenLayer built for this config
            # entry (the extra gaussian layer reads the same input).
            layer_args = dict(rng=rng, input=next_input,
                              activation=self.activation,
                              pool_size=pool_size, drop_rate=d_rate,
                              input_noise=0., bias_noise=self.bias_noise,
                              in_dim=in_dim, out_dim=out_dim,
                              W_scale=self.init_scale)
            needs_extra_layer = last_layer and (self.out_type == 'gaussian')
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(name=l_name, **layer_args)
                self.mlp_layers.append(new_layer)
                self.shared_param_dicts.append(
                    {'W': new_layer.W, 'b': new_layer.b})
                if needs_extra_layer:
                    # add an extra layer/transform for encoding log-variance
                    lv_layer = HiddenLayer(name=l_name + '_logvar',
                                           **layer_args)
                    self.logvar_layer = lv_layer
                    self.mlp_layers.append(lv_layer)
                    self.shared_param_dicts.append(
                        {'W': lv_layer.W, 'b': lv_layer.b})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts[layer_num]
                self.mlp_layers.append(HiddenLayer(
                    W=init_params['W'], b=init_params['b'],
                    name=l_name, **layer_args))
                if needs_extra_layer:
                    # NOTE(review): unlike the non-clone branch, the extra
                    # layer keeps the plain layer name and self.logvar_layer
                    # stays None for clones -- confirm this is intended.
                    init_params = self.shared_param_dicts[layer_num + 1]
                    self.mlp_layers.append(HiddenLayer(
                        W=init_params['W'], b=init_params['b'],
                        name=l_name, **layer_args))
            next_input = self.mlp_layers[-1].output
        # construct a mask for deciding which output dimensions to
        # keep/ignore; clones reuse the mask/bias stored in the last dict
        if self.is_clone:
            self.output_mask = self.shared_param_dicts[-1]['output_mask']
            self.output_bias = self.shared_param_dicts[-1]['output_bias']
        else:
            row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
            self.output_mask = theano.shared(value=row_mask,
                                             name='gn_output_mask')
            row_mask = 0.0 * row_mask
            self.output_bias = theano.shared(value=row_mask,
                                             name='gn_output_bias')
            op_dict = {'output_mask': self.output_mask,
                       'output_bias': self.output_bias}
            self.shared_param_dicts.append(op_dict)
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.mlp_layers:
            self.mlp_params.extend(layer.params)
        # add the output bias vector to the param list
        self.mlp_params.append(self.output_bias)
        # The output of this generator network is given by the noisy output
        # of its final layer. We will keep a running estimate of the mean
        # and covariance of the distribution induced by combining this
        # network's latent noise source with its deep non-linear transform.
        if self.out_type == 'bernoulli':
            self.output = (T.nnet.sigmoid(
                self.mlp_layers[-1].linear_output + self.output_bias) *
                self.output_mask)
            # mu/logvar/sigma all alias the output for bernoulli outputs
            self.output_mu = self.output
            self.output_logvar = self.output
            self.output_sigma = self.output
        else:
            # the last two layers provide the mean and the log-variance
            self.output_mu = (self.mlp_layers[-1].linear_output +
                              self.output_bias)
            self.output_logvar = self.mlp_layers[-2].linear_output
            self.output_sigma = T.sqrt(T.exp(self.output_logvar))
            self.output = self._construct_post_samples() * self.output_mask
        self.out_dim = self.mlp_layers[-1].out_dim
        C_init = np.zeros((self.out_dim, self.out_dim)).astype(
            theano.config.floatX)
        m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
        self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
        self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = lam_l2a * self._act_reg_cost()
        # Construct a sampler for drawing independent samples from this
        # model's isotropic Gaussian prior, and a sampler for the model
        # distribution.
        self.sample_from_prior = self._construct_prior_sampler()
        self.sample_from_model = self._construct_model_sampler()
        # Construct a function for passing points from the latent/prior
        # space through the transform induced by the current model
        # parameters.
        self.transform_prior = self._construct_transform_prior()
        return

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in this network.

        Returns the sum of act_l2_sum over all layers in self.mlp_layers.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self.mlp_layers]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mu and self.output_sigma.
        """
        unit_noise = self.rng.normal(size=self.output_sigma.shape, avg=0.0,
                                     std=1.0, dtype=theano.config.floatX)
        post_samples = self.output_mu + (self.output_sigma * unit_noise)
        return post_samples

    def _construct_prior_sampler(self):
        """
        Compile a function that draws samp_count independent samples from
        this model's isotropic Gaussian prior.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        prior_sampler = theano.function([samp_count], outputs=prior_samples)
        return prior_sampler

    def _construct_model_sampler(self):
        """
        Compile a function that draws samp_count independent samples from
        this model's distribution, by feeding prior samples in for self.Xp.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        model_sampler = theano.function([samp_count], outputs=self.output,
                                        givens={self.Xp: prior_samples})
        return model_sampler

    def _construct_transform_prior(self):
        """
        Compile a function that applies the transform induced by the current
        model parameters to some set of points in the latent/prior space.
        """
        feedforward = theano.function([self.Xp], outputs=self.output)
        return feedforward

    def _batch_moments(self):
        """
        Return [mu, sigma] for the current sample outputs.

        mu is the mean over rows (kept as a row); sigma is the product of
        the centered outputs with themselves and is not divided by the
        number of rows.
        """
        mu = T.mean(self.output, axis=0, keepdims=True)
        sigma = T.dot((self.output.T - mu.T), (self.output - mu))
        return [mu, sigma]

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all layers except the last one in
        self.mlp_layers to the constant b_init.
        """
        for layer in self.mlp_layers[:-1]:
            # 0.0 * b keeps the shape/dtype of the existing bias vector
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            layer.b.set_value(b_vec)
        return

    def init_moments(self, X_noise):
        """
        Initialize the running mean and covariance estimates from the
        outputs produced for the latent noise matrix X_noise.
        """
        X_noise_sym = T.matrix()
        out_func = theano.function(inputs=[X_noise_sym],
                                   outputs=[self.output],
                                   givens={self.Xp: X_noise_sym})
        # Compute outputs for the input latent noise matrix
        X_out = out_func(X_noise.astype(theano.config.floatX))[0]
        # Compute mean and covariance of the outputs
        mu = np.mean(X_out, axis=0)
        X_out_minus_mu = X_out - mu
        sigma = np.dot(X_out_minus_mu.T, X_out_minus_mu) / X_out.shape[0]
        # Initialize the network's running estimates
        self.dist_cov.set_value(sigma.astype(theano.config.floatX))
        self.dist_mean.set_value(mu.astype(theano.config.floatX))
        return

    def set_output_mask(self, output_mask):
        """
        Set a (probably) binary mask on the output dimensions.

        output_mask must contain exactly self.data_dim entries; otherwise an
        AssertionError is raised.
        """
        assert(output_mask.size == self.data_dim)
        output_mask = output_mask.reshape((self.data_dim,))
        self.output_mask.set_value(output_mask.astype(theano.config.floatX))
        return

    def compute_log_prob(self, Xd=None):
        """
        Compute the log-prob cost of the data in Xd, with respect to the
        output distributions currently at self.output_...., for all entries
        in Xd (subject to self.output_mask).

        NOTE(review): the previous docstring called this a negative log
        likelihood; the sign convention is set by log_prob_bernoulli /
        log_prob_gaussian2 -- confirm there.
        """
        if (self.out_type == 'bernoulli'):
            log_prob_cost = log_prob_bernoulli(Xd, self.output,
                                               mask=self.output_mask)
        else:
            log_prob_cost = log_prob_gaussian2(
                Xd, self.output_mu,
                les_logvars=self.output_logvar, mask=self.output_mask)
        return log_prob_cost

    def masked_log_prob(self, Xc=None, Xm=None):
        """
        Compute the log-prob cost of the data in Xc, with respect to the
        output distributions currently at self.output_.... Entries of Xc
        are selected based on the mask Xm: when Xm[i] == 1, Xc[i] is not
        measured.
        """
        # to measure the cost for Xc[i] only when Xm[i] is 0, we need to
        # make an inverse mask Xm_inv = 1 - Xm, because the masking in the
        # log pdf functions measures only observations where the mask != 0.
        Xm_inv = 1.0 - Xm
        if (self.out_type == 'bernoulli'):
            log_prob_cost = log_prob_bernoulli(Xc, self.output, mask=Xm_inv)
        else:
            log_prob_cost = log_prob_gaussian2(
                Xc, self.output_mu,
                les_logvars=self.output_logvar, mask=Xm_inv)
        return log_prob_cost

    def shared_param_clone(self, rng=None, Xp=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = GenNet(rng=rng, Xp=Xp,
                           prior_sigma=self.prior_sigma, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net
def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None,
             p_zi_given_xi=None,
             p_sip1_given_zi=None,
             q_zi_given_xi=None,
             params=None,
             shared_param_dicts=None):
    """
    Build the symbolic imputation loop, its costs, and the compiled functions.

    Parameters:
        rng: object providing randint(); only used to seed the RandStream
        x_in: symbolic input; supplies the batch size (x_in.shape[0]) and
              the observed part of the initial imputation self.x0
        x_mask: symbolic mask; where it is 1 the value from x_out is fed
                to the policies, elsewhere the current guess is used
        x_out: symbolic target compared against each imputation step
        p_zi_given_xi: network whose apply() returns (mean, logvar) of zi
        p_sip1_given_zi: network whose apply() returns a sequence; entry 0
                         is the state step, entries 1/2 are gate inputs
        q_zi_given_xi: guide network; apply() returns (mean, logvar) of zi
        params: dict with keys 'x_dim', 'z_dim', 'imp_steps', 'step_type'
                ("jump", "add", "lstm" or "layer") and 'x_type'
                ("bernoulli" or "gaussian")
        shared_param_dicts: None to create fresh shared variables, or a
                dict holding 'x_null', 'grad_null', 's0' and 'obs_logvar'
                to reuse existing ones

    Raises:
        ValueError: if params['step_type'] is not a supported step type
                    (raised from the step function when theano.scan
                    invokes it).
    """
    # setup a rng for this GIPair
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.imp_steps = self.params['imp_steps']
    self.step_type = self.params['step_type']
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    self.shared_param_dicts = shared_param_dicts
    # grab handles to the relevant InfNets
    self.p_zi_given_xi = p_zi_given_xi
    self.p_sip1_given_zi = p_sip1_given_zi
    self.q_zi_given_xi = q_zi_given_xi
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this MultiStageModel
    self.x_in = x_in
    self.x_out = x_out
    self.x_mask = x_mask
    self.zi_zmuv = T.tensor3()
    # setup switching variable for changing between sampling/training
    zero_ary = to_fX(np.zeros((1,)))
    self.train_switch = theano.shared(value=zero_ary,
                                      name='msm_train_switch')
    self.set_train_switch(1.0)

    if self.shared_param_dicts is None:
        # initialize parameters "owned" by this model
        s0_init = to_fX(np.zeros((self.x_dim,)))
        init_ary = to_fX(np.zeros((self.x_dim,)))
        self.x_null = theano.shared(value=init_ary, name='gpis_xn')
        self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
        self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
        self.obs_logvar = theano.shared(value=zero_ary,
                                        name='gpsi_obs_logvar')
        self.shared_param_dicts = {}
        self.shared_param_dicts['x_null'] = self.x_null
        self.shared_param_dicts['grad_null'] = self.grad_null
        self.shared_param_dicts['s0'] = self.s0
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.x_null = self.shared_param_dicts['x_null']
        self.grad_null = self.shared_param_dicts['grad_null']
        self.s0 = self.shared_param_dicts['s0']
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
    # tanh squashing keeps the observation log-variance inside (-8, 8)
    self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

    ##################################################
    # Setup the iterative imputation loop using scan #
    ##################################################
    self.ones_mask = T.ones_like(self.x_mask)

    def imp_step_func(zi_zmuv, si):
        """One imputation step: sample zi, update the state, score it."""
        si_as_x = self._si_as_x(si)
        xi_unmasked = self.x_out
        xi_masked = (self.x_mask * xi_unmasked) + \
                    ((1.0 - self.x_mask) * si_as_x)
        # get samples of next zi, according to the global policy
        zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
        zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
        # get samples of next zi, according to the guide policy
        zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
            T.concatenate([xi_masked, xi_unmasked], axis=1))
        zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
        # make zi samples that can be switched between zi_p and zi_q
        zi = ((self.train_switch[0] * zi_q) +
              ((1.0 - self.train_switch[0]) * zi_p))
        # compute relevant KLds for this step
        kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                zi_p_mean, zi_p_logvar)  # KL(q || p)
        kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                zi_q_mean, zi_q_logvar)  # KL(p || q)
        kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                0.0, 0.0)  # KL(p || global prior)
        # compute the next si, given the sampled zi
        hydra_out = self.p_sip1_given_zi.apply(zi)
        si_step = hydra_out[0]
        if self.step_type == 'jump':
            # jump steps always completely overwrite the current guesses
            sip1 = si_step
        elif self.step_type == 'add':
            # add steps just update the guesses additively
            sip1 = si + si_step
        elif self.step_type == 'lstm':
            # LSTM-style updates with write and erase gates
            write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
            erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
            sip1 = (erase_gate * si) + (write_gate * si_step)
        elif self.step_type == 'layer':
            alpha_gate = T.nnet.sigmoid(hydra_out[1])
            sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
        else:
            # an explicit raise (not assert) so the check survives
            # `python -O` instead of falling through with sip1 unbound
            raise ValueError("Unknown step type!")
        # compute NLL for the current imputation
        nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
        return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

    # apply scan op for the sequential imputation loop
    self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
    # only the state (first output) is fed back into the next step
    init_vals = [self.s0_full, None, None, None, None]
    self.scan_results, self.scan_updates = theano.scan(
        imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)
    self.si = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi_q2p = self.scan_results[2]
    self.kldi_p2q = self.scan_results[3]
    self.kldi_p2g = self.scan_results[4]

    # get the initial imputation state
    self.x0 = (self.x_mask * self.x_in) + \
              ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX(np.zeros((1,)))
    self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared var for weighting nll of data given posterior sample
    self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # init shared var for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
    self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
    self.set_lam_l2w(1e-5)

    # Grab all of the "optimizable" parameters in "group 1"
    self.joint_params = [self.s0, self.obs_logvar]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                     (self.lam_kld_q[0] * self.kld_q) + \
                     (self.lam_kld_g[0] * self.kld_g)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # only the NLL of the final imputation step enters the objective
    self.nll_costs = self.nlli[-1]
    self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for param, grad in zip(self.joint_params, grad_list):
        self.joint_grads[param] = grad

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(
        params=self.joint_params, grads=self.joint_grads, alpha=self.lr,
        beta1=self.mom_1, beta2=self.mom_2,
        mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    # the updates returned by scan must be applied with the Adam updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct a function for jointly training the generator/inferencer
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling best step cost computer...")
    self.compute_per_step_cost = self._construct_compute_per_step_cost()
    print("Compiling data-guided imputer sampler...")
    self.sample_imputer = self._construct_sample_imputer()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
class TwoStageModel2(object):
    r"""
    Controller for training a two-step hierarchical generative model.

    x: the "observation" variables
    z: the "prior" latent variables
    h: the "hidden" latent variables

    Generative model is: p(x) = \sum_{z,h} p(x|h) p(h|z) p(z)
    Variational model is: q(h,z|x) = q(h|x) q(z|h)

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_h_given_z: InfNet for h given z
        p_x_given_h: InfNet for x given h
        q_h_given_x: InfNet for h given x
        q_z_given_h: InfNet for z given h
        x_dim: dimension of the "observation" space
        z_dim: dimension of the "prior" latent space
        h_dim: dimension of the "hidden" latent space
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: None to create fresh shared variables, or a
                dict holding 'p_z_mean', 'p_z_logvar' and 'obs_logvar'
                to reuse existing ones
    """
    def __init__(self, rng=None,
                 x_in=None, x_out=None,
                 p_h_given_z=None,
                 p_x_given_h=None,
                 q_h_given_x=None,
                 q_z_given_h=None,
                 x_dim=None,
                 z_dim=None,
                 h_dim=None,
                 params=None,
                 shared_param_dicts=None):
        """
        Build the symbolic graph, costs, updates and compiled functions.

        See the class docstring for the meaning of each argument.
        """
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or
                   (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            # sigmoid is the default when no transform is requested
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            # bernoulli outputs always use sigmoid, whatever was requested
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_h_given_x = q_h_given_x
        self.q_z_given_h = q_z_given_h
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this TSM
            init_vec = to_fX(np.zeros((1, self.z_dim)))
            self.p_z_mean = theano.shared(value=init_vec,
                                          name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec,
                                            name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='tsm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared variables handed in by the caller
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # tanh squashing keeps the observation log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "hidden" latent state (from q)
        h_q_mean, h_q_logvar, h_q = \
            self.q_h_given_x.apply(self.x_in, do_samples=True)
        # samples of "prior" latent state (from q)
        z_q_mean, z_q_logvar, z_q = \
            self.q_z_given_h.apply(h_q, do_samples=True)
        # samples of "prior" latent state (from p)
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)
        z_p = (T.exp(0.5 * z_p_logvar) * zmuv) + z_p_mean
        # samples from z -- switched between q/p
        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # samples of "hidden" latent state (from p)
        h_p_mean, h_p_logvar, h_p = \
            self.p_h_given_z.apply(self.z, do_samples=True)
        # samples from h -- switched between q/p
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)
        # compute KLds for "prior" and "hidden" latent distributions
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar,
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar,
                                      z_q_mean, z_q_logvar)
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar,
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar,
                                      h_q_mean, h_q_logvar)
        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary,
                                         name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary,
                                         name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        # (p_z_mean and p_z_logvar are not in the optimized set)
        self_params = [self.obs_logvar]  #+ [self.p_z_mean, self.p_z_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_h_given_x.mlp_params)
        child_params.extend(self.q_z_given_h.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, grad_list):
            self.joint_grads[param] = grad

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(
            params=self.joint_params, grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        # each value is stored as a length-1 array in its shared variable
        self.lr.set_value(to_fX(np.zeros((1,)) + lr))
        self.mom_1.set_value(to_fX(np.zeros((1,)) + mom_1))
        self.mom_2.set_value(to_fX(np.zeros((1,)) + mom_2))

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self.lam_nll.set_value(to_fX(np.zeros((1,)) + lam_nll))

    def set_lam_kld(self, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.
        """
        self.lam_kld_q2p.set_value(to_fX(np.zeros((1,)) + lam_kld_q2p))
        self.lam_kld_p2q.set_value(to_fX(np.zeros((1,)) + lam_kld_p2q))

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self.lam_l2w.set_value(to_fX(np.zeros((1,)) + lam_l2w))

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Values below 0.5 are stored as 0.0, everything else as 1.0. In the
        graph built by __init__, 1.0 selects the q samples and 0.0 the
        p samples.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        self.train_switch.set_value(to_fX(np.zeros((1,)) + switch_val))

    def _construct_nll_costs(self, xo):
        """
        Construct the negative log-likelihood part of free energy.

        xo is the target that the transformed self.x_gen is scored against.
        """
        xh = self.obs_transform(self.x_gen)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar)
        return -ll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the posterior KL-divergence part of cost to minimize.

        Each KLd term is raised elementwise to the power p and summed over
        axis 1 (keeping that axis). Returns the list
        [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q].
        """
        kld_terms = (self.kld_z_q2p, self.kld_z_p2q,
                     self.kld_h_q2p, self.kld_h_p2q)
        return [T.sum(kld**p, axis=1, keepdims=True) for kld in kld_terms]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.

        Returns the sum of squared entries over all of self.joint_params.
        """
        return sum(T.sum(p**2.0) for p in self.joint_params)

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, br): the input matrix, the
        target matrix, and an integer repeat count applied to the rows of
        both. It applies self.joint_updates and returns
        [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        br = T.lscalar()
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xi, xo, br],
            outputs=outputs,
            givens={self.x_in: xi.repeat(br, axis=0),
                    self.x_out: xo.repeat(br, axis=0)},
            updates=self.joint_updates)
        return func

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned callable takes (XI, XO, sample_count) and returns
        [mean_nll, mean_kld], each averaged over sample_count evaluations.
        """
        # construct values to output
        nll = self._construct_nll_costs(self.x_out)
        kld_z = self.kld_z_q2p
        kld_h = self.kld_h_q2p
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[self.x_in, self.x_out],
                                         outputs=[nll, kld_z, kld_h])

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_z_sum = np.zeros((XI.shape[0],))
            kld_h_sum = np.zeros((XI.shape[0],))
            for _ in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_z_sum += np.sum(result[1], axis=1).ravel()
                kld_h_sum += np.sum(result[2], axis=1).ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = (kld_z_sum + kld_h_sum) / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this TwoStageModel.

        The returned callable takes samp_count and returns the output of
        the compiled sampler for a zero input of samp_count rows.
        """
        x_sym = T.matrix()
        sample_func = theano.function(
            inputs=[x_sym],
            outputs=self.obs_transform(self.x_gen),
            givens={self.x_in: T.zeros_like(x_sym),
                    self.x_out: T.zeros_like(x_sym)})

        def prior_sampler(samp_count):
            x_samps = to_fX(np.zeros((samp_count, self.x_dim)))
            # remember the current mode as a plain float
            old_switch = float(
                self.train_switch.get_value(borrow=False)[0])
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                # generate samples from model
                model_samps = sample_func(x_samps)
            finally:
                # set model back to previous mode, even if sampling failed
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler
class GPSImputer(object): """ Controller for training a multi-step imputater via guided policy search. Parameters: rng: numpy.random.RandomState (for reproducibility) x_in: the initial state for imputation x_out: the goal state for imputation x_mask: mask for state dims to keep fixed during imputation p_zi_given_xi: HydraNet for stochastic part of step (2 outputs) p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs) q_zi_given_xi: HydraNet for the guide policy (2 outputs) params: REQUIRED PARAMS SHOWN BELOW x_dim: dimension of inputs to reconstruct z_dim: dimension of latent space for policy wobble imp_steps: number of reconstruction steps to perform step_type: either "add", "jump", "lstm", or "layer" x_type: can be "bernoulli" or "gaussian" """ def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None, \ p_zi_given_xi=None, \ p_sip1_given_zi=None, \ q_zi_given_xi=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_dim = self.params['x_dim'] self.z_dim = self.params['z_dim'] self.imp_steps = self.params['imp_steps'] self.step_type = self.params['step_type'] self.x_type = self.params['x_type'] assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) self.shared_param_dicts = shared_param_dicts # grab handles to the relevant InfNets self.p_zi_given_xi = p_zi_given_xi self.p_sip1_given_zi = p_sip1_given_zi self.q_zi_given_xi = q_zi_given_xi # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x_in = x_in self.x_out = x_out self.x_mask = x_mask self.zi_zmuv = T.tensor3() # setup switching variable for changing between sampling/training zero_ary = to_fX(np.zeros((1, ))) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) if self.shared_param_dicts is None: # initialize parameters 
"owned" by this model s0_init = to_fX(np.zeros((self.x_dim, ))) init_ary = to_fX(np.zeros((self.x_dim, ))) self.x_null = theano.shared(value=init_ary, name='gpis_xn') self.grad_null = theano.shared(value=init_ary, name='gpsi_gn') self.s0 = theano.shared(value=s0_init, name='gpsi_s0') self.obs_logvar = theano.shared(value=zero_ary, name='gpsi_obs_logvar') self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0]) self.shared_param_dicts = {} self.shared_param_dicts['x_null'] = self.x_null self.shared_param_dicts['grad_null'] = self.grad_null self.shared_param_dicts['s0'] = self.s0 self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: # grab the parameters required by this model from a given dict self.x_null = self.shared_param_dicts['x_null'] self.grad_null = self.shared_param_dicts['grad_null'] self.s0 = self.shared_param_dicts['s0'] self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh( (1.0 / 8.0) * self.obs_logvar[0]) ################################################## # Setup the iterative imputation loop using scan # ################################################## self.ones_mask = T.ones_like(self.x_mask) def imp_step_func(zi_zmuv, si): si_as_x = self._si_as_x(si) xi_unmasked = self.x_out xi_masked = (self.x_mask * xi_unmasked) + \ ((1.0 - self.x_mask) * si_as_x) grad_unmasked = self.x_out - si_as_x grad_masked = self.x_mask * grad_unmasked # get samples of next zi, according to the global policy zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked) zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv) # get samples of next zi, according to the guide policy zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply( T.concatenate([xi_masked, xi_unmasked], axis=1)) zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv) # make zi samples that can be switched between zi_p and zi_q zi = ((self.train_switch[0] * zi_q) + \ ((1.0 - self.train_switch[0]) * zi_p)) # compute relevant KLds for this step kldi_q2p 
= gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar) # KL(q || p) kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar) # KL(p || q) kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0) # KL(p || global prior) # compute the next si, given the sampled zi hydra_out = self.p_sip1_given_zi.apply(zi) si_step = hydra_out[0] if (self.step_type == 'jump'): # jump steps always completely overwrite the current guesses sip1 = si_step elif (self.step_type == 'add'): # add steps just update the guesses additively sip1 = si + si_step elif (self.step_type == 'lstm'): # LSTM-style updates with write and erase gates write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1]) erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2]) sip1 = (erase_gate * si) + (write_gate * si_step) elif (self.step_type == 'layer'): alpha_gate = T.nnet.sigmoid(hydra_out[1]) sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step) else: assert False, "Unknown step type!" # compute NLL for the current imputation nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask) return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g # apply scan op for the sequential imputation loop self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0 init_vals = [self.s0_full, None, None, None, None] self.scan_results, self.scan_updates = theano.scan(imp_step_func, \ outputs_info=init_vals, sequences=self.zi_zmuv) self.si = self.scan_results[0] self.nlli = self.scan_results[1] self.kldi_q2p = self.scan_results[2] self.kldi_p2q = self.scan_results[3] self.kldi_p2g = self.scan_results[4] # get the initial imputation state self.x0 = (self.x_mask * self.x_in) + \ ((1.0 - self.x_mask) * self._si_as_x(self.s0_full)) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = 
to_fX(np.zeros((1, ))) self.lr = theano.shared(value=zero_ary, name='gpsi_lr') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p') self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q') self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g') self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.joint_params = [self.s0, self.obs_logvar] self.joint_params.extend(self.p_zi_given_xi.mlp_params) self.joint_params.extend(self.p_sip1_given_zi.mlp_params) self.joint_params.extend(self.q_zi_given_xi.mlp_params) ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0) self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \ (self.lam_kld_q[0] * self.kld_q) + \ (self.lam_kld_g[0] * self.kld_g) self.kld_cost = T.mean(self.kld_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self.nlli[-1] self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel() self.nll_bound = T.mean(self.nll_bounds) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # 
######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost ############################## # CONSTRUCT A PER-TRIAL COST # ############################## self.obs_costs = self.nll_costs + self.kld_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.joint_updates = get_adam_updates(params=self.joint_params, \ grads=self.joint_grads, alpha=self.lr, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0) for k, v in self.scan_updates.items(): self.joint_updates[k] = v # Construct a function for jointly training the generator/inferencer print("Compiling cost computer...") self.compute_raw_costs = self._construct_raw_costs() print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling best step cost computer...") self.compute_per_step_cost = self._construct_compute_per_step_cost() print("Compiling data-guided imputer sampler...") self.sample_imputer = self._construct_sample_imputer() # make easy access points for some interesting parameters #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W return def _si_as_x(self, si): """ Convert from "state" to "observation". """ si_as_x = T.nnet.sigmoid(si) return si_as_x def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. 
""" zero_ary = np.zeros((1, )) # set learning rate new_lr = zero_ary + lr self.lr.set_value(to_fX(new_lr)) # set momentums (use first and second order "momentum") new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_nll(self, lam_nll=1.0): """ Set weight for controlling the influence of the data likelihood. """ zero_ary = np.zeros((1, )) new_lam = zero_ary + lam_nll self.lam_nll.set_value(to_fX(new_lam)) return def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0): """ Set the relative weight of prior KL-divergence vs. data likelihood. """ zero_ary = np.zeros((1, )) new_lam = zero_ary + lam_kld_p self.lam_kld_p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q self.lam_kld_q.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_g self.lam_kld_g.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1, )) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. """ if (switch_val < 0.5): switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1, )) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def _construct_zi_zmuv(self, xi, br): """ Construct the necessary (symbolic) samples for computing through this GPSImputer for input (sybolic) matrix xi. """ zi_zmuv = self.rng.normal( \ size=(self.imp_steps, xi.shape[0]*br, self.z_dim), \ avg=0.0, std=1.0, dtype=theano.config.floatX) return zi_zmuv def _construct_nll_costs(self, si, xo, xm): """ Construct the negative log-likelihood part of free energy. 
""" # average log-likelihood over the refinement sequence xh = self._si_as_x(si) xm_inv = 1.0 - xm # we will measure nll only where xm_inv is 1 if self.x_type == 'bernoulli': ll_costs = log_prob_bernoulli(xo, xh, mask=xm_inv) else: ll_costs = log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar, mask=xm_inv) nll_costs = -ll_costs.flatten() return nll_costs def _construct_kld_costs(self, p=1.0): """ Construct the policy KL-divergence part of cost to minimize. """ kld_pis = [] kld_qis = [] kld_gis = [] for i in range(self.imp_steps): kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1)) kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1)) kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1)) # compute the batch-wise costs kld_pi = sum(kld_pis) kld_qi = sum(kld_qis) kld_gi = sum(kld_gis) return [kld_pi, kld_qi, kld_gi] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params]) return param_reg_cost def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. 
""" # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() xm = T.matrix() zizmuv = self._construct_zi_zmuv(xi, 1) # construct values to output nll = self.nll_costs.flatten() kld = self.kld_q.flatten() # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function(inputs=[ xi, xo, xm ], \ outputs=[nll, kld], \ givens={self.x_in: xi, \ self.x_out: xo, \ self.x_mask: xm, \ self.zi_zmuv: zizmuv}, \ updates=self.scan_updates, \ on_unused_input='ignore') # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XI, XO, XM, sample_count=20, use_guide_policy=True): # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from guide policies (i.e. variational q) self.set_train_switch(switch_val=1.0) else: # take samples from model's imputation policy self.set_train_switch(switch_val=0.0) # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XI.shape[0], )) kld_sum = np.zeros((XI.shape[0], )) for i in range(sample_count): result = fe_term_sample(XI, XO, XM) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) if not use_guide_policy: # no KLd if samples are from the primary policy... mean_kld = 0.0 * mean_kld return [mean_nll, mean_kld] return fe_term_estimator def _construct_raw_costs(self): """ Construct all the raw, i.e. not weighted by any lambdas, costs. 
""" # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() xm = T.matrix() zizmuv = self._construct_zi_zmuv(xi, 1) # compile theano function for computing the costs all_step_costs = [ self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g ] cost_func = theano.function(inputs=[xi, xo, xm], \ outputs=all_step_costs, \ givens={ self.x_in: xi, \ self.x_out: xo, \ self.x_mask: xm, \ self.zi_zmuv: zizmuv }, \ updates=self.scan_updates, \ on_unused_input='ignore') # make a function for computing multi-sample estimates of cost def raw_cost_computer(XI, XO, XM): _all_costs = cost_func(to_fX(XI), to_fX(XO), to_fX(XM)) _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0) _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0) _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0) _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1) _step_klds = to_fX(np.asarray([k for k in _step_klds])) _step_nlls = np.mean(_all_costs[0], axis=1) _step_nlls = to_fX(np.asarray([k for k in _step_nlls])) results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g] return results return raw_cost_computer def _construct_compute_per_step_cost(self): """ Construct a theano function for computing the best possible cost achieved by sequential imputation. 
""" # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() xm = T.matrix() zizmuv = self._construct_zi_zmuv(xi, 1) # construct symbolic variables for the step-wise cost step_mean_nll = T.mean(self.nlli, axis=1).flatten() step_lone_kld = T.sum(self.kldi_q2p, axis=2) step_cumu_kld = T.extra_ops.cumsum(step_lone_kld, axis=0) step_mean_kld = T.mean(step_cumu_kld, axis=1).flatten() # compile theano function for computing the step-wise cost step_cost_func = theano.function(inputs=[xi, xo, xm], \ outputs=[step_mean_nll, step_mean_kld], \ givens={ self.x_in: xi, \ self.x_out: xo, \ self.x_mask: xm, \ self.zi_zmuv: zizmuv }, \ updates=self.scan_updates, \ on_unused_input='ignore') def best_cost_computer(XI, XO, XM, sample_count=20): # compute a multi-sample estimate of variational free-energy step_nll_sum = np.zeros((self.imp_steps, )) step_kld_sum = np.zeros((self.imp_steps, )) for i in range(sample_count): result = step_cost_func(XI, XO, XM) step_nll_sum += result[0].ravel() step_kld_sum += result[1].ravel() mean_step_nll = step_nll_sum / float(sample_count) mean_step_kld = step_kld_sum / float(sample_count) return [mean_step_nll, mean_step_kld] return best_cost_computer def _construct_train_joint(self): """ Construct theano function to train all networks jointly. 
""" # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() xm = T.matrix() br = T.lscalar() zizmuv = self._construct_zi_zmuv(xi, br) # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_bound, self.nll_cost, \ self.kld_cost, self.reg_cost, self.obs_costs] # compile the theano function func = theano.function(inputs=[ xi, xo, xm, br ], \ outputs=outputs, \ givens={ self.x_in: xi.repeat(br, axis=0), \ self.x_out: xo.repeat(br, axis=0), \ self.x_mask: xm.repeat(br, axis=0), \ self.zi_zmuv: zizmuv }, \ updates=self.joint_updates, \ on_unused_input='ignore') return func def _construct_sample_imputer(self): """ Construct a function for drawing samples from the distribution generated by running this imputer. """ xi = T.matrix() xo = T.matrix() xm = T.matrix() zizmuv = self._construct_zi_zmuv(xi, 1) oputs = [self.x0] + [ self._si_as_x(self.si[i]) for i in range(self.imp_steps) ] sample_func = theano.function(inputs=[xi, xo, xm], outputs=oputs, \ givens={self.x_in: xi, \ self.x_out: xo, \ self.x_mask: xm, \ self.zi_zmuv: zizmuv}, \ updates=self.scan_updates, \ on_unused_input='ignore') def imputer_sampler(XI, XO, XM, use_guide_policy=False): XI = to_fX(XI) XO = to_fX(XO) XM = to_fX(XM) # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if use_guide_policy: # take samples from guide policies (i.e. variational q) self.set_train_switch(switch_val=1.0) else: # take samples from model's imputation policy self.set_train_switch(switch_val=0.0) # draw guided/unguided conditional samples model_samps = sample_func(XI, XO, XM) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) # reverse engineer the "masked" samples... 
masked_samps = [] for xs in model_samps: xsm = (XM * XI) + ((1.0 - XM) * xs) masked_samps.append(xsm) return model_samps, masked_samps return imputer_sampler def save_to_file(self, f_name=None): """ Dump important stuff to a Python pickle, so that we can reload this model later. """ assert (not (f_name is None)) f_handle = file(f_name, 'wb') # dump the dict self.params, which just holds "simple" python values cPickle.dump(self.params, f_handle, protocol=-1) # make a copy of self.shared_param_dicts, with numpy arrays in place # of the theano shared variables numpy_param_dicts = {} for key in self.shared_param_dicts: numpy_ary = self.shared_param_dicts[key].get_value(borrow=False) numpy_param_dicts[key] = numpy_ary # dump the numpy version of self.shared_param_dicts to pickle file cPickle.dump(numpy_param_dicts, f_handle, protocol=-1) # get numpy dicts for each of the "child" models that we must save child_model_dicts = {} child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict() child_model_dicts[ 'p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict() child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict() # dump the numpy child model dicts to the pickle file cPickle.dump(child_model_dicts, f_handle, protocol=-1) f_handle.close() return
class GPSImputer(object):
    """
    Controller for training a multi-step imputer via guided policy search.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility); required in
             practice, since rng.randint() is called during construction
        x_in: the initial state for imputation
        x_out: the goal state for imputation
        x_mask: mask for state dims to keep fixed during imputation
        p_zi_given_xi: HydraNet for stochastic part of step (2 outputs)
        p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs)
        q_zi_given_xi: HydraNet for the guide policy (2 outputs)
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of inputs to reconstruct
                z_dim: dimension of latent space for policy wobble
                imp_steps: number of reconstruction steps to perform
                step_type: either "add", "jump", "lstm", or "layer"
                x_type: can be "bernoulli" or "gaussian"
        shared_param_dicts: optional dict holding the shared variables
                'x_null', 'grad_null', 's0' and 'obs_logvar'; when None,
                zero-initialized shared variables are created instead
    """

    def __init__(self, rng=None, x_in=None, x_mask=None, x_out=None,
                 p_zi_given_xi=None,
                 p_sip1_given_zi=None,
                 q_zi_given_xi=None,
                 params=None,
                 shared_param_dicts=None):
        """
        Build the symbolic imputation loop, the costs, the parameter
        updates, and compile all of the theano functions exposed by this
        object (see the class docstring for the arguments).
        """
        # setup a rng for this GPSImputer
        self.rng = RandStream(rng.randint(100000))
        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        self.shared_param_dicts = shared_param_dicts
        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi
        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        self.zi_zmuv = T.tensor3()
        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)
        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim,)))
            init_ary = to_fX(np.zeros((self.x_dim,)))
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # squash the log-variance into (-8, 8) via a scaled tanh
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            """
            Scan body for one imputation step: takes this step's slice of
            the noise sequence and the previous state, and returns the
            next state plus the per-step NLL and KLd terms.
            """
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # keep x_out where the mask is 1, use current guess elsewhere
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) +
                  ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0)  # KL(p || global prior)
            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                assert False, "Unknown step type!"
            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(
            imp_step_func, outputs_info=init_vals, sequences=self.zi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # the updates produced by scan must ride along with the training step
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()

    @staticmethod
    def _scalar_ary(value):
        """
        Wrap value as a length-1 array (np.zeros((1,)) + value) and pass it
        through to_fX, ready for set_value() on the scalar shared vars.
        """
        return to_fX(np.zeros((1,)) + value)

    def _si_as_x(self, si):
        """
        Convert from "state" to "observation" by applying a sigmoid.
        """
        return T.nnet.sigmoid(si)

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        # set learning rate
        self.lr.set_value(self._scalar_ary(lr))
        # set momentums (use first and second order "momentum")
        self.mom_1.set_value(self._scalar_ary(mom_1))
        self.mom_2.set_value(self._scalar_ary(mom_2))

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self.lam_nll.set_value(self._scalar_ary(lam_nll))

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        self.lam_kld_p.set_value(self._scalar_ary(lam_kld_p))
        self.lam_kld_q.set_value(self._scalar_ary(lam_kld_q))
        self.lam_kld_g.set_value(self._scalar_ary(lam_kld_g))

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self.lam_l2w.set_value(self._scalar_ary(lam_l2w))

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Values below 0.5 are snapped to 0.0 (sample zi from the primary
        policy p); everything else is snapped to 1.0 (sample zi from the
        guide policy q).
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        self.train_switch.set_value(self._scalar_ary(switch_val))

    def _construct_zi_zmuv(self, xi, br):
        """
        Construct the necessary (symbolic) samples for computing through this
        GPSImputer for input (symbolic) matrix xi.

        The samples are zero-mean, unit-variance normals with shape
        (imp_steps, xi.shape[0] * br, z_dim).
        """
        zi_zmuv = self.rng.normal(
            size=(self.imp_steps, xi.shape[0] * br, self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_nll_costs(self, si, xo, xm):
        """
        Construct the negative log-likelihood part of free energy.

        The likelihood of xo under the observation derived from state si is
        measured only where the inverted mask (1 - xm) is 1.
        """
        xh = self._si_as_x(si)
        xm_inv = 1.0 - xm  # we will measure nll only where xm_inv is 1
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=xm_inv)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar,
                                          mask=xm_inv)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KLd is raised to the power p, summed over axis 1, and
        then accumulated over all imputation steps. Returns the list
        [kld_p, kld_q, kld_g], built from KL(p || q), KL(q || p) and
        KL(p || global prior) respectively.
        """
        steps = range(self.imp_steps)
        kld_pis = [T.sum(self.kldi_p2q[i]**p, axis=1) for i in steps]
        kld_qis = [T.sum(self.kldi_q2p[i]**p, axis=1) for i in steps]
        kld_gis = [T.sum(self.kldi_p2g[i]**p, axis=1) for i in steps]
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        return [kld_pi, kld_qi, kld_gi]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization: the sum of
        squared entries over every parameter in self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns a wrapper fe_term_estimator(XI, XO, XM, sample_count=20,
        use_guide_policy=True) that averages the NLL and KLd terms over
        sample_count stochastic passes.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xi, xo, xm],
            outputs=[nll, kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, XM, sample_count=20,
                              use_guide_policy=True):
            # set model to desired generation mode: 1.0 samples from the
            # guide policies (variational q), 0.0 from the model's own
            # imputation policy
            old_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(
                switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # compute a multi-sample estimate of variational free-energy
                nll_sum = np.zeros((XI.shape[0],))
                kld_sum = np.zeros((XI.shape[0],))
                for _ in range(sample_count):
                    result = fe_term_sample(XI, XO, XM)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # always put the switch back, even if sampling failed
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.

        Returns a wrapper raw_cost_computer(XI, XO, XM) producing
        [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p,
                          self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=all_step_costs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # make a function for computing multi-sample estimates of cost
        def raw_cost_computer(XI, XO, XM):
            _all_costs = cost_func(to_fX(XI), to_fX(XO), to_fX(XM))
            # mean over axis 1, then sum over axis 0 (the scan/step axis)
            _kld_q2p = np.sum(
                np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(
                np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(
                np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            # per-step KL(q || p): sum over axis 2, mean over axis 1
            _step_klds = np.mean(
                np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX(np.asarray(_step_klds))
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX(np.asarray(_step_nlls))
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_compute_per_step_cost(self):
        """
        Construct a theano function for computing the best possible cost
        achieved by sequential imputation.

        Returns a wrapper best_cost_computer(XI, XO, XM, sample_count=20)
        producing [mean_step_nll, mean_step_kld], each of length imp_steps.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct symbolic variables for the step-wise cost
        step_mean_nll = T.mean(self.nlli, axis=1).flatten()
        step_lone_kld = T.sum(self.kldi_q2p, axis=2)
        # KLd accumulates along the step axis
        step_cumu_kld = T.extra_ops.cumsum(step_lone_kld, axis=0)
        step_mean_kld = T.mean(step_cumu_kld, axis=1).flatten()
        # compile theano function for computing the step-wise cost
        step_cost_func = theano.function(
            inputs=[xi, xo, xm],
            outputs=[step_mean_nll, step_mean_kld],
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def best_cost_computer(XI, XO, XM, sample_count=20):
            # compute a multi-sample estimate of variational free-energy
            step_nll_sum = np.zeros((self.imp_steps,))
            step_kld_sum = np.zeros((self.imp_steps,))
            for _ in range(sample_count):
                result = step_cost_func(XI, XO, XM)
                step_nll_sum += result[0].ravel()
                step_kld_sum += result[1].ravel()
            mean_step_nll = step_nll_sum / float(sample_count)
            mean_step_kld = step_kld_sum / float(sample_count)
            return [mean_step_nll, mean_step_kld]
        return best_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, xm, br), repeats every input
        row br times along axis 0, applies self.joint_updates, and returns
        [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        br = T.lscalar()
        zizmuv = self._construct_zi_zmuv(xi, br)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(
            inputs=[xi, xo, xm, br],
            outputs=outputs,
            givens={self.x_in: xi.repeat(br, axis=0),
                    self.x_out: xo.repeat(br, axis=0),
                    self.x_mask: xm.repeat(br, axis=0),
                    self.zi_zmuv: zizmuv},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sample_imputer(self):
        """
        Construct a function for drawing samples from the distribution
        generated by running this imputer.

        Returns a wrapper imputer_sampler(XI, XO, XM,
        use_guide_policy=False) producing (model_samps, masked_samps): the
        initial state followed by one sample per imputation step, and the
        same samples with the masked entries of XI pasted back in.
        """
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        oputs = [self.x0] + \
                [self._si_as_x(self.si[i]) for i in range(self.imp_steps)]
        sample_func = theano.function(
            inputs=[xi, xo, xm], outputs=oputs,
            givens={self.x_in: xi,
                    self.x_out: xo,
                    self.x_mask: xm,
                    self.zi_zmuv: zizmuv},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def imputer_sampler(XI, XO, XM, use_guide_policy=False):
            XI = to_fX(XI)
            XO = to_fX(XO)
            XM = to_fX(XM)
            # set model to desired generation mode: 1.0 samples from the
            # guide policies (variational q), 0.0 from the model's own
            # imputation policy
            old_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(
                switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO, XM)
            finally:
                # always put the switch back, even if sampling failed
                self.set_train_switch(switch_val=old_switch)
            # reverse engineer the "masked" samples...
            masked_samps = [(XM * XI) + ((1.0 - XM) * xs)
                            for xs in model_samps]
            return model_samps, masked_samps
        return imputer_sampler

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order: self.params,
        a dict with the numpy values of self.shared_param_dicts, and a dict
        with the save_to_dict() output of each child network.
        """
        assert f_name is not None
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = \
            self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = \
            self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = \
            self.q_zi_given_xi.save_to_dict()
        # open() works on both Python 2 and 3 (file() is Python 2 only), and
        # the with-block closes the handle even if a dump raises
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
def __init__(self, rng, input, in_dim, out_dim, \
             activation=None, pool_size=0, \
             drop_rate=0., input_noise=0., bias_noise=0., \
             W=None, b=None, name="", W_scale=1.0):
    """
    Build the symbolic graph and parameters for one (optionally noisy,
    optionally max-pooled) fully-connected layer.

    rng: object providing randint() and normal() (used to seed the theano
         random stream and to draw the initial weights)
    input: symbolic input to this layer
    in_dim: number of input features
    out_dim: number of outputs (after pooling, if any)
    activation: activation to apply; if falsy, relu is used when
                pool_size <= 1 and maxout otherwise
    pool_size: number of filters pooled per output (<= 1 means no pooling)
    drop_rate: masking-noise rate for the input (applied if > 1e-4)
    input_noise: std of gaussian noise added to input (applied if > 1e-4)
    bias_noise: std of gaussian noise added to the pre-activation
                (applied if > 1e-3)
    W, b: optional pre-existing shared weights/biases to reuse
    name: prefix used to name newly created shared variables
    W_scale: multiplier applied to freshly drawn initial weights
    """
    # Setup a shared random generator for this layer
    self.rng = CURAND_RandomStreams(rng.randint(1000000))

    self.clean_input = input

    # Add gaussian noise to the input (if desired)
    if input_noise > 1e-4:
        self.fuzzy_input = input + self.rng.normal(size=input.shape,
                avg=0.0, std=input_noise, dtype=theano.config.floatX)
    else:
        self.fuzzy_input = input

    # Apply masking noise to the input (if desired)
    if drop_rate > 1e-4:
        self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                 drop_rate)
    else:
        self.noisy_input = self.fuzzy_input

    # Set some basic layer properties
    self.pool_size = pool_size
    self.in_dim = in_dim
    self.out_dim = out_dim
    if self.pool_size <= 1:
        self.filt_count = self.out_dim
    else:
        self.filt_count = self.out_dim * self.pool_size
    # floor division keeps pool_count an int (it is fed to range() below)
    # regardless of whether true division is in effect
    self.pool_count = self.filt_count // max(self.pool_size, 1)
    if activation:
        self.activation = activation
    elif self.pool_size <= 1:
        self.activation = relu_actfun
    else:
        self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)

    # Get some random initial weights and biases, if not given
    if W is None:
        if self.pool_size <= 1:
            # Generate random initial filters in a typical way
            W_init = 0.01 * np.asarray(rng.normal(
                    size=(self.in_dim, self.filt_count)),
                    dtype=theano.config.floatX)
        else:
            # Generate groups of random filters to pool over such that
            # intra-group correlations are stronger than inter-group
            # correlations, to encourage pooling over similar filters...
            filters = []
            f_size = (self.in_dim, 1)
            for g_num in range(self.pool_count):
                g_filt = 0.01 * rng.normal(size=f_size)
                for f_num in range(self.pool_size):
                    f_filt = g_filt + 0.003 * rng.normal(size=f_size)
                    filters.append(f_filt)
            W_init = np.hstack(filters).astype(theano.config.floatX)
        W = theano.shared(value=(W_scale * W_init),
                          name="{0:s}_W".format(name))
    if b is None:
        b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
        b = theano.shared(value=b_init, name="{0:s}_b".format(name))

    # Set layer weights and biases
    self.W = W
    self.b = b

    # Compute linear "pre-activation" for this layer
    self.linear_output = T.dot(self.noisy_input, self.W) + self.b

    # Add noise to the pre-activation features (if desired)
    if bias_noise > 1e-3:
        self.noisy_linear = self.linear_output + \
                self.rng.normal(size=self.linear_output.shape,
                avg=0.0, std=bias_noise, dtype=theano.config.floatX)
    else:
        self.noisy_linear = self.linear_output

    # Apply activation function
    self.output = self.activation(self.noisy_linear)

    # Compute some properties of the activations, probably to regularize
    self.act_l2_sum = T.sum(self.output**2.) / self.output.size
    self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
            self.output.shape[0]
    self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
            self.output.shape[1]

    # Conveniently package layer parameters
    self.params = [self.W, self.b]
    # Layer construction complete...
    return
def __init__(self, \
        rng=None, \
        Xd=None, \
        prior_sigma=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build an inference network made of a shared stack of layers feeding
    two heads: a "mu" stack and a "sigma" (log-variance) stack.

    rng: object providing randint(); seeds this network's random stream
         and is forwarded to every HiddenLayer
    Xd: symbolic input matrix
    prior_sigma: stored on the instance as self.prior_sigma
    params: dict of settings. Required keys: 'lam_l2a', 'shared_config',
            'mu_config', 'sigma_config'. Optional keys (default):
            'build_theano_funcs' (True), 'vis_drop' (0.0),
            'hid_drop' (0.0), 'input_noise' (0.0), 'bias_noise' (0.0),
            'init_scale' (1.0), 'kld2_scale' (0.0),
            'sigma_init_scale' (1.0), 'activation' (relu_actfun),
            'encoder' (identity; when given, 'decoder' is required too).
    shared_param_dicts: None to create fresh parameters, or the
            shared_param_dicts of another network to build a clone that
            shares all of its parameters.
    """
    # Setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xd = Xd
    self.prior_sigma = prior_sigma
    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    self.lam_l2a = params['lam_l2a']
    self.build_theano_funcs = params['build_theano_funcs'] \
            if 'build_theano_funcs' in params else True
    self.vis_drop = params['vis_drop'] if 'vis_drop' in params else 0.0
    self.hid_drop = params['hid_drop'] if 'hid_drop' in params else 0.0
    self.input_noise = params['input_noise'] \
            if 'input_noise' in params else 0.0
    self.bias_noise = params['bias_noise'] \
            if 'bias_noise' in params else 0.0
    self.init_scale = params['init_scale'] \
            if 'init_scale' in params else 1.0
    if 'encoder' in params:
        self.encoder = params['encoder']
        self.decoder = params['decoder']
        self.use_encoder = True
    else:
        self.encoder = lambda x: x
        self.decoder = lambda x: x
        self.use_encoder = False
    self.Xd_encoded = self.encoder(self.Xd)
    self.kld2_scale = params['kld2_scale'] \
            if 'kld2_scale' in params else 0.0
    self.sigma_init_scale = params['sigma_init_scale'] \
            if 'sigma_init_scale' in params else 1.0
    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of an inference network, with all
    # of the network parameters shared between clones.
    if shared_param_dicts is None:
        # This is not a clone, and we will need to make a dict for
        # referring to the parameters of each network layer
        self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        self.is_clone = False
    else:
        # This is a clone, and its layer parameters can be found by
        # referring to the given param dict (i.e. shared_param_dicts).
        self.shared_param_dicts = shared_param_dicts
        self.is_clone = True
    # Get the configuration/prototype for this network. The config is a
    # list of layer descriptions, including a description for the input
    # layer, which is typically just the dimension of the inputs. So, the
    # depth of the mlp is one less than the number of layer configs.
    self.shared_config = params['shared_config']
    self.mu_config = params['mu_config']
    self.sigma_config = params['sigma_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun
    #########################################
    # Initialize the shared part of network #
    #########################################
    # Construct input to the inference network
    if self.use_encoder:
        next_input = self.encoder(self.Xd)
    else:
        next_input = self.Xd
    self.shared_layers = self._build_layer_stack(rng, self.shared_config,
            'shared', "share_layer_{0:d}", next_input,
            is_input_stack=True)
    #####################################
    # Initialize the mu part of network #
    #####################################
    # Take input from the output of the shared network
    self.mu_layers = self._build_layer_stack(rng, self.mu_config,
            'mu', "mu_layer_{0:d}", self.shared_layers[-1].output)
    ########################################
    # Initialize the sigma part of network #
    ########################################
    # Take input from the output of the shared network; in-bound weights
    # for the final logvar predictions are set to 0
    self.sigma_layers = self._build_layer_stack(rng, self.sigma_config,
            'sigma', "sigma_layer_{0:d}", self.shared_layers[-1].output,
            zero_last_scale=True)

    # Create a shared parameter for rescaling posterior "sigmas" to allow
    # control over the velocity of the markov chain generated by repeated
    # cycling through the INF -> GEN loop.
    if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
        # we use a hack-ish check to remain compatible with loading models
        # that were saved before the addition of the sigma_scale param.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.sigma_scale = theano.shared(value=zero_ary)
        new_dict = {'sigma_scale': self.sigma_scale}
        self.shared_param_dicts['sigma'].append(new_dict)
        self.set_sigma_scale(1.0)
    else:
        # this is a clone of some other InfNet, and that InfNet was made
        # after adding the sigma_scale param, so use its sigma_scale
        self.sigma_scale = \
                self.shared_param_dicts['sigma'][-1]['sigma_scale']
    # Create a shared parameter for maintaining an exponentially decaying
    # estimate of the population mean of posterior KL divergence.
    if not ('kld_mean' in self.shared_param_dicts['sigma'][-1]):
        # add a kld_mean if none was already present
        zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
        self.kld_mean = theano.shared(value=zero_ary)
        self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean
    else:
        # use a kld_mean that's already present
        self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean']

    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for layer in self.shared_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.mu_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.sigma_layers:
        self.mlp_params.extend(layer.params)

    # The output of this inference network is given by the noisy output
    # of the final layers of its mu and sigma networks.
    self.output_mean = self.mu_layers[-1].linear_output
    self.output_logvar = self.sigma_layers[-1].linear_output
    self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
            T.exp(0.5 * self.output_logvar)
    # We'll also construct an output containing a single samples from each
    # of the distributions represented by the rows of self.output_mean and
    # self.output_sigma.
    self.output = self._construct_post_samples()
    self.out_dim = self.sigma_layers[-1].out_dim
    # Get simple regularization penalty to moderate activation dynamics
    self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
    # Construct a function for penalizing KL divergence between the
    # approximate posteriors produced by this model and some isotropic
    # Gaussian distribution.
    self.kld_cost = self._construct_kld_cost()
    self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \
            (0.02 * T.mean(self.kld_cost)), 'floatX')

    # Construct a theano function for sampling from the approximate
    # posteriors inferred by this model for some collection of points
    # in the "data space".
    if self.build_theano_funcs:
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function([self.Xd], \
                outputs=self.output_mean)
    else:
        self.sample_posterior = None
        self.mean_posterior = None
    return

def _build_layer_stack(self, rng, config, group, name_fmt, first_input,
                       is_input_stack=False, zero_last_scale=False):
    """
    Build one chain of HiddenLayers described by `config`.

    rng: forwarded to every HiddenLayer
    config: list of layer descriptions; consecutive entries give the
            (input, output) description of each layer. An entry is either
            a plain dimension or a list/tuple whose first item is the
            dimension and, for outputs, whose second item is a pool size.
    group: key of self.shared_param_dicts ('shared', 'mu' or 'sigma')
           whose per-layer dicts are created (new net) or reused (clone)
    name_fmt: format string receiving the layer index, used as layer name
    first_input: symbolic input to the first layer of the chain
    is_input_stack: if True, the first layer uses self.vis_drop and
                    self.input_noise and no bias noise
    zero_last_scale: if True, the last layer is created with W_scale=0

    Returns the list of constructed layers, in order.
    """
    # materialize the pairs so that len() works even when zip() returns
    # an iterator
    layer_defs = list(zip(config[:-1], config[1:]))
    layers = []
    next_input = first_input
    for layer_num, (in_def, out_def) in enumerate(layer_defs):
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_defs) - 1))
        l_name = name_fmt.format(layer_num)
        if isinstance(in_def, (list, tuple)):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if isinstance(out_def, (list, tuple)):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0
        # Select the appropriate noise to add to this layer
        if is_input_stack and first_layer:
            d_rate = self.vis_drop
            i_noise = self.input_noise
            b_noise = 0.0
        else:
            d_rate = self.hid_drop
            i_noise = 0.0
            b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if zero_last_scale and last_layer:
            i_scale = 0.0 * i_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise,
                    in_dim=in_dim, out_dim=out_dim,
                    name=l_name, W_scale=i_scale)
            self.shared_param_dicts[group].append(
                    {'W': new_layer.W, 'b': new_layer.b,
                     'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts[group][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise,
                    in_dim=in_dim, out_dim=out_dim,
                    W=init_params['W'], b=init_params['b'],
                    b_in=init_params['b_in'], s_in=init_params['s_in'],
                    name=l_name, W_scale=i_scale)
            # record freshly created input params so later clones share
            # them too
            if ((init_params['b_in'] is None) or
                    (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        layers.append(new_layer)
        next_input = new_layer.output
    return layers
class GenConvModule(object):
    """
    Generator module: one "fractionally strided" (or unpool + regular)
    convolution followed by one regular convolution. The input to the
    first convolution can optionally be augmented with random channels.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """
    def __init__(self,
                 filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None,
                 mod_name='gm_conv',
                 rand_type='normal'):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        # only the first entry of filt_shape is used (filters are square)
        self.filt_dim = filt_shape[0]
        self.in_chans, self.out_chans = in_chans, out_chans
        self.rand_chans, self.use_rand = rand_chans, use_rand
        self.apply_bn_1, self.apply_bn_2 = apply_bn_1, apply_bn_2
        self.us_stride, self.use_pooling = us_stride, use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
                else init_func
        self._init_params() # initialize parameters
        return

    def _init_params(self):
        """
        Create the filters (and, where batch norm is enabled, the gains and
        biases) for both convolutions and collect them in self.params.
        """
        fd = self.filt_dim
        # random channels, when used, are stacked on the exogenous input
        if self.use_rand:
            w1_in_chans = self.in_chans + self.rand_chans
        else:
            w1_in_chans = self.in_chans
        self.w1 = self.init_func((self.out_chans, w1_in_chans, fd, fd),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.out_chans, self.out_chans, fd, fd),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # make gains and biases for transforms that will get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(self.out_chans, "{}_g1".format(self.mod_name))
            self.b1 = make_bias(self.out_chans, "{}_b1".format(self.mod_name))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain(self.out_chans, "{}_g2".format(self.mod_name))
            self.b2 = make_bias(self.out_chans, "{}_b2".format(self.mod_name))
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        input: symbolic input; axes 2 and 3 are upsampled when unpooling
        rand_vals: optional values for the random channels; when None (and
                   use_rand is set) they are drawn from self.rng
        Returns the rectified output of the second convolution.
        """
        n_batch = input.shape[0]
        pad = int((self.filt_dim - 1) / 2) # use "same" mode convolutions
        stride = self.us_stride            # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" the input if desired
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # get shape for random values that will augment input
        rand_shape = (n_batch, self.rand_chans,
                      input.shape[2], input.shape[3])
        full_input = input
        if self.use_rand:
            # augment input with random channels
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(size=rand_shape,
                            avg=0.0, std=1.0, dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(size=rand_shape,
                            low=-1.0, high=1.0, dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # stack random values on top of input
            full_input = T.concatenate([rand_vals, input], axis=1)
        # apply first convolution, perhaps with fractional striding
        if self.use_pooling:
            h1 = dnn_conv(full_input, self.w1, subsample=(1, 1),
                          border_mode=(pad, pad))
        else:
            # apply first conv layer (with fractional stride for upsampling)
            h1 = deconv(full_input, self.w1, subsample=(stride, stride),
                        border_mode=(pad, pad))
        if self.apply_bn_1:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        h1 = relu(h1)
        # apply second conv layer
        h2 = dnn_conv(h1, self.w2, subsample=(1, 1), border_mode=(pad, pad))
        if self.apply_bn_2:
            h2 = batchnorm(h2, g=self.g2, b=self.b2)
        return relu(h2)
def __init__(self, rng, input, in_dim, out_dim, \
             activation=None, pool_size=0, \
             drop_rate=0., input_noise=0., bias_noise=0., \
             W=None, b=None, b_in=None, s_in=None, name="", W_scale=1.0):
    """
    Build the symbolic graph and parameters for one fully-connected layer
    whose noise levels are held in shared variables.

    rng: object providing randint(); seeds this layer's random stream
    input: symbolic input to this layer
    in_dim: number of input features
    out_dim: number of outputs (after pooling, if any)
    activation: activation used when pool_size <= 1 (default relu_actfun);
                maxout is used whenever pool_size > 1
    pool_size: number of filters pooled per output (<= 1 means no pooling)
    drop_rate, input_noise, bias_noise: initial values for the shared
                variables self.drop_rate, self.input_noise, self.bias_noise
    W, b, b_in, s_in: optional pre-existing shared variables to reuse
    name: prefix used to name newly created shared variables
    W_scale: gain passed to ortho_matrix() when W is created here
    """
    # Setup a shared random generator for this layer
    self.rng = RandStream(rng.randint(1000000))

    # setup scale and bias params for the input
    if b_in is None:
        # input biases are always initialized to zero
        ary = np.zeros((in_dim, ), dtype=theano.config.floatX)
        b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
    if s_in is None:
        # softplus(0.541325) is approximately 1, so an input scale applied
        # through a softplus would start at one
        ary = 0.541325 * np.ones((in_dim, ), dtype=theano.config.floatX)
        s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
    self.b_in = b_in
    self.s_in = s_in

    # use the input directly (b_in/s_in are kept as parameters but are
    # not applied to the input here)
    self.clean_input = input

    zero_ary = np.zeros((1, )).astype(theano.config.floatX)
    self.input_noise = theano.shared(value=(zero_ary+input_noise), \
            name="{0:s}_input_noise".format(name))
    self.bias_noise = theano.shared(value=(zero_ary+bias_noise), \
            name="{0:s}_bias_noise".format(name))
    # this variable used to be (mis)named "<name>_bias_noise"
    self.drop_rate = theano.shared(value=(zero_ary+drop_rate), \
            name="{0:s}_drop_rate".format(name))

    # Add gaussian noise to the input (scaled by the shared noise level)
    self.fuzzy_input = self.clean_input + (self.input_noise[0] * \
            self.rng.normal(size=self.clean_input.shape, avg=0.0, std=1.0, \
            dtype=theano.config.floatX))

    # Apply masking noise to the input (at the shared drop rate)
    self.noisy_input = self._drop_from_input(self.fuzzy_input, \
            self.drop_rate[0])

    # Set some basic layer properties
    self.pool_size = pool_size
    self.in_dim = in_dim
    self.out_dim = out_dim
    if self.pool_size <= 1:
        self.filt_count = self.out_dim
    else:
        self.filt_count = self.out_dim * self.pool_size
    # floor division keeps pool_count an int regardless of whether true
    # division is in effect
    self.pool_count = self.filt_count // max(self.pool_size, 1)
    if activation is None:
        activation = relu_actfun
    if self.pool_size <= 1:
        self.activation = activation
    else:
        self.activation = lambda x: \
                maxout_actfun(x, self.pool_size, self.filt_count)

    # Get some random initial weights and biases, if not given
    if W is None:
        # Generate initial filters using orthogonal random trick
        W_init = ortho_matrix(shape=(self.in_dim, self.filt_count), \
                gain=W_scale)
        W_init = W_init.astype(theano.config.floatX)
        W = theano.shared(value=W_init, name="{0:s}_W".format(name))
    if b is None:
        b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
        b = theano.shared(value=b_init, name="{0:s}_b".format(name))

    # Set layer weights and biases
    self.W = W
    self.b = b

    # Compute linear "pre-activation" for this layer
    self.linear_output = T.dot(self.noisy_input, self.W) + self.b

    # Add noise to the pre-activation features (scaled by shared level)
    self.noisy_linear = self.linear_output + (self.bias_noise[0] * \
            self.rng.normal(size=self.linear_output.shape, avg=0.0, \
            std=1.0, dtype=theano.config.floatX))

    # Apply activation function
    self.output = self.activation(self.noisy_linear)

    # Compute some properties of the activations, probably to regularize
    self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

    # Conveniently package layer parameters
    self.params = [self.W, self.b, self.b_in, self.s_in]
    # Layer construction complete...
    return
class GenUniModule(object):
    """
    Generator module that feeds random values (drawn internally or given
    by the caller) through a linear transform, optionally followed by
    batch normalization and a relu.
    """
    def __init__(self, rand_dim, out_dim,
                 apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True,
                 mod_name='dm_uni'):
        self.rand_dim, self.out_dim = rand_dim, out_dim
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        self.rng = RandStream(123)
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
                else init_func
        self._init_params() # initialize parameters
        return

    def _init_params(self):
        """
        Create the weight matrix (and, when batch norm is enabled, the gain
        and bias) and collect them in self.params.
        """
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(self.mod_name))
        self.params = [self.w1]
        # make gains and biases for transforms that will get batch normed
        if self.apply_bn:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(self.out_dim, "{}_g1".format(self.mod_name))
            self.b1 = make_bias(self.out_dim, "{}_b1".format(self.mod_name))
            self.params.extend([self.g1, self.b1])
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        batch_size: number of rows of random values to draw when rand_vals
                    is not given
        rand_vals: caller-supplied random values; reshaped to
                   (rand_vals.shape[0], self.rand_dim)
        """
        assert (batch_size is not None) or (rand_vals is not None), \
                "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape,
                        avg=0.0, std=1.0, dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape,
                        low=-1.0, high=1.0, dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # transform random values linearly
        h1 = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        if self.final_relu:
            h1 = relu(h1)
        return h1

##############
# EYE BUFFER #
##############
class SimpleInfNet(object):
    """
    Single linear layer producing a mean and a log-variance from its input,
    plus samples drawn using those two outputs.
    """
    def __init__(self, rng, in_dim, out_dim, \
                 W_mean=None, b_mean=None, \
                 W_logvar=None, b_logvar=None, \
                 name="", W_scale=1.0):
        """
        rng: object providing randint(); seeds this network's random stream
        in_dim, out_dim: input and output dimensions
        W_mean, b_mean, W_logvar, b_logvar: optional pre-existing shared
            variables; any that are None are created here
        name: prefix used to name newly created shared variables
        W_scale: scale for freshly drawn weights
        """
        # setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # set some basic layer properties
        self.in_dim = in_dim
        self.out_dim = out_dim
        floatX = theano.config.floatX
        weight_shape = (self.in_dim, self.out_dim)

        # weights and biases for the mean estimate
        if W_mean is None:
            if W_scale > 0.1:
                W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            w_vals = (W_scale * npr.normal(0.0, 1.0, weight_shape))
            W_mean = theano.shared(value=w_vals.astype(floatX),
                                   name="{0:s}_W_mean".format(name))
        if b_mean is None:
            W_zero = np.zeros((self.out_dim,), dtype=floatX)
            b_mean = theano.shared(value=W_zero,
                                   name="{0:s}_b_mean".format(name))
        self.W_mean = W_mean
        self.b_mean = b_mean

        # weights and biases for the log-variance estimate
        if W_logvar is None:
            # NOTE(review): W_scale may already have been rescaled in the
            # W_mean branch above, so this rescale can compound -- confirm
            # that this is intended.
            W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            w_vals = (W_scale * npr.normal(0.0, 1.0, weight_shape))
            W_logvar = theano.shared(value=w_vals.astype(floatX),
                                     name="{0:s}_W_logvar".format(name))
        if b_logvar is None:
            W_zero = np.zeros((self.out_dim,), dtype=floatX)
            b_logvar = theano.shared(value=W_zero,
                                     name="{0:s}_b_logvar".format(name))
        self.W_logvar = W_logvar
        self.b_logvar = b_logvar

        # Conveniently package layer parameters
        self.mlp_params = [self.W_mean, self.b_mean,
                           self.W_logvar, self.b_logvar]
        # Layer construction complete...
        return

    def get_bias(self):
        """
        Get the bias at output layer (the bias of the mean estimate).
        """
        return self.b_mean

    def apply(self, x, do_samples=True):
        """
        Apply this SimpleInfNet to some input.

        Returns [z_mean, z_logvar], with z_samples appended when
        do_samples is True.
        """
        z_mean = T.dot(x, self.W_mean) + self.b_mean
        z_logvar = T.dot(x, self.W_logvar) + self.b_logvar
        # the sampling expression is built regardless of do_samples
        zmuv_noise = DCG(self.rng.normal(size=z_mean.shape, avg=0.0,
                std=1.0, dtype=theano.config.floatX))
        z_samples = z_mean + ((T.exp(0.5*z_logvar)) * zmuv_noise)
        # wrap them up for easy returnage
        if do_samples:
            return [z_mean, z_logvar, z_samples]
        return [z_mean, z_logvar]
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
        shared_param_dicts: optional dict holding an 'obs_logvar' shared
                            variable to reuse instead of creating a new one

    NOTE(review): this class reads several attributes that it never assigns
    anywhere in its own body: step_type, xi_zmuv, s0, step_scales,
    p_zi_given_xi, p_sip1_given_zi, p_x_given_si, q_zi_given_xi, nlli, si,
    kldi_p2q, kldi_q2p, kldi_p2g, total_steps, train_switch, use_rev_masks,
    rev_masks_p, rev_masks_q, rev_sched, p_masks, q_masks, s0_full, m0_full,
    mi_p and the method _from_si_to_x. Presumably these are leftovers from
    the model this class was adapted from -- confirm and wire them up before
    relying on this class.
    """
    def __init__(self, rng=None, x_out=None,
                 p_z_given_x=None,
                 p_x_given_z=None,
                 params=None,
                 shared_param_dicts=None):
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or
                   (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            # bernoulli outputs always go through a sigmoid
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned in this class.
        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out           # target output for generation
        self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX( np.zeros((1,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            # NOTE(review): _construct_nll_costs negates the output of this
            # same helper, but it is not negated here -- confirm the sign.
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean,
                                           log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(
                zi_fw, do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(
                xi_fw, do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean,
                                           log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()
            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, \
                   zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op (only the first two
        # outputs are fed back into the next step)
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned in this class.
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            forwards_loop,
            outputs_info=outputs_init,
            sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0,
                         lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): none of the attributes read in the next five
        # statements (other than obs_logvar) are assigned in this class.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = \
            self._construct_kld_costs(p=1.0)
        self.kld_costs = ((self.lam_kld_p[0] * self.kld_p) +
                          (self.lam_kld_q[0] * self.kld_q) +
                          (self.lam_kld_g[0] * self.kld_g) +
                          (self.lam_kld_s[0] * self.kld_s))
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for param, grad in zip(self.joint_params, grad_list):
            self.joint_grads[param] = grad

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads, alpha=self.lr,
            beta1=self.mom_1, beta2=self.mom_2,
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # the training updates must also carry the updates produced by scan
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.
        """
        zero_ary = np.zeros((1,))
        # set learning rate
        new_lr = zero_ary + lr
        self.lr.set_value(to_fX(new_lr))
        # set momentums (use first and second order "momentum")
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the relative weight of prior KL-divergence vs. data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_p
        self.lam_kld_p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_q
        self.lam_kld_q.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_g
        self.lam_kld_g.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_s
        self.lam_kld_s.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Any value below 0.5 is stored as 0.0, anything else as 1.0.
        """
        if (switch_val < 0.5):
            switch_val = 0.0
        else:
            switch_val = 1.0
        zero_ary = np.zeros((1,))
        new_val = zero_ary + switch_val
        self.train_switch.set_value(to_fX(new_val))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Construct the necessary ZMUV gaussian samples for generating
        trajectories from this WalkoutModel, for input matrix xo.
        """
        zi_zmuv = self.rng.normal(
            size=(self.total_steps, xo.shape[0], self.z_dim),
            avg=0.0, std=1.0, dtype=theano.config.floatX)
        return zi_zmuv

    def _construct_rev_masks(self, xo):
        """
        Compute the sequential revelation masks for the input batch in xo.
        -- We need to construct mask sequences for both p and q.

        Returns [pmasks, qmasks].
        """
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            pmasks = self.rev_masks_p.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
        else:
            pm_list = []
            qm_list = []
            # make a zero mask that does nothing
            zero_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
            # generate independently sampled masks for each revelation block
            for rb in self.rev_sched:
                # make a random binary mask with ones at rate rb[1]
                rand_vals = self.rng.uniform(
                    size=(1, xo.shape[0], xo.shape[1]),
                    low=0.0, high=1.0, dtype=theano.config.floatX)
                rand_mask = rand_vals < rb[1]
                # append the masks for this revelation block to the mask lists
                #
                # the guide policy (in q) gets to peek at the values that will be
                # revealed to the primary policy (in p) for the entire block. The
                # primary policy only gets to see these values at end of the final
                # step of the block. Within a given step, values are revealed to q
                # at the beginning of the step, and to p at the end.
                #
                # e.g. in a revelation block with only a single step, the guide
                # policy sees the values at the beginning of the step, which allows
                # it to guide the step. the primary policy only gets to see the
                # values at the end of the step.
                #
                # i.e. a standard variational auto-encoder is equivalent to a
                # sequential revelation and refinement model with only one
                # revelation block, which has one step and a reveal rate of 1.0.
                #
                for refine_step in range(rb[0]-1):
                    pm_list.append(zero_mask)
                    qm_list.append(rand_mask)
                pm_list.append(rand_mask)
                qm_list.append(rand_mask)
            # concatenate each mask list into a 3-tensor
            pmasks = T.cast(T.concatenate(pm_list, axis=0), 'floatX')
            qmasks = T.cast(T.concatenate(qm_list, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Construct the negative log-likelihood part of free energy.
        -- only check NLL where nll_mask == 1
        """
        xh = self._from_si_to_x( si )
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh, mask=nll_mask)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                                          log_vars=self.bounded_logvar,
                                          mask=nll_mask)
        nll_costs = -ll_costs.flatten()
        return nll_costs

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) -- assuming bernoullish outputs
        """
        x_i = self._from_si_to_x( s_i )
        x_j = self._from_si_to_x( s_j )
        kld_s = (x_i * (T.log(x_i) - T.log(x_j))) + \
                ((1.0 - x_i) * (T.log(1.0-x_i) - T.log(1.0-x_j)))
        sum_kld = T.sum(kld_s, axis=1)
        return sum_kld

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the policy KL-divergence part of cost to minimize.

        Each per-step KLd term is raised to the power p before summing.
        Returns [kld_pi, kld_qi, kld_gi, kld_si].
        """
        kld_pis = []
        kld_qis = []
        kld_gis = []
        kld_sis = []
        s0 = 0.0*self.si[0] + self.s0
        for i in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[i]**p, axis=1))
            if i == 0:
                kld_sis.append(self._construct_kld_s(self.si[i], s0))
            else:
                kld_sis.append(self._construct_kld_s(self.si[i], self.si[i-1]))
        # compute the batch-wise costs
        kld_pi = sum(kld_pis)
        kld_qi = sum(kld_qis)
        kld_gi = sum(kld_gis)
        kld_si = sum(kld_sis)
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned callable has the signature
        (XO, sample_count=20, use_guide_policy=True) and returns
        [mean_nll, mean_kld], averaged over sample_count evaluations.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[ xo ],
            outputs=[nll, kld],
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            try:
                # cast once, as the other wrappers in this class do
                xo_fX = to_fX(XO)
                # compute a multi-sample estimate of variational free-energy
                nll_sum = np.zeros((XO.shape[0],))
                kld_sum = np.zeros((XO.shape[0],))
                for i in range(sample_count):
                    result = fe_term_sample(xo_fX)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
                mean_nll = nll_sum / float(sample_count)
                mean_kld = kld_sum / float(sample_count)
            finally:
                # set model back to its previous mode, even on failure
                self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct all the raw, i.e. not weighted by any lambdas, costs.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(
            inputs=[ xo ],
            outputs=all_step_costs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # make a function for computing batch-based estimates of costs.
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            _all_costs = cost_func(to_fX(XO))
            _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(np.mean(_all_costs[3], axis=1, keepdims=True), axis=0)
            _step_klds = np.mean(np.sum(_all_costs[1], axis=2, keepdims=True), axis=1)
            _step_klds = to_fX( np.asarray([k for k in _step_klds]) )
            _step_nlls = np.mean(_all_costs[0], axis=1)
            _step_nlls = to_fX( np.asarray([k for k in _step_nlls]) )
            results = [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
            return results
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_bound, self.nll_cost,
                   self.kld_cost, self.reg_cost, self.obs_costs]
        # compile the theano function (applies the parameter updates)
        func = theano.function(
            inputs=[ xo ],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.joint_updates,
            on_unused_input='ignore')
        return func

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask sequences.

        The returned callable has the signature
        (XO, use_guide_policy=False) and returns [xm_seq, xi_seq, mi_seq].
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only the scan updates are applied:
        # sampling must not run the optimizer updates in joint_updates,
        # matching the other non-training functions in this class.
        func = theano.function(
            inputs=[ xo ],
            outputs=outputs,
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')
        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # set model back to its previous mode, even on failure
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            # scan_vals holds step_count states followed by step_count masks
            for i in range(step_count):
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # masked-in entries come from XO, the rest from the state
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i,:,:] = _xm
                xi_seq[i,:,:] = _xi
                mi_seq[i,:,:] = _mi
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in order: self.params, a dict
        of the values held in self.shared_param_dicts, and a dict of the
        save_to_dict() output of each child model.
        """
        assert f_name is not None
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        # open() instead of the Python-2-only file(); the with-block closes
        # the handle even if one of the dumps raises
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" python values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
class MultiStageModel(object): """ Controller for training a multi-step iterative refinement model. Parameters: rng: numpy.random.RandomState (for reproducibility) x_in: the input data to encode x_out: the target output to decode p_s0_given_z: InfNet for initializing "canvas" state p_hi_given_si: InfNet for hi given si p_sip1_given_si_hi: HydraNet for sip1 given si and hi q_z_given_x: InfNet for z given x q_hi_given_x_si: InfNet for hi given x and si obs_dim: dimension of the observations to generate z_dim: dimension of the "initial" latent space h_dim: dimension of the "primary" latent space ir_steps: number of "iterative refinement" steps to perform params: REQUIRED PARAMS SHOWN BELOW x_type: can be "bernoulli" or "gaussian" obs_transform: can be 'none' or 'sigmoid' """ def __init__(self, rng=None, \ x_in=None, x_out=None, \ p_s0_given_z=None, \ p_hi_given_si=None, \ p_sip1_given_si_hi=None, \ q_z_given_x=None, \ q_hi_given_x_si=None, \ obs_dim=None, \ z_dim=None, h_dim=None, \ ir_steps=4, params=None, \ shared_param_dicts=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x)) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x)) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x)) self.shared_param_dicts = shared_param_dicts # record the dimensions of various spaces relevant to this model self.obs_dim = obs_dim self.z_dim = z_dim self.h_dim = h_dim self.ir_steps = ir_steps # grab handles to the relevant InfNets 
self.q_z_given_x = q_z_given_x self.q_hi_given_x_si = q_hi_given_x_si self.p_s0_given_z = p_s0_given_z self.p_hi_given_si = p_hi_given_si self.p_sip1_given_si_hi = p_sip1_given_si_hi # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x_in = x_in self.x_out = x_out self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan # setup switching variable for changing between sampling/training zero_ary = to_fX( np.zeros((1,)) ) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) # setup a variable for controlling dropout noise self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate') self.set_drop_rate(0.0) # this weight balances l1 vs. l2 penalty on posterior KLds self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2') self.set_lam_kld_l1l2(1.0) if self.shared_param_dicts is None: # initialize "optimizable" parameters specific to this MSM init_vec = to_fX( np.zeros((self.z_dim,)) ) self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean') self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar') init_vec = to_fX( np.zeros((self.obs_dim,)) ) self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar') self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) self.shared_param_dicts = {} self.shared_param_dicts['p_z_mean'] = self.p_z_mean self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: self.p_z_mean = self.shared_param_dicts['p_z_mean'] self.p_z_logvar = self.shared_param_dicts['p_z_logvar'] self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) # setup a function for computing reconstruction log likelihood if self.x_type == 'bernoulli': self.log_prob_func = lambda xo, xh: \ (-1.0 * log_prob_bernoulli(xo, xh)) else: self.log_prob_func = 
lambda xo, xh: \ (-1.0 * log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar)) # get a drop mask that drops things with probability p drop_scale = 1. / (1. - self.drop_rate[0]) drop_rnd = self.rng.uniform(size=self.x_out.shape, \ low=0.0, high=1.0, dtype=theano.config.floatX) drop_mask = drop_scale * (drop_rnd > self.drop_rate[0]) ############################# # Setup self.z and self.s0. # ############################# print("Building MSM step 0...") drop_x = drop_mask * self.x_in self.q_z_mean, self.q_z_logvar, self.z = \ self.q_z_given_x.apply(drop_x, do_samples=True) # get initial observation state self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False) # gather KLd and NLL for the initialization step self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \ self.p_z_mean, self.p_z_logvar) self.init_nlls = -1.0 * \ self.log_prob_func(self.x_out, self.obs_transform(self.s0)) ################################################## # Setup the iterative generation loop using scan # ################################################## def ir_step_func(hi_zmuv, sim1): # get variables used throughout this refinement step sim1_obs = self.obs_transform(sim1) # transform state -> obs grad_ll = self.x_out - sim1_obs # get samples of next hi, conditioned on current si hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \ sim1_obs, do_samples=False) # now we build the model for variational hi given si hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \ T.horizontal_stack(grad_ll, sim1_obs), \ do_samples=False) hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean # make hi samples that can be switched between hi_p and hi_q hi = ( ((self.train_switch[0] * hi_q) + \ ((1.0 - self.train_switch[0]) * hi_p)) ) # p_sip1_given_si_hi is conditioned on si and hi. 
ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi) # get the transformed values (for an LSTM style update) i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0) f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0) # perform an LSTM-like update of the state sim1 -> si si = (in_vals * i_gate) + (sim1 * f_gate) # compute generator NLL for this step nlli = self.log_prob_func(self.x_out, self.obs_transform(si)) # compute relevant KLds for this step kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \ hi_p_mean, hi_p_logvar) kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \ hi_q_mean, hi_q_logvar) return si, nlli, kldi_q2p, kldi_p2q init_values = [self.s0, None, None, None] self.scan_results, self.scan_updates = theano.scan(ir_step_func, \ outputs_info=init_values, sequences=self.hi_zmuv) self.si = self.scan_results[0] self.nlli = self.scan_results[1] self.kldi_q2p = self.scan_results[2] self.kldi_p2q = self.scan_results[3] ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX( np.zeros((1,)) ) self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1') self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z') self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p') self.lam_kld_p2q = 
theano.shared(value=zero_ary, name='msm_lam_kld_p2q') self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.q_params = [] self.q_params.extend(self.q_z_given_x.mlp_params) self.q_params.extend(self.q_hi_given_x_si.mlp_params) # Grab all of the "optimizable" parameters in "group 2" self.p_params = [self.p_z_mean, self.p_z_logvar] self.p_params.extend(self.p_hi_given_si.mlp_params) self.p_params.extend(self.p_sip1_given_si_hi.mlp_params) self.p_params.extend(self.p_s0_given_z.mlp_params) # Make a joint list of parameters group 1/2 self.joint_params = self.q_params + self.p_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \ self._construct_kld_costs(p=1.0) self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \ (self.lam_kld_p2q[0] * self.kld_z_p2q) self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \ (self.lam_kld_p2q[0] * self.kld_hi_p2q) self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi # now do l2 KLd costs self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \ self._construct_kld_costs(p=2.0) self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_z_p2q) self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_hi_p2q) self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi # compute joint l1/l2 KLd cost self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \ ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs) # compute "mean" (rather than per-input) costs self.kld_cost = T.mean(self.kld_costs) self.kl2_cost = T.mean(self.kl2_costs) self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs) 
################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self.nlli[-1] self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \ self.reg_cost ############################## # CONSTRUCT A PER-INPUT COST # ############################## self.obs_costs = self.nll_costs + self.kld_l1l2_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.q_updates = get_adam_updates(params=self.q_params, \ grads=self.joint_grads, alpha=self.lr_1, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.p_updates = get_adam_updates(params=self.p_params, \ grads=self.joint_grads, alpha=self.lr_2, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.joint_updates = OrderedDict() for k in self.q_updates: self.joint_updates[k] = self.q_updates[k] for k in self.p_updates: self.joint_updates[k] = self.p_updates[k] # add scan updates, which seem to be required for k in self.scan_updates: self.joint_updates[k] = self.scan_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling cost computer...") self.compute_raw_klds = self._construct_raw_klds() print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() 
print("Compiling open-loop model sampler...") self.sample_from_prior = self._construct_sample_from_prior() print("Compiling data-guided model sampler...") self.sample_from_input = self._construct_sample_from_input() return def set_sgd_params(self, lr_1=0.01, lr_2=0.01, \ mom_1=0.9, mom_2=0.999): """ Set learning rate and momentum parameter for all updates. """ zero_ary = np.zeros((1,)) # set learning rates new_lr_1 = zero_ary + lr_1 self.lr_1.set_value(to_fX(new_lr_1)) new_lr_2 = zero_ary + lr_2 self.lr_2.set_value(to_fX(new_lr_2)) # set momentums new_mom_1 = zero_ary + mom_1 self.mom_1.set_value(to_fX(new_mom_1)) new_mom_2 = zero_ary + mom_2 self.mom_2.set_value(to_fX(new_mom_2)) return def set_lam_nll(self, lam_nll=1.0): """ Set weight for controlling the influence of the data likelihood. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_nll self.lam_nll.set_value(to_fX(new_lam)) return def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0): """ Set the relative weight of various KL-divergences. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_kld_z self.lam_kld_z.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_q2p self.lam_kld_q2p.set_value(to_fX(new_lam)) new_lam = zero_ary + lam_kld_p2q self.lam_kld_p2q.set_value(to_fX(new_lam)) return def set_lam_l2w(self, lam_l2w=1e-3): """ Set the relative strength of l2 regularization on network params. """ zero_ary = np.zeros((1,)) new_lam = zero_ary + lam_l2w self.lam_l2w.set_value(to_fX(new_lam)) return def set_train_switch(self, switch_val=0.0): """ Set the switch for changing between training and sampling behavior. """ if (switch_val < 0.5): switch_val = 0.0 else: switch_val = 1.0 zero_ary = np.zeros((1,)) new_val = zero_ary + switch_val self.train_switch.set_value(to_fX(new_val)) return def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0): """ Set the weight for shaping penalty on conditional priors over zt. 
""" zero_ary = np.zeros((1,)) new_val = zero_ary + lam_kld_l1l2 self.lam_kld_l1l2.set_value(to_fX(new_val)) return def set_drop_rate(self, drop_rate=0.0): """ Set the weight for shaping penalty on conditional priors over zt. """ zero_ary = np.zeros((1,)) new_val = zero_ary + drop_rate self.drop_rate.set_value(to_fX(new_val)) return def _construct_zmuv_samples(self, xi, br): """ Construct the necessary (symbolic) samples for computing through this MultiStageModel for input (sybolic) matrix X. """ z_zmuv = self.rng.normal( \ size=(xi.shape[0]*br, self.z_dim), \ avg=0.0, std=1.0, dtype=theano.config.floatX) hi_zmuv = self.rng.normal( \ size=(self.ir_steps, xi.shape[0]*br, self.h_dim), \ avg=0.0, std=1.0, dtype=theano.config.floatX) return z_zmuv, hi_zmuv def _construct_nll_costs(self, si, xo): """ Construct the negative log-likelihood part of free energy. """ # average log-likelihood over the refinement sequence xh = self.obs_transform(si) if self.x_type == 'bernoulli': ll_costs = log_prob_bernoulli(xo, xh) else: ll_costs = log_prob_gaussian2(xo, xh, \ log_vars=self.bounded_logvar) nll_costs = -ll_costs return nll_costs def _construct_kld_costs(self, p=1.0): """ Construct the posterior KL-divergence part of cost to minimize. 
""" kld_hi_q2ps = [] kld_hi_p2qs = [] for i in range(self.ir_steps): kld_hi_q2p = self.kldi_q2p[i] kld_hi_p2q = self.kldi_p2q[i] kld_hi_q2ps.append(T.sum(kld_hi_q2p**p, \ axis=1, keepdims=True)) kld_hi_p2qs.append(T.sum(kld_hi_p2q**p, \ axis=1, keepdims=True)) # compute the batch-wise costs kld_hi_q2p = sum(kld_hi_q2ps) kld_hi_p2q = sum(kld_hi_p2qs) # construct KLd cost for the distributions over z kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar, \ self.p_z_mean, self.p_z_logvar) kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \ self.q_z_mean, self.q_z_logvar) kld_z_q2p = T.sum(kld_z_q2ps**p, axis=1, keepdims=True) kld_z_p2q = T.sum(kld_z_p2qs**p, axis=1, keepdims=True) return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q] def _construct_reg_costs(self): """ Construct the cost for low-level basic regularization. E.g. for applying l2 regularization to the network activations and parameters. """ param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params]) return param_reg_cost def _construct_train_joint(self): """ Construct theano function to train all networks jointly. """ # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() br = T.lscalar() # collect the outputs to return from this function outputs = [self.joint_cost, self.nll_cost, self.kld_cost, \ self.reg_cost, self.obs_costs] # compile the theano function _, hi_zmuv = self._construct_zmuv_samples(xi, br) func = theano.function(inputs=[ xi, xo, br ], \ outputs=outputs, \ givens={ self.x_in: xi.repeat(br, axis=0), \ self.x_out: xo.repeat(br, axis=0), \ self.hi_zmuv: hi_zmuv }, \ updates=self.joint_updates) return func def _construct_raw_klds(self): """ Construct function for computing KLd per latent dimension. 
""" # gather step-wise costs into a single list (init costs at the end) all_step_costs = [self.init_klds, self.kldi_q2p, self.kldi_p2q] # compile theano function for computing all relevant costs inputs = [self.x_in, self.x_out, self.hi_zmuv] cost_func = theano.function(inputs=inputs, outputs=all_step_costs, \ updates=self.scan_updates) def raw_kld_computer(XI, XO): hi_zmuv = to_fX( npr.randn(self.ir_steps, XI.shape[0], self.h_dim) ) _all_costs = cost_func(XI, XO, hi_zmuv) _init_klds = _all_costs[0] _kld_q2p = np.sum(np.mean(_all_costs[1], axis=1, keepdims=True), axis=0) _kld_p2q = np.sum(np.mean(_all_costs[2], axis=1, keepdims=True), axis=0) results = [_init_klds, _kld_q2p, _kld_p2q] return results return raw_kld_computer def _construct_compute_fe_terms(self): """ Construct a function for computing terms in variational free energy. """ # setup some symbolic variables for theano to deal with xi = T.matrix() xo = T.matrix() _, hi_zmuv = self._construct_zmuv_samples(xi, 1) # construct values to output nll = self.nlli[-1] kld = self.kld_z.flatten() + self.kld_hi_q2p.flatten() # compile theano function for a one-sample free-energy estimate fe_term_sample = theano.function(inputs=[ xi, xo ], \ outputs=[nll, kld], \ givens={self.x_in: xi, \ self.x_out: xo, \ self.hi_zmuv: hi_zmuv}, \ updates=self.scan_updates) # construct a wrapper function for multi-sample free-energy estimate def fe_term_estimator(XI, XO, sample_count): # compute a multi-sample estimate of variational free-energy nll_sum = np.zeros((XI.shape[0],)) kld_sum = np.zeros((XI.shape[0],)) for i in range(sample_count): result = fe_term_sample(XI, XO) nll_sum += result[0].ravel() kld_sum += result[1].ravel() mean_nll = nll_sum / float(sample_count) mean_kld = kld_sum / float(sample_count) return [mean_nll, mean_kld] return fe_term_estimator def _construct_sample_from_prior(self): """ Construct a function for drawing independent samples from the distribution generated by this MultiStageModel. 
This function returns the full sequence of "partially completed" examples. """ z_sym = T.matrix() x_sym = T.matrix() irs = self.ir_steps oputs = [self.obs_transform(self.s0)] oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)]) _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1) sample_func = theano.function(inputs=[z_sym, x_sym], outputs=oputs, \ givens={ self.z: z_sym, \ self.x_in: T.zeros_like(x_sym), \ self.x_out: T.zeros_like(x_sym), \ self.hi_zmuv: hi_zmuv }, \ updates=self.scan_updates) def prior_sampler(samp_count): x_samps = to_fX( np.zeros((samp_count, self.obs_dim)) ) old_switch = self.train_switch.get_value(borrow=False) # set model to generation mode self.set_train_switch(switch_val=0.0) z_samps = to_fX( npr.randn(samp_count, self.z_dim) ) model_samps = sample_func(z_samps, x_samps) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return model_samps return prior_sampler def _construct_sample_from_input(self): """ Construct a function for drawing samples from the distribution generated by this MultiStageModel, conditioned on some inputs to the initial encoder stage (i.e. self.q_z_given_x). This returns the full sequence of "partially completed" examples. """ xi = T.matrix() xo = T.matrix() irs = self.ir_steps oputs = [self.obs_transform(self.s0)] oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)]) _, hi_zmuv = self._construct_zmuv_samples(xi, 1) sample_func = theano.function(inputs=[xi, xo], outputs=oputs, \ givens={ self.x_in: xi, \ self.x_out: xo, \ self.hi_zmuv: hi_zmuv }, \ updates=self.scan_updates) def conditional_sampler(XI, XO=None, guided_decoding=False): XI = to_fX( XI ) if XO is None: XO = XI XO = to_fX( XO ) # set model to desired generation mode old_switch = self.train_switch.get_value(borrow=False) if guided_decoding: # take samples from guide policies (i.e. 
variational q) self.set_train_switch(switch_val=1.0) else: # take samples from model's generative policy self.set_train_switch(switch_val=0.0) # draw guided/unguided conditional samples model_samps = sample_func(XI, XO) # set model back to either training or generation mode self.set_train_switch(switch_val=old_switch) return model_samps return conditional_sampler
def __init__(self, rng=None, x_out=None, \
        p_z_given_x=None, \
        p_x_given_z=None, \
        params=None, \
        shared_param_dicts=None):
    """
    Build the symbolic walk-out graph, its costs, gradients and ADAM
    updates, then compile the helper functions for this model.

    Parameters:
        rng: object providing `randint`; used once to seed this model's
             RandStream. Although the default is None, the first statement
             calls `rng.randint`, so a real generator must be supplied.
        x_out: symbolic variable used as the initial state `x0` of the
               walk-out loop.
        p_z_given_x: network whose `apply(x, do_samples=False)` result is
                     unpacked as a (mean, logvar) pair for z.
        p_x_given_z: network whose `apply(z, do_samples=False)` result is
                     unpacked as a (mean, logvar) pair for x.
        params: dict with required keys 'x_dim', 'z_dim', 'walkout_steps'
                and 'x_type' ('bernoulli' or 'gaussian'), and the optional
                key 'x_transform' ('sigmoid' or 'none').
        shared_param_dicts: None to create fresh model-owned parameters, or
                            a dict holding an existing 'obs_logvar' shared
                            variable to reuse.

    NOTE(review): several attributes read below are never assigned inside
    this constructor (each is flagged where it is used). Confirm that they
    are provided elsewhere in the class before relying on this code.
    """
    # setup a rng for this WalkoutModel
    self.rng = RandStream(rng.randint(100000))
    # grab the user-provided parameters
    self.params = params
    self.x_dim = self.params['x_dim']
    self.z_dim = self.params['z_dim']
    self.walkout_steps = self.params['walkout_steps']
    self.x_type = self.params['x_type']
    self.shared_param_dicts = shared_param_dicts
    # choose the squashing applied to predicted x means; sigmoid is the
    # default when no 'x_transform' entry is given
    if 'x_transform' in self.params:
        assert((self.params['x_transform'] == 'sigmoid') or \
                (self.params['x_transform'] == 'none'))
        if self.params['x_transform'] == 'sigmoid':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.x_transform = lambda x: x
    else:
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    # bernoulli observations always use the sigmoid transform, overriding
    # any 'x_transform' setting chosen above
    if self.x_type == 'bernoulli':
        self.x_transform = lambda x: T.nnet.sigmoid(x)
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
    # NOTE(review): self.step_type is not assigned anywhere in this
    # constructor before this check -- confirm it is defined elsewhere
    # (e.g. as a class attribute), otherwise this raises AttributeError.
    assert((self.step_type == 'add') or (self.step_type == 'jump'))

    # grab handles to the relevant networks
    self.p_z_given_x = p_z_given_x
    self.p_x_given_z = p_x_given_z

    # record the symbolic variables that will provide inputs to the
    # computation graph created for this WalkoutModel
    self.x_out = x_out           # target output for generation
    self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

    if self.shared_param_dicts is None:
        # initialize the parameters "owned" by this model
        zero_ary = to_fX( np.zeros((1,)) )
        self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
        # tanh squashing keeps the effective log-variance within (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
        self.shared_param_dicts = {}
        self.shared_param_dicts['obs_logvar'] = self.obs_logvar
    else:
        # grab the parameters required by this model from a given dict
        self.obs_logvar = self.shared_param_dicts['obs_logvar']
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

    ###############################################################
    # Setup the forwards (i.e. training) walk-out loop using scan #
    ###############################################################
    def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
        # Arguments arrive in scan order: the two noise sequences first,
        # then the two recurrent states (previous x and previous z).
        # The incoming zi_fw is overwritten immediately below.
        # get samples of next zi, according to the forwards model
        zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)
        # check reverse direction probability p(xi_fw | zi_fw)
        xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_bw_mean = self.x_transform(xi_bw_mean)
        # NOTE(review): elsewhere in this file the result of
        # log_prob_gaussian2 is negated to obtain an NLL; here it is stored
        # un-negated under an "nll_" name -- confirm the intended sign.
        nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                log_vars=xi_bw_logvar, mask=None)
        nll_xi_bw = nll_xi_bw.flatten()
        # get samples of next xi, according to the forwards model
        xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                do_samples=False)
        xi_fw_mean = self.x_transform(xi_fw_mean)
        xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)
        # check reverse direction probability p(zi_fw | xi_fw)
        zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                do_samples=False)
        nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                log_vars=zi_bw_logvar, mask=None)
        nll_zi_bw = nll_zi_bw.flatten()
        # each loop iteration produces the following values:
        #   xi_fw: xi generated from zi by forwards walk
        #   zi_fw: zi generated from xi by forwards walk
        #   xi_fw_mean: ----
        #   xi_fw_logvar: ----
        #   zi_fw_mean: ----
        #   zi_fw_logvar: ----
        #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
        #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
        return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

    # initialize states for x/z
    self.x0 = self.x_out
    self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
    # setup initial values to pass to scan op
    # (only the first two outputs are recurrent; None marks the rest as
    # plain per-step outputs)
    outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
    # NOTE(review): self.xi_zmuv is not assigned in this constructor (only
    # self.zi_zmuv is created above) -- confirm it is defined elsewhere,
    # otherwise this raises AttributeError.
    sequences_init = [self.xi_zmuv, self.zi_zmuv]
    # apply scan op for the sequential imputation loop
    self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
            outputs_info=outputs_init, \
            sequences=sequences_init)

    # grab results of the scan op. all values are computed for each step
    self.xi = self.scan_results[0]
    self.zi = self.scan_results[1]
    self.xi_fw_mean = self.scan_results[2]
    self.xi_fw_logvar = self.scan_results[3]
    self.zi_fw_mean = self.scan_results[4]
    self.zi_fw_logvar = self.scan_results[5]
    self.nll_xi_bw = self.scan_results[6]
    self.nll_zi_bw = self.scan_results[7]

    ######################################################################
    # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
    ######################################################################

    # shared var learning rate for generator and inferencer
    zero_ary = to_fX( np.zeros((1,)) )
    self.lr = theano.shared(value=zero_ary, name='srr_lr')
    # shared var momentum parameters for ADAM optimization
    self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
    # init parameters for controlling learning dynamics
    self.set_sgd_params()
    # init shared vars for weighting prior kld against reconstruction
    self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
    self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
    self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
    self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
    self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
    self.set_lam_l2w(1e-5)

    # grab all of the "optimizable" parameters from the base networks
    # NOTE(review): self.s0, self.step_scales, self.p_zi_given_xi,
    # self.p_sip1_given_zi, self.p_x_given_si and self.q_zi_given_xi are
    # not assigned in this constructor (it only stores p_z_given_x and
    # p_x_given_z) -- confirm where they come from.
    self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
    self.joint_params.extend(self.p_zi_given_xi.mlp_params)
    self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
    self.joint_params.extend(self.p_x_given_si.mlp_params)
    self.joint_params.extend(self.q_zi_given_xi.mlp_params)

    #################################
    # CONSTRUCT THE KLD-BASED COSTS #
    #################################
    self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
    self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
            (self.lam_kld_q[0] * self.kld_q) + \
            (self.lam_kld_g[0] * self.kld_g) + \
            (self.lam_kld_s[0] * self.kld_s)
    self.kld_cost = T.mean(self.kld_costs)
    #################################
    # CONSTRUCT THE NLL-BASED COSTS #
    #################################
    # NOTE(review): self.nlli is not assigned in this constructor (the scan
    # results above are stored as nll_xi_bw / nll_zi_bw) -- confirm.
    self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
    self.nll_cost = T.mean(self.nll_costs)
    self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
    self.nll_bound = T.mean(self.nll_bounds)
    ########################################
    # CONSTRUCT THE REST OF THE JOINT COST #
    ########################################
    param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
    ##############################
    # CONSTRUCT A PER-TRIAL COST #
    ##############################
    self.obs_costs = self.nll_costs + self.kld_costs

    # Get the gradient of the joint cost for all optimizable parameters
    print("Computing gradients of self.joint_cost...")
    self.joint_grads = OrderedDict()
    grad_list = T.grad(self.joint_cost, self.joint_params)
    for i, p in enumerate(self.joint_params):
        self.joint_grads[p] = grad_list[i]

    # Construct the updates for the generator and inferencer networks
    self.joint_updates = get_adam_updates(params=self.joint_params, \
            grads=self.joint_grads, alpha=self.lr, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
    # fold the updates returned by theano.scan into the training updates
    for k, v in self.scan_updates.items():
        self.joint_updates[k] = v

    # Construct theano functions for training and diagnostic computations
    print("Compiling cost computer...")
    self.compute_raw_costs = self._construct_raw_costs()
    print("Compiling training function...")
    self.train_joint = self._construct_train_joint()
    print("Compiling free-energy sampler...")
    self.compute_fe_terms = self._construct_compute_fe_terms()
    print("Compiling sequence sampler...")
    self.sequence_sampler = self._construct_sequence_sampler()
    # make easy access points for some interesting parameters
    #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
    return
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    log-variance vector (i.e. diagonal matrix) for a Gaussian posterior.

    The network is a shared stack of layers feeding two separate stacks: one
    predicting the mean ("mu") and one predicting the log-variance ("sigma").

    Parameters:
        rng: a numpy.random RandomState object (required; it is used to seed
             this net's random stream and is passed on to every layer)
        Xd: symbolic input matrix for inputs
        params: a dict of parameters describing the desired network:
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
            -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
            init_scale: scaling factor for hidden layer weights
            build_theano_funcs: whether to compile the sampling functions
        shared_param_dicts: parameters for the MLP controlled by this InfNet;
                            when given, this net is built as a "clone" that
                            reuses those parameters
    """

    def __init__(self, rng=None, Xd=None, params=None,
                 shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.build_theano_funcs = params.get('build_theano_funcs', True)
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        # stored for reference only; nothing else in this class reads it
        self.sigma_init_scale = params.get('sigma_init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with
        # all of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So,
        # the depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        # Build the three layer stacks. The shared stack reads the raw
        # input; the mu and sigma stacks both read the shared stack output.
        self.shared_layers = self._build_layer_stack(
            rng, self.shared_config, 'shared', "share_layer_{0:d}",
            self.Xd, takes_raw_input=True)
        self.mu_layers = self._build_layer_stack(
            rng, self.mu_config, 'mu', "mu_layer_{0:d}",
            self.shared_layers[-1].output)
        # in-bound weights for the final logvar predictions start at 0
        self.sigma_layers = self._build_layer_stack(
            rng, self.sigma_config, 'sigma', "sigma_layer_{0:d}",
            self.shared_layers[-1].output, zero_last_layer=True)
        # Create a shared parameter for rescaling posterior "sigmas" to
        # allow control over the velocity of the markov chain generated by
        # repeated cycling through the INF -> GEN loop.
        if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
            # we use a hack-ish check to remain compatible with loading
            # models that were saved before the addition of the
            # sigma_scale param.
            zero_ary = to_fX(np.zeros((1,)))
            self.sigma_scale = theano.shared(value=zero_ary)
            new_dict = {'sigma_scale': self.sigma_scale}
            self.shared_param_dicts['sigma'].append(new_dict)
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was
            # made after adding the sigma_scale param, so use its
            # sigma_scale
            self.sigma_scale = \
                self.shared_param_dicts['sigma'][-1]['sigma_scale']
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.shared_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.mu_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.sigma_layers:
            self.mlp_params.extend(layer.params)
        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mean, self.output_logvar, self.output_samples = \
            self.apply(Xd)
        self.output = self.output_samples
        self.out_dim = self.sigma_layers[-1].out_dim
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd],
                                                  outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        ########################################################
        # CONSTRUCT FUNCTIONS FOR RICA PRETRAINING INPUT LAYER #
        ########################################################
        self.rica_func = None
        self.W_rica = self.shared_layers[0].W
        return

    def _build_layer_stack(self, rng, config, group, name_fmt, stack_input,
                           takes_raw_input=False, zero_last_layer=False):
        """
        Build one stack of HiddenLayers from a list of layer descriptions.

        Parameters:
            rng: random generator handed to every HiddenLayer
            config: list of layer descriptions; consecutive entries give
                    the (input, output) description of each layer. An entry
                    is either a dimension or a (dim, pool_size) list/tuple.
            group: key into self.shared_param_dicts for this stack
                   ('shared', 'mu' or 'sigma')
            name_fmt: format string producing the layer name from its index
            stack_input: symbolic input fed to the first layer
            takes_raw_input: if True, the first layer uses the "visible"
                             drop rate and input noise (and no bias noise)
            zero_last_layer: if True, the last layer gets a zero W_scale

        Returns:
            The list of constructed layers, in order. For a non-clone net,
            each layer's parameter dict is also appended to
            self.shared_param_dicts[group].
        """
        # Materialize the pairs: the count is needed below, and zip() is
        # not guaranteed to return something that supports len().
        layer_def_pairs = list(zip(config[:-1], config[1:]))
        last_index = len(layer_def_pairs) - 1
        layers = []
        next_input = stack_input
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if takes_raw_input and (layer_num == 0):
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_layer and (layer_num == last_index):
                i_scale = 0.0 * i_scale
            layer_kwargs = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=name_fmt.format(layer_num), W_scale=i_scale)
            if self.is_clone:
                # Initialize the layer with some shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                layer_kwargs.update(
                    W=init_params['W'], b=init_params['b'],
                    b_in=init_params['b_in'], s_in=init_params['s_in'])
            new_layer = HiddenLayer(**layer_kwargs)
            if not self.is_clone:
                # Record the freshly created parameters for later cloning
                self.shared_param_dicts[group].append(
                    new_layer.shared_param_dicts)
            layers.append(new_layer)
            next_input = new_layer.output
        return layers

    @staticmethod
    def _propagate(layers, X):
        """
        Pass X through each layer in turn, feeding the third value returned
        by layer.apply into the next layer. Returns the list of all
        activations, starting with X itself.
        """
        acts = [X]
        for layer in layers:
            _, _, layer_acts = layer.apply(acts[-1])
            acts.append(layer_acts)
        return acts

    def apply(self, X, do_samples=True):
        """
        Pass input X through this InfNet and get the resulting Gaussian
        conditional distribution.

        Returns a list [mean, logvar], with a third entry holding samples
        drawn from that Gaussian when do_samples is True.
        """
        # pass activations through the shared layers
        shared_acts = self._propagate(self.shared_layers, X)
        # pass activations through the mean estimating layers, then
        # re-apply the last layer to its input and keep the first returned
        # value instead (use linear output at last layer)
        mu_acts = self._propagate(self.mu_layers, shared_acts[-1])
        output_mean, _, _ = self.mu_layers[-1].apply(mu_acts[-2])
        # same treatment for the logvar estimating layers
        sigma_acts = self._propagate(self.sigma_layers, shared_acts[-1])
        output_logvar, _, _ = self.sigma_layers[-1].apply(sigma_acts[-2])
        # wrap them up for easy returnage
        result = [output_mean, output_logvar]
        if do_samples:
            zmuv_noise = self.rng.normal(size=output_mean.shape, avg=0.0,
                                         std=1.0,
                                         dtype=theano.config.floatX)
            output_samples = output_mean + \
                ((self.sigma_scale[0] * T.exp(0.5 * output_logvar)) *
                 zmuv_noise)
            result.append(output_samples)
        return result

    def apply_shared(self, X):
        """
        Pass input X through this InfNet's shared layers.
        """
        return self._propagate(self.shared_layers, X)[-1]

    def train_rica(self, X, lr, lam):
        """
        Run one RICA pretraining update on the input layer weights.

        The training function is compiled lazily on the first call. Returns
        [rica_cost, recon_cost, spars_cost] for the batch X, using learning
        rate lr and l1 sparsity weight lam.
        """
        if self.rica_func is None:
            l_rate = T.scalar()
            lam_l1 = T.scalar()
            X_in = T.matrix('in_X_in')
            # perturb the weights with a little noise for each evaluation
            W_in = self.W_rica + self.rng.normal(
                size=self.W_rica.shape,
                avg=0.0, std=0.01, dtype=theano.config.floatX)
            X_enc = X_in
            H_rec = T.dot(X_enc, W_in)
            X_rec = T.dot(H_rec, W_in.T)
            recon_cost = T.sum((X_enc - X_rec)**2.0) / X_enc.shape[0]
            spars_cost = lam_l1 * (T.sum(soft_abs(H_rec)) / H_rec.shape[0])
            rica_cost = recon_cost + spars_cost
            dW = T.grad(rica_cost, self.W_rica)
            rica_updates = {self.W_rica: self.W_rica - (l_rate * dW)}
            rica_outputs = [rica_cost, recon_cost, spars_cost]
            self.rica_func = theano.function([X_in, l_rate, lam_l1],
                                             outputs=rica_outputs,
                                             updates=rica_updates)
        outputs = self.rica_func(X, lr, lam)
        return outputs

    def set_sigma_scale(self, sigma_scale=1.0):
        """
        Set the posterior sigma rescaling shared parameter to some value.
        """
        new_scale = np.zeros((1,)) + sigma_scale
        self.sigma_scale.set_value(to_fX(new_scale))
        return

    def set_bias_noise(self, bias_noise=0.0):
        """
        Set the bias noise in all hidden layers to the given value.
        """
        new_bn = to_fX(np.zeros((1,)) + bias_noise)
        for stack in (self.shared_layers, self.mu_layers, self.sigma_layers):
            for layer in stack:
                layer.bias_noise.set_value(new_bn)
        return

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function([self.Xd], outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0, b_std=1e-2):
        """
        Initialize the biases in all hidden layers to b_init plus Gaussian
        noise with standard deviation b_std. The final layers of the mu and
        sigma stacks are left untouched.
        """
        # Same visiting order as before (shared, mu, sigma) so the sequence
        # of random draws is unchanged.
        target_layers = list(self.shared_layers) + \
            list(self.mu_layers[:-1]) + list(self.sigma_layers[:-1])
        for layer in target_layers:
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            b_vec = b_vec + (b_std * npr.randn(*b_vec.shape))
            layer.b.set_value(to_fX(b_vec))
        return

    def shared_param_clone(self, rng=None, Xd=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = InfNet(rng=rng, Xd=Xd, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net

    def forked_param_clone(self, rng=None, Xd=None):
        """
        Return a clone of this network, with forked copies of the current
        shared parameters of this InfNet, with different symbolic inputs too.
        """
        # shared param dicts is nested like: dict of list of dicts
        # i.e., spd[k] is a list and spd[k][i] is a dict
        new_spds = {}
        for group in self.shared_param_dicts:
            new_spds[group] = []
            for old_dict in self.shared_param_dicts[group]:
                forked = {}
                for key in old_dict:
                    # copy the current value into a brand new shared var
                    forked[key] = theano.shared(
                        value=old_dict[key].get_value(borrow=False))
                new_spds[group].append(forked)
        clone_net = InfNet(rng=rng, Xd=Xd, params=self.params,
                           shared_param_dicts=new_spds)
        return clone_net

    def _numpy_param_dicts(self):
        """
        Return a copy of self.shared_param_dicts in which every shared
        variable is replaced by the value from its get_value(borrow=False).
        """
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {}
                for key in shared_dict:
                    numpy_dict[key] = shared_dict[key].get_value(borrow=False)
                numpy_param_dicts[layer_group].append(numpy_dict)
        return numpy_param_dicts

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later. We'll pickle everything required to create a clone of
        this model given the pickle and the rng/Xd params to the cloning
        function: "InfNet.shared_param_clone()".

        Two objects are written in order: self.params, then the numpy
        version of self.shared_param_dicts.
        """
        assert(not (f_name is None))
        # the with-block guarantees the handle is closed even if a dump fails
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(self._numpy_param_dicts(), f_handle, protocol=-1)
        return

    def save_to_dict(self):
        """
        Dump important stuff to a dict that can reboot the model.

        Returns a dict with keys 'params' (self.params itself) and
        'numpy_param_dicts' (the numpy version of self.shared_param_dicts).
        """
        model_dict = {}
        # dump the dict self.params, which just holds "simple" python values
        model_dict['params'] = self.params
        # dump the numpy version of self.shared_param_dicts
        model_dict['numpy_param_dicts'] = self._numpy_param_dicts()
        return model_dict
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputting observable data
        Xc: symbolic input matrix for inputting control data
        Xm: symbolic input matrix for a mask on which values to take
            from Xc and which to take from Xd
        prior_sigma: standard deviation of isotropic Gaussian prior that our
                     inferred posteriors will be penalized for deviating from.
        params: a dict of parameters describing the desired ensemble:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
        shared_param_dicts: parameters for the MLP controlled by this InfNet
    """
    def __init__(self,
            rng=None,
            Xd=None,
            Xc=None,
            Xm=None,
            prior_sigma=None,
            params=None,
            shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        # Construct input by combining data input and control input, taking
        # unmasked values from data input and others from the control input
        merged_input = ((1.0 - self.Xm) * self.Xd) + (self.Xm * self.Xc)
        self.shared_layers = self._build_layer_stack(
            rng, 'shared', "share_layer_{0:d}", self.shared_config,
            merged_input, takes_raw_input=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        # Take input from the output of the shared network
        self.mu_layers = self._build_layer_stack(
            rng, 'mu', "mu_layer_{0:d}", self.mu_config,
            self.shared_layers[-1].output)
        ########################################
        # Initialize the sigma part of network #
        ########################################
        # Take input from the output of the shared network
        self.sigma_layers = self._build_layer_stack(
            rng, 'sigma', "sigma_layer_{0:d}", self.sigma_config,
            self.shared_layers[-1].output)
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self._all_layers():
            self.mlp_params.extend(layer.params)
        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mu = self.mu_layers[-1].noisy_linear
        self.output_logvar = self.sigma_layers[-1].noisy_linear
        self.output_sigma = T.exp(0.5 * self.output_logvar)
        # We'll also construct an output containing a single samples from each
        # of the distributions represented by the rows of self.output_mu and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function([self.Xd, self.Xc, self.Xm],
                outputs=self.output_mu)
        return

    def _build_layer_stack(self, rng, group, name_format, config,
                           stack_input, takes_raw_input=False):
        """
        Build one chain of HiddenLayer objects and return it as a list.

        rng: the RandomState handed to every HiddenLayer
        group: key into self.shared_param_dicts ('shared', 'mu' or 'sigma')
        name_format: format string producing a layer name from its index
        config: list of layer descriptions; consecutive entries describe
                the input/output side of each layer
        stack_input: symbolic input of the first layer in the chain
        takes_raw_input: if True, the first layer uses vis_drop and
                         input_noise and gets no bias noise; every other
                         layer uses hid_drop and bias_noise

        For a non-clone network the new W/b of each layer are appended to
        self.shared_param_dicts[group]; for a clone they are read from it.
        """
        layers = []
        next_input = stack_input
        layer_def_pairs = zip(config[:-1], config[1:])
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if takes_raw_input and (layer_num == 0):
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            layer_args = dict(rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise, in_dim=in_dim, out_dim=out_dim,
                    name=name_format.format(layer_num),
                    W_scale=self.init_scale)
            if self.is_clone:
                # Initialize a layer with some shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                layer_args['W'] = init_params['W']
                layer_args['b'] = init_params['b']
                new_layer = HiddenLayer(**layer_args)
            else:
                # Initialize a layer with new parameters, and record them
                new_layer = HiddenLayer(**layer_args)
                self.shared_param_dicts[group].append(
                        {'W': new_layer.W, 'b': new_layer.b})
            layers.append(new_layer)
            next_input = new_layer.output
        return layers

    def _all_layers(self):
        """
        Return every layer of the net, ordered shared, then mu, then sigma.
        """
        return (list(self.shared_layers) + list(self.mu_layers) +
                list(self.sigma_layers))

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in each net.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self._all_layers()]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mu and self.output_sigma.
        """
        noise = self.rng.normal(size=self.output_sigma.shape, avg=0.0,
                std=1.0, dtype=theano.config.floatX)
        post_samples = self.output_mu + (self.output_sigma * noise)
        return post_samples

    def _construct_kld_cost(self):
        """
        Compute (analytically) the KL divergence between each approximate
        posterior encoded by self.mu/self.sigma and the isotropic Gaussian
        distribution with mean 0 and standard deviation self.prior_sigma.

        The result keeps one entry per row (summed over axis 1, keepdims).
        """
        prior_sigma_sq = self.prior_sigma**2.0
        prior_log_sigma_sq = np.log(prior_sigma_sq)
        kld_cost = 0.5 * T.sum(((self.output_mu**2.0 / prior_sigma_sq) +
                (T.exp(self.output_logvar) / prior_sigma_sq) -
                (self.output_logvar - prior_log_sigma_sq) - 1.0),
                axis=1, keepdims=True)
        return kld_cost

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function([self.Xd, self.Xc, self.Xm],
                outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all hidden layers to some constant.

        The last layer of the mu and sigma stacks is left untouched.
        """
        layer_groups = (self.shared_layers, self.mu_layers[:-1],
                        self.sigma_layers[:-1])
        for layer_group in layer_groups:
            for layer in layer_group:
                b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
                layer.b.set_value(b_vec)
        return

    def shared_param_clone(self, rng=None, Xd=None, Xc=None, Xm=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                prior_sigma=self.prior_sigma, params=self.params,
                shared_param_dicts=self.shared_param_dicts)
        return clone_net
class HiddenLayer(object):
    """
    A single fully-connected ('fc') or convolutional ('conv') layer.

    The layer is configured by the dict layer_description, which must
    provide 'layer_type', 'in_chans', 'out_chans' and 'activation', and may
    provide 'filt_dim', 'conv_stride', 'apply_bn', 'drop_rate',
    'shape_func_in' and 'shape_func_out'. Parameters W, b, b_in and s_in
    that are passed in are used as given; missing ones are created.
    """
    def __init__(self, rng, layer_description,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        # parse options from layer_description
        assert 'layer_type' in layer_description, \
            "layer_description must provide layer_type"
        assert layer_description['layer_type'] in ('fc', 'conv'), \
            "layer_type must be fc or conv"
        self.layer_description = layer_description
        # required entries
        self.layer_type = layer_description['layer_type']
        self.in_chans = layer_description['in_chans']
        self.out_chans = layer_description['out_chans']
        self.activation = layer_description['activation']
        # optional entries
        self.filt_dim = layer_description.get('filt_dim', None)
        self.conv_stride = layer_description.get('conv_stride', None)
        self.apply_bn = layer_description.get('apply_bn', False)
        self.drop_rate = layer_description.get('drop_rate', 0.0)
        self.shape_func_in = layer_description.get('shape_func_in', None)
        self.shape_func_out = layer_description.get('shape_func_out', None)
        # setup additional params
        self.rng = RandStream(rng.randint(1000000))
        self.W_scale = W_scale
        self.name = name
        # pick the parameter initializer that matches the layer type
        if self.layer_type == 'fc':
            make_params = self._init_fc_params
        else:
            make_params = self._init_conv_params
        self.W, self.b, self.b_in, self.s_in = \
            make_params(W=W, b=b, b_in=b_in, s_in=s_in)
        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = {
            'W': self.W,
            'b': self.b,
            'b_in': self.b_in,
            's_in': self.s_in}
        # Layer construction complete...
        return

    def _zeros_shared(self, name_format):
        """
        Return a shared zero vector with one entry per output channel,
        named by applying name_format to this layer's name.
        """
        zeros = np.zeros((self.out_chans,), dtype=theano.config.floatX)
        return theano.shared(value=zeros, name=name_format.format(self.name))

    def _init_fc_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Create whichever of W, b, b_in, s_in were not supplied, for
        feedforward through a fully-connected hidden layer.
        """
        if W is None:
            # W_scale == 'xg' selects glorot_matrix; anything else is used
            # as the gain of ortho_matrix
            weight_shape = (self.in_chans, self.out_chans)
            if self.W_scale == 'xg':
                weights = glorot_matrix(weight_shape)
            else:
                weights = ortho_matrix(shape=weight_shape, gain=self.W_scale)
            weights = weights.astype(theano.config.floatX)
            W = theano.shared(value=weights, name="{0:s}_W".format(self.name))
        if b is None:
            b = self._zeros_shared("{0:s}_b")
        # setup scale and bias params for after batch normalization
        if b_in is None:
            # batch normalization reshifts are initialized to zero
            b_in = self._zeros_shared("{0:s}_b_in")
        if s_in is None:
            # batch normalization rescales are initialized to zero
            s_in = self._zeros_shared("{0:s}_s_in")
        return W, b, b_in, s_in

    def _init_conv_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Create whichever of W, b, b_in, s_in were not supplied, for
        feedforward through a convolutional hidden layer.
        """
        float_type = theano.config.floatX
        if W is None:
            # square filters: (out_chans, in_chans, filt_dim, filt_dim)
            filter_shape = (self.out_chans, self.in_chans,
                            self.filt_dim, self.filt_dim)
            filters = npr.normal(0.0, self.W_scale*0.02, filter_shape)
            W = theano.shared(value=filters.astype(float_type),
                              name="{0:s}_W".format(self.name))
        if b is None:
            biases = npr.normal(0.0, 0.01, (self.out_chans,))
            b = theano.shared(value=biases.astype(float_type),
                              name="{0:s}_b".format(self.name))
        # setup scale and bias params for after batch normalization
        if b_in is None:
            # batch normalization reshifts are initialized to zero
            b_in = self._zeros_shared("{0:s}_b_in")
        if s_in is None:
            # batch normalization rescales are initialized to zero
            s_in = self._zeros_shared("{0:s}_s_in")
        return W, b, b_in, s_in

    def apply(self, input, use_drop=False):
        """
        Apply feedforward to this input.

        Returns the pair (final_output, linear_output): the activation
        output and the pre-activation output (after batch normalization,
        when enabled), both passed through shape_func_out when it is set.
        """
        # Reshape input if a reshape command was provided
        if self.shape_func_in is not None:
            input = self.shape_func_in(input)
        # Apply masking noise to the input (if desired)
        if use_drop:
            input = self._drop_from_input(input, self.drop_rate)
        if self.layer_type == 'fc':
            # Feedforward through fully-connected layer
            linear_output = T.dot(input, self.W) + self.b
        elif self.layer_type == 'conv':
            # Feedforward through convolutional layer, with adjustable stride
            bm = int((self.filt_dim - 1) / 2)  # use "same" mode convolutions
            padding = (bm, bm)
            stride = self.conv_stride
            if stride == 'double':
                linear_output = dnn_conv(input, self.W, subsample=(2, 2),
                                         border_mode=padding)
            elif stride == 'single':
                linear_output = dnn_conv(input, self.W, subsample=(1, 1),
                                         border_mode=padding)
            elif stride == 'half':
                linear_output = deconv(input, self.W, subsample=(2, 2),
                                       border_mode=padding)
            else:
                assert False, "Unknown stride type!"
            # broadcast the per-channel bias over the other three axes
            linear_output = linear_output + \
                self.b.dimshuffle('x', 0, 'x', 'x')
        else:
            assert False, "Unknown layer type!"
        # Apply batch normalization if desired
        if self.apply_bn:
            linear_output = batchnorm(linear_output, rescale=self.s_in,
                                      reshift=self.b_in, u=None, s=None)
        # Apply activation function
        final_output = self.activation(linear_output)
        # Reshape output if a reshape command was provided
        if self.shape_func_out is not None:
            linear_output = self.shape_func_out(linear_output)
            final_output = self.shape_func_out(final_output)
        return final_output, linear_output

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # elements whose uniform draw exceeds p are kept
        uniform_draws = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                         dtype=theano.config.floatX)
        keep_mask = uniform_draws > p
        # get a scaling factor to keep expectations fixed after droppage
        rescale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        return rescale * input * keep_mask
def __init__(self,
        rng=None,
        Xd=None,
        Xc=None,
        Xm=None,
        prior_sigma=None,
        params=None,
        shared_param_dicts=None):
    """
    Build the shared, mu and sigma layer stacks and the derived outputs.

    rng: random state providing randint(); also handed to every HiddenLayer
    Xd, Xc, Xm: symbolic data input, control input and mask; the network
        input is ((1 - Xm) * Xd) + (Xm * Xc)
    prior_sigma: stored on the instance before the KL cost is constructed
    params: dict with 'lam_l2a', 'shared_config', 'mu_config',
        'sigma_config' and the optional keys 'vis_drop', 'hid_drop',
        'input_noise', 'bias_noise' (default 0.0), 'init_scale'
        (default 1.0) and 'activation' (default relu_actfun)
    shared_param_dicts: when given, this net is a clone and takes each
        layer's W/b from it; otherwise new parameters are created and
        recorded in self.shared_param_dicts
    """
    # Setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xd = Xd
    self.Xc = Xc
    self.Xm = Xm
    self.prior_sigma = prior_sigma
    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    self.lam_l2a = params['lam_l2a']
    self.vis_drop = params.get('vis_drop', 0.0)
    self.hid_drop = params.get('hid_drop', 0.0)
    self.input_noise = params.get('input_noise', 0.0)
    self.bias_noise = params.get('bias_noise', 0.0)
    self.init_scale = params.get('init_scale', 1.0)
    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of an inference network, with all
    # of the network parameters shared between clones.
    self.is_clone = shared_param_dicts is not None
    if self.is_clone:
        # layer parameters are looked up in the given param dict
        self.shared_param_dicts = shared_param_dicts
    else:
        # make a dict for referring to the parameters of each layer
        self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
    # Get the configuration/prototype for this network. The config is a
    # list of layer descriptions, including a description for the input
    # layer, which is typically just the dimension of the inputs. So, the
    # depth of the mlp is one less than the number of layer configs.
    self.shared_config = params['shared_config']
    self.mu_config = params['mu_config']
    self.sigma_config = params['sigma_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun

    def split_def(layer_def):
        # A list/tuple description is (dim, pool_size); a bare value is
        # just the dim, with no pooling.
        if isinstance(layer_def, (list, tuple)):
            return layer_def[0], layer_def[1]
        return layer_def, 0

    def build_stack(layers, group, name_format, config, stack_input,
                    takes_raw_input):
        # Append one HiddenLayer per consecutive pair of layer
        # descriptions to `layers`, chaining each layer's output into
        # the next layer's input.
        next_input = stack_input
        for layer_num in range(len(config) - 1):
            in_def = config[layer_num]
            out_def = config[layer_num + 1]
            if isinstance(in_def, (list, tuple)):
                in_dim = in_def[0]
            else:
                in_dim = in_def
            out_dim, pool_size = split_def(out_def)
            # Select the appropriate noise to add to this layer
            if takes_raw_input and (layer_num == 0):
                d_rate, i_noise, b_noise = \
                    self.vis_drop, self.input_noise, 0.0
            else:
                d_rate, i_noise, b_noise = \
                    self.hid_drop, 0.0, self.bias_noise
            l_name = name_format.format(layer_num)
            if not self.is_clone:
                # Initialize a layer with new parameters
                new_layer = HiddenLayer(rng=rng, input=next_input,
                        activation=self.activation, pool_size=pool_size,
                        drop_rate=d_rate, input_noise=i_noise,
                        bias_noise=b_noise,
                        in_dim=in_dim, out_dim=out_dim,
                        name=l_name, W_scale=self.init_scale)
                layers.append(new_layer)
                self.shared_param_dicts[group].append(
                        {'W': new_layer.W, 'b': new_layer.b})
            else:
                # Initialize a layer with some shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                new_layer = HiddenLayer(rng=rng, input=next_input,
                        activation=self.activation, pool_size=pool_size,
                        drop_rate=d_rate, input_noise=i_noise,
                        bias_noise=b_noise,
                        in_dim=in_dim, out_dim=out_dim,
                        W=init_params['W'], b=init_params['b'],
                        name=l_name, W_scale=self.init_scale)
                layers.append(new_layer)
            next_input = layers[-1].output

    #########################################
    # Initialize the shared part of network #
    #########################################
    # Construct input by combining data input and control input, taking
    # unmasked values from data input and others from the control input
    self.shared_layers = []
    build_stack(self.shared_layers, 'shared', "share_layer_{0:d}",
                self.shared_config,
                ((1.0 - self.Xm) * self.Xd) + (self.Xm * self.Xc),
                True)
    #####################################
    # Initialize the mu part of network #
    #####################################
    # Take input from the output of the shared network
    self.mu_layers = []
    build_stack(self.mu_layers, 'mu', "mu_layer_{0:d}",
                self.mu_config, self.shared_layers[-1].output, False)
    ########################################
    # Initialize the sigma part of network #
    ########################################
    # Take input from the output of the shared network
    self.sigma_layers = []
    build_stack(self.sigma_layers, 'sigma', "sigma_layer_{0:d}",
                self.sigma_config, self.shared_layers[-1].output, False)
    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for layer_group in (self.shared_layers, self.mu_layers,
                        self.sigma_layers):
        for layer in layer_group:
            self.mlp_params.extend(layer.params)
    # The output of this inference network is given by the noisy output
    # of the final layers of its mu and sigma networks.
    self.output_mu = self.mu_layers[-1].noisy_linear
    self.output_logvar = self.sigma_layers[-1].noisy_linear
    self.output_sigma = T.exp(0.5 * self.output_logvar)
    # We'll also construct an output containing a single samples from each
    # of the distributions represented by the rows of self.output_mu and
    # self.output_sigma.
    self.output = self._construct_post_samples()
    self.out_dim = self.sigma_layers[-1].out_dim
    # Get simple regularization penalty to moderate activation dynamics
    self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
    # Construct a function for penalizing KL divergence between the
    # approximate posteriors produced by this model and some isotropic
    # Gaussian distribution.
    self.kld_cost = self._construct_kld_cost()
    # Construct a theano function for sampling from the approximate
    # posteriors inferred by this model for some collection of points
    # in the "data space".
    self.sample_posterior = self._construct_sample_posterior()
    self.mean_posterior = theano.function([self.Xd, self.Xc, self.Xm],
            outputs=self.output_mu)
    return
class Training(Layer):
    """
    Embedding layer that scores inputs against a matrix of "hashtag"
    embeddings and exposes a margin-based ranking loss on those scores.

    Attributes set by the constructor:
        W: theano shared variable holding the embedding matrix.
        n_ht: number of rows of W.
        m: margin used by the loss methods.
        rfun: compiled theano function (no inputs) returning the argsort,
              along axis 0, of a fresh (n_samples, 1) uniform draw.
        alpha: theano constant holding 1/k for k = 1..n_ht.
        weights / biases: [W] and [] respectively.
    """

    def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
                 batch_size=1000):
        """
        Parameters:
            rng: object providing uniform(low=, high=, size=); only used
                 when W is None.
            W: initial value for the embedding matrix. When None, a matrix
               of size (shape[0], shape[1]) is drawn uniformly from
               +/- sqrt(6 / (shape[0] + shape[1])).
            m: margin used by the loss methods.
            n_samples: number of rows in the random mask behind self.rfun.
            shape: (rows, cols) of W; required when W is None.
            batch_size: stored on the instance; not otherwise used here.
        """
        if W is None:
            bound = numpy.sqrt(6. / (shape[0] + shape[1]))
            W = numpy.asarray(
                rng.uniform(low=-bound, high=bound,
                            size=(shape[0], shape[1])),
                dtype=theano.config.floatX)
        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.batch_size = batch_size
        self.n_ht = W.shape[0]
        self.m = m
        self.n_samples = n_samples
        self.csrng = CURAND_RandomStreams(123)
        mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0,
                                  high=1.0, dtype=theano.config.floatX)
        # Sorting i.i.d. uniform draws yields a random ordering of
        # range(n_samples) on every call.
        self.rfun = theano.function([], mask.argsort(axis=0))
        self.alpha = T.constant(
            1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))
        self.weights = [self.W]
        self.biases = []

    def __repr__(self):
        return "{}: W_shape: {}, m={}, n_samples={}, n_ht={}".format(
            self.__class__.__name__, self.W.shape.eval(), self.m,
            self.n_samples, self.n_ht)

    def output_func(self, input):
        """
        Build the symbolic score matrix self.f for `input` and return the
        symbolic argmax prediction (also stored as self.y_pred).
        """
        # Contracts the feature axis of `input` with the feature axis of W,
        # i.e. an inner-product score of every input row against every row
        # of W (the original comment called this "cosine sim"; no
        # normalisation is applied here).
        self.f = T.tensordot(input.dimshuffle(0, 'x', 1),
                             self.W.dimshuffle('x', 0, 1),
                             axes=[[1, 2], [0, 2]])
        # NOTE(review): the loss methods index self.f as [row, label], which
        # suggests axis 1 is the label axis; argmax over axis 0 may be
        # unintended -- confirm before changing.
        self.y_pred = T.argmax(self.f, axis=0)
        return self.y_pred

    def get_tag_neg(self, f, f_y):
        """
        Return the mean of alpha[0:rnk], where rnk is one less than the
        number of entries of `f` greater than `f_y - m`, or 0 when rnk is 0.
        """
        cand = f[(f > f_y - self.m).nonzero()]
        rnk = cand.shape[0] - 1  # due to i != y
        l = T.sum(self.alpha[T.arange(rnk)])
        # `rnk` is symbolic and Theano does not overload `==`, so a Python
        # `if rnk == 0` test is always False. Guard the empty case
        # symbolically, and keep the denominator non-zero so the discarded
        # branch never evaluates 0 / 0.
        no_violators = T.eq(rnk, 0)
        safe_rnk = T.switch(no_violators, 1, rnk)
        return T.switch(no_violators, 0.0, l / safe_rnk)

    def _warp_loss_cost(self, y, i):
        """
        Hinge loss max(0, m - f[row, y] + f[row, i]) for explicitly given
        negative indices `i`.
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        s = self.m - f_y + self.f[T.arange(i.shape[0]), i]
        return T.maximum(0.0, s)

    def warp_loss_cost(self, y, idx):
        """
        Hinge loss against a negative picked from the scores themselves.

        The `idx` argument is ignored: it is overwritten with, per row, the
        column holding the smallest value of max(0, f_y - f + m).
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        f_yy = T.repeat(f_y.dimshuffle(0, 'x'), self.f.shape[1], axis=1)
        f_idx = T.maximum(0.0, f_yy - self.f + self.m)
        idx = f_idx.argsort(axis=1)[:, 0]
        s = self.m - f_y + self.f[T.arange(idx.shape[0]), idx]
        return T.maximum(0.0, s)

    def training_cost(self, y, i):
        """Mean of warp_loss_cost(y, i) over all rows."""
        return T.mean(self.warp_loss_cost(y, i))
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputs
        prior_sigma: standard deviation of isotropic Gaussian prior that our
                     inferred posteriors will be penalized for deviating from.
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
                        (optional, defaults to relu_actfun)
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
                        (optional, defaults to 1.0)
            encoder: a function that will be applied to inputs prior to
                     passing them through the network. this can be used for
                     in-lining, e.g., PCA preprocessing on training data.
                     when given, 'decoder' must be given too.
            decoder: stored alongside 'encoder'; not used by this class
            build_theano_funcs: whether to compile sample_posterior and
                                mean_posterior (optional, defaults to True)
            kld2_scale: stored on the instance (optional, defaults to 0.0)
            sigma_init_scale: constant multiplier applied to the posterior
                              sigmas (optional, defaults to 1.0)
        shared_param_dicts: parameters for the MLP controlled by this InfNet
    """
    def __init__(self,
                 rng=None,
                 Xd=None,
                 prior_sigma=None,
                 params=None,
                 shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        self.build_theano_funcs = params.get('build_theano_funcs', True)
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        if 'encoder' in params:
            self.encoder = params['encoder']
            self.decoder = params['decoder']
            self.use_encoder = True
        else:
            self.encoder = lambda x: x
            self.decoder = lambda x: x
            self.use_encoder = False
        self.Xd_encoded = self.encoder(self.Xd)
        self.kld2_scale = params.get('kld2_scale', 0.0)
        self.sigma_init_scale = params.get('sigma_init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        # Construct input to the inference network
        if self.use_encoder:
            next_input = self.encoder(self.Xd)
        else:
            next_input = self.Xd
        self.shared_layers = self._build_layer_stack(
            rng, 'shared', 'share_layer', self.shared_config, next_input,
            is_input_stack=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        # Take input from the output of the shared network
        self.mu_layers = self._build_layer_stack(
            rng, 'mu', 'mu_layer', self.mu_config,
            self.shared_layers[-1].output)
        ########################################
        # Initialize the sigma part of network #
        ########################################
        # Take input from the output of the shared network. In-bound weights
        # of the final (logvar-predicting) layer are scaled to 0.
        self.sigma_layers = self._build_layer_stack(
            rng, 'sigma', 'sigma_layer', self.sigma_config,
            self.shared_layers[-1].output, zero_last_scale=True)
        # Create a shared parameter for rescaling posterior "sigmas" to allow
        # control over the velocity of the markov chain generated by repeated
        # cycling through the INF -> GEN loop.
        if 'sigma_scale' not in self.shared_param_dicts['sigma'][-1]:
            # we use a hack-ish check to remain compatible with loading models
            # that were saved before the addition of the sigma_scale param.
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.sigma_scale = theano.shared(value=zero_ary)
            new_dict = {'sigma_scale': self.sigma_scale}
            self.shared_param_dicts['sigma'].append(new_dict)
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was made
            # after adding the sigma_scale param, so use its sigma_scale
            self.sigma_scale = \
                self.shared_param_dicts['sigma'][-1]['sigma_scale']
        # Create a shared parameter for maintaining an exponentially decaying
        # estimate of the population mean of posterior KL divergence.
        if 'kld_mean' not in self.shared_param_dicts['sigma'][-1]:
            # add a kld_mean if none was already present
            zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
            self.kld_mean = theano.shared(value=zero_ary)
            self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean
        else:
            # use a kld_mean that's already present
            self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean']
        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.shared_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.mu_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.sigma_layers:
            self.mlp_params.extend(layer.params)
        # The output of this inference network is given by the linear output
        # of the final layers of its mu and sigma networks.
        self.output_mean = self.mu_layers[-1].linear_output
        self.output_logvar = self.sigma_layers[-1].linear_output
        self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
            T.exp(0.5 * self.output_logvar)
        # We'll also construct an output containing a single samples from each
        # of the distributions represented by the rows of self.output_mean and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        self.kld_mean_update = T.cast(
            (0.98 * self.kld_mean) + (0.02 * T.mean(self.kld_cost)),
            'floatX')
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function(
                [self.Xd], outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        return

    def _build_layer_stack(self, rng, group, name_prefix, config,
                           stack_input, is_input_stack=False,
                           zero_last_scale=False):
        """
        Build one stack of HiddenLayers ('shared', 'mu' or 'sigma') and
        return it as a list.

        Parameters:
            rng: numpy rng handed to each HiddenLayer
            group: key into self.shared_param_dicts for this stack
            name_prefix: layers are named "<name_prefix>_<layer index>"
            config: list of layer descriptions; each entry is either a
                    dimension or a (dimension, pool_size) list/tuple
            stack_input: symbolic input for the first layer of the stack
            is_input_stack: if True, the first layer uses vis_drop and
                            input_noise, with no bias noise
            zero_last_scale: if True, the last layer is built with
                             W_scale = 0.0 * init_scale

        When self.is_clone is False, a new param dict is appended to
        self.shared_param_dicts[group] for every layer. Otherwise the layers
        reuse the params already stored there, and missing 'b_in'/'s_in'
        entries are filled in from the newly built layers.
        """
        # list(...) so that len() also works where zip() is lazy
        layer_def_pairs = list(zip(config[:-1], config[1:]))
        last_idx = len(layer_def_pairs) - 1
        group_dicts = self.shared_param_dicts[group]
        layers = []
        next_input = stack_input
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_idx)
            l_name = "{0}_{1:d}".format(name_prefix, layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if is_input_stack and first_layer:
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_scale and last_layer:
                # set in-bound weights for logvar predictions to 0
                i_scale = 0.0 * i_scale
            layer_args = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=l_name, W_scale=i_scale)
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(**layer_args)
                group_dicts.append(
                    {'W': new_layer.W, 'b': new_layer.b,
                     'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = group_dicts[layer_num]
                if not (('b_in' in init_params) and ('s_in' in init_params)):
                    init_params['b_in'] = None
                    init_params['s_in'] = None
                layer_args.update(
                    W=init_params['W'], b=init_params['b'],
                    b_in=init_params['b_in'], s_in=init_params['s_in'])
                new_layer = HiddenLayer(**layer_args)
                if ((init_params['b_in'] is None) or
                        (init_params['s_in'] is None)):
                    init_params['b_in'] = new_layer.b_in
                    init_params['s_in'] = new_layer.s_in
            layers.append(new_layer)
            next_input = new_layer.output
        return layers

    def set_sigma_scale(self, sigma_scale=1.0):
        """
        Set the posterior sigma rescaling shared parameter to some value.
        """
        zero_ary = np.zeros((1,))
        new_scale = zero_ary + sigma_scale
        self.sigma_scale.set_value(new_scale.astype(theano.config.floatX))
        return

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in each net.
        """
        all_layers = self.shared_layers + self.mu_layers + self.sigma_layers
        act_sq_sums = [layer.act_l2_sum for layer in all_layers]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mean and self.output_sigma.
        """
        noise = self.rng.normal(size=self.output_sigma.shape, avg=0.0,
                                std=1.0, dtype=theano.config.floatX)
        post_samples = self.output_mean + (self.output_sigma * noise)
        return post_samples

    def _construct_kld_cost(self):
        """
        Compute (analytically) the KL divergence between each approximate
        posterior encoded by self.mu/self.sigma and the isotropic Gaussian
        distribution with mean 0 and standard deviation self.prior_sigma.
        """
        prior_mu = 0.0
        prior_logvar = np.log(self.prior_sigma**2.0)
        post_klds = gaussian_kld(self.output_mean, self.output_logvar,
                                 prior_mu, prior_logvar)
        kld_cost = T.sum(post_klds, axis=1, keepdims=True)
        return kld_cost

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function([self.Xd], outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0, b_std=1e-2):
        """
        Initialize the biases in all hidden layers to some constant, plus
        Gaussian noise with standard deviation b_std. The final layers of
        the mu and sigma networks are left untouched.
        """
        layers = (self.shared_layers + self.mu_layers[:-1] +
                  self.sigma_layers[:-1])
        for layer in layers:
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            b_vec = b_vec + (b_std * npr.randn(*b_vec.shape))
            layer.b.set_value(b_vec.astype(theano.config.floatX))
        return

    def shared_param_clone(self, rng=None, Xd=None, build_funcs=True):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        # Shallow-copy the params so that setting the clone's
        # 'build_theano_funcs' flag does not rewrite this network's own
        # params (which save_to_file would later persist).
        new_params = dict(self.params)
        new_params['build_theano_funcs'] = build_funcs
        clone_net = InfNet(rng=rng, Xd=Xd,
                           prior_sigma=self.prior_sigma, params=new_params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later. We'll pickle everything required to create a clone of
        this model given the pickle and the rng/Xd params to the cloning
        function: "InfNet.shared_param_clone()".

        Three objects are pickled in order: self.prior_sigma, self.params,
        and a copy of self.shared_param_dicts holding numpy arrays.
        """
        assert f_name is not None, "f_name must be given"
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {key: shared_dict[key].get_value(borrow=False)
                              for key in shared_dict}
                numpy_param_dicts[layer_group].append(numpy_dict)
        # the context manager closes the file even if pickling raises
        with open(f_name, 'wb') as f_handle:
            # dump the "simple" python value in self.prior_sigma
            cPickle.dump(self.prior_sigma, f_handle, protocol=-1)
            # dump the dict self.params
            # NOTE(review): params may hold callables ('activation',
            # 'encoder', 'decoder'); confirm callers only save picklable
            # values.
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        return