def cov_interface_gradients(self):
    """
    Create the cross-covariance between gradients and surface points.

    The result is ``gi_reescale`` times the transpose of the difference
    between a "rest point" term and a "reference point" term; both terms
    use the same first-derivative expression, masked where the distance
    value is not below ``a_T``.

    Returns:
        theano.tensor.matrix: cross-covariance gradients-surface_points
        (the original docstring gives the shape as number of points in
        rest x number of points in dip_pos).
    """
    a_T = self.a_T
    c_o_T = self.c_o_T
    dips = self.dips_position

    def axis_offsets(points):
        # Stack, axis by axis (x, y, z), the differences between every dip
        # position and every point of ``points``.
        rows = []
        for axis in range(3):
            coord = points[:, axis]
            rows.append(
                (dips[:, axis] - coord.reshape((coord.shape[0], 1))).T)
        return T.vertical_stack(*rows)

    def first_derivative_term(hu, sed):
        # Offsets times the first-derivative expression; the comparison
        # zeroes every entry whose distance value is not below a_T.
        return hu * (sed < a_T) * (
            -c_o_T * ((-14 / a_T**2) + 105 / 4 * sed / a_T**3 -
                      35 / 2 * sed**3 / a_T**5 +
                      21 / 4 * sed**5 / a_T**7))

    # Distances returned by the class helper (dips vs. rest / reference)
    sed_dips_rest = self.squared_euclidean_distances(
        self.dips_position_tiled, self.rest_layer_points)
    sed_dips_ref = self.squared_euclidean_distances(
        self.dips_position_tiled, self.ref_layer_points)

    # Cartesian distances between dips and interface points
    hu_rest = axis_offsets(self.rest_layer_points)
    hu_ref = axis_offsets(self.ref_layer_points)

    # Cross-covariance gradients-surface_points
    C_GI = self.gi_reescale * (
        first_derivative_term(hu_rest, sed_dips_rest) -
        first_derivative_term(hu_ref, sed_dips_ref)).T

    # Add name to the theano node
    C_GI.name = 'Covariance gradient interface'

    # Optionally dump the graph when "<method name>_g" is requested
    method_name = sys._getframe().f_code.co_name
    if str(method_name) + '_g' in self.verbose:
        theano.printing.pydotprint(
            C_GI,
            outfile="graphs/" + method_name + ".png",
            var_with_name_simple=True)

    return C_GI
def reconstruct(self, x_in, x_out):
    """
    Build the symbolic reconstruction of ``x_out`` guided by ``x_in``.

    A mixture-initialisation step turns noise into the initial encoder and
    decoder states, then ``self.iterate`` runs the multi-step process.

    Returns:
        tuple: ``(x_recons, kl_q2p, kl_p2q)`` -- the sigmoid of the last
        canvas ``c[-1]`` and the two KL stacks (mixture-init row on top of
        the per-iteration rows returned by ``self.iterate``).
    """
    # sizes needed to draw noise and to split the initial state
    batch_size = x_in.shape[0]
    z_mix_dim = self.get_dim('z_mix')
    z_gen_dim = self.get_dim('z_gen')
    ce_dim = self.get_dim('c_enc')
    cd_dim = self.get_dim('c_dec')
    he_dim = self.get_dim('h_enc')
    hd_dim = self.get_dim('h_dec')

    # zero-mean, unit-std Gaussian noise for the mixture init
    u_mix = self.theano_rng.normal(
        size=(batch_size, z_mix_dim), avg=0., std=1.)
    # transform the noise based on q(z_mix | x_in)
    z_mix_mean, z_mix_logvar, z_mix = self.mix_enc_mlp.apply(x_in, u_mix)

    # decode z_mix and cut the result into consecutive column ranges:
    # c_dec | h_dec | c_enc | h_enc | remainder (s_mix)
    mix_init = self.mix_dec_mlp.apply(z_mix)
    end_cd = cd_dim
    end_hd = end_cd + hd_dim
    end_ce = end_hd + ce_dim
    end_he = end_ce + he_dim
    cd0 = mix_init[:, :end_cd]
    hd0 = mix_init[:, end_cd:end_hd]
    ce0 = mix_init[:, end_hd:end_ce]
    he0 = mix_init[:, end_ce:end_he]
    sm0 = mix_init[:, end_he:]
    c0 = tensor.zeros_like(x_out) + self.c_0

    # KL terms of the mixture init step, in both directions, summed over
    # axis 1 and reshaped to a single row
    akl_q2p_mix = gaussian_kld(z_mix_mean, z_mix_logvar,
                               self.zm_mean, self.zm_logvar)
    akl_p2q_mix = gaussian_kld(self.zm_mean, self.zm_logvar,
                               z_mix_mean, z_mix_logvar)
    kl_q2p_mix = tensor.sum(akl_q2p_mix, axis=1).reshape((1, batch_size))
    kl_p2q_mix = tensor.sum(akl_p2q_mix, axis=1).reshape((1, batch_size))

    # zero-mean, unit-std Gaussian noise consumed by the iteration
    u_gen = self.theano_rng.normal(
        size=(self.n_iter, batch_size, z_gen_dim), avg=0., std=1.)

    # run the multi-stage guided generative process; only the canvas and
    # the KL sequences are used below
    (c, _h_enc, _c_enc, _z,
     kl_q2p_gen, kl_p2q_gen, _h_dec, _c_dec) = self.iterate(
        u=u_gen, c=c0, h_enc=he0, c_enc=ce0,
        h_dec=hd0, c_dec=cd0, x=x_out, s_mix=sm0)

    # the observation generated by the final step
    x_recons = tensor.nnet.sigmoid(c[-1, :, :])
    x_recons.name = "reconstruction"

    # group the KL terms from mixture init and multi-stage generation
    kl_q2p = tensor.vertical_stack(kl_q2p_mix, kl_q2p_gen)
    kl_q2p.name = "kl_q2p"
    kl_p2q = tensor.vertical_stack(kl_p2q_mix, kl_p2q_gen)
    kl_p2q.name = "kl_p2q"

    return x_recons, kl_q2p, kl_p2q
def gSat(m, v=None, i=None, e=None):
    '''
    Reimplementation from the PILCO matlab code. Saturates the input
    signal to -1 to 1 through the function sat(x) = (9*sin(x) +sin(3*x))/8.

    Args:
        m: input mean (symbolic vector; its first dimension ``D`` sizes
            everything else).
        v: optional input covariance. When ``None`` the deterministic
            saturation of ``m`` is returned.
        i: optional indices forwarded to ``gSin``; defaults to
            ``arange(D)``.
        e: optional output scaling. May be ``None`` (ones), a list/tuple,
            a numpy array, or an object that already offers ``astype``
            (e.g. a symbolic tensor). Lists, tuples and arrays are
            converted to a flattened tensor.

    Returns:
        If ``v`` is ``None``: ``e * (9*sin(m) + sin(3*m)) / 8``.
        Otherwise the list ``[M, V, C]``: output mean, output covariance
        and the input-output covariance term used to build the joint
        distribution p(input, output) as a multivariate Gaussian.
    '''
    D = m.shape[0]

    if i is None:
        i = tt.arange(D)

    if e is None:
        e = tt.ones((D, ))
    elif isinstance(e, (list, tuple, np.ndarray)):
        # The original test ``e.__class__ is np.array`` compared against a
        # *function* and was never true, so ndarray gains were never
        # converted or flattened; ``np.ndarray`` is the actual array type.
        e = tt.as_tensor_variable(np.asarray(e)).flatten()
    e = e.astype(m.dtype)

    # if no input variance, return deterministic
    if v is None:
        return e * (9 * tt.sin(m) + tt.sin(3 * m)) / 8

    # construct joint distribution of x and 3*x
    Q = tt.vertical_stack(tt.eye(D), 3 * tt.eye(D))
    ma = Q.dot(m)
    va = Q.dot(v).dot(Q.T)

    # compute the joint distribution of 9*sin(x)/8 and sin(3*x)/8
    # (gSin is defined elsewhere; it is called with the stacked
    # mean/covariance and returns three values)
    i1 = tt.concatenate([i, i + D])
    e1 = tt.concatenate([9.0 * e, e]) / 8.0
    M2, V2, C2 = gSin(ma, va, i1, e1)

    # get the distribution of (9*sin(x) + sin(3*x))/8 by summing the two
    # halves of the stacked output
    P = tt.vertical_stack(tt.eye(D), tt.eye(D))
    # mean
    M = M2.dot(P)
    # variance
    V = P.T.dot(V2).dot(P)
    # inv input covariance dot input output covariance
    C = Q.T.dot(C2).dot(P)

    return [M, V, C]
def create_discriminator_func(layers, apply_updates=False):
    """
    Compile a theano function returning the discriminator loss.

    Args:
        layers: mapping that provides at least ``'l_discriminator_out'``,
            ``'l_prior_in'`` and ``'l_encoder_in'``.
        apply_updates: when true, the compiled function also applies
            Nesterov-momentum updates to the discriminator parameters;
            otherwise it only evaluates the loss.

    Returns:
        A compiled function taking ``(X_batch, pz_batch)`` and returning
        the mean binary cross-entropy of the discriminator.
    """
    # symbolic network inputs and the batch variables substituted for them
    X = T.fmatrix('X')
    pz = T.fmatrix('pz')
    X_batch = T.fmatrix('X_batch')
    pz_batch = T.fmatrix('pz_batch')

    out_layer = layers['l_discriminator_out']

    # the discriminator receives samples from q(z|x) and p(z)
    # and should predict to which distribution each sample belongs
    network_inputs = {
        layers['l_prior_in']: pz,
        layers['l_encoder_in']: X,
    }
    discriminator_outputs = get_output(
        out_layer, inputs=network_inputs, deterministic=False)

    # label samples from q(z|x) as 1 and samples from p(z) as 0
    ones_for_encoder = T.ones((X_batch.shape[0], 1))
    zeros_for_prior = T.zeros((pz_batch.shape[0], 1))
    discriminator_targets = T.vertical_stack(
        ones_for_encoder, zeros_for_prior)

    discriminator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs, discriminator_targets))

    discriminator_updates = None
    if apply_updates:
        # only layers that are part of the discriminator should be updated
        discriminator_params = get_all_params(
            out_layer, trainable=True, discriminator=True)
        discriminator_updates = nesterov_momentum(
            discriminator_loss, discriminator_params, 0.1, 0.0)

    return theano.function(
        inputs=[theano.In(X_batch), theano.In(pz_batch)],
        outputs=discriminator_loss,
        updates=discriminator_updates,
        givens={X: X_batch, pz: pz_batch},
    )
def __init__(self,u_size,y_size,reservoir_size,alpha, num_max_W, memory, target_spectral):
    """
    Build the reservoir weights and compile ``predict`` / ``train``.

    Args:
        u_size: number of input features.
        y_size: number of outputs.
        reservoir_size: number of reservoir units; when ``None`` it
            defaults to ``u_size * memory``.
        alpha: mixing factor between the previous state and the new
            ``tanh`` activation in the recurrence.
        num_max_W: number of random column indices drawn per weight row
            (duplicates are possible, so this is an upper bound on the
            entries written).
        memory: multiplier used only for the default reservoir size.
        target_spectral: value the largest absolute eigenvalue of the
            reservoir matrix is rescaled to (skipped when it is 0).

    Sets ``self.W_in`` and ``self.W`` (fixed arrays), ``self.W_out``
    (shared variable, the only trained weights), ``self.predict(u,
    timesteps)`` and ``self.train(u, y0, lr, timesteps)``.
    """
    #U is features x timesteps
    #W is the matrix for weights within reservoir x
    #W_in is matrix from input u
    #W_out is matrix from x to output y
    if reservoir_size is None:
        reservoir_size = u_size * memory
    self.alpha = alpha

    def initWeights(M, numEntries):
        # Overwrite numEntries randomly chosen columns of every row with
        # standard-normal values.
        # NOTE(review): `random` is presumably numpy.random (it offers
        # randn/rand); its randint excludes the upper bound, so the bound
        # must be the column count itself. The former `M.shape[1]-1` never
        # selected the last column and raised for single-column matrices.
        for i in range(M.shape[0]):
            indices = random.randint(0, M.shape[1], numEntries)
            M[i, indices] = random.randn(1, numEntries)
        return M

    # input weights: one extra column for the constant 1 prepended to u
    self.W_in = initWeights(random.rand(reservoir_size, u_size + 1),
                            num_max_W).astype(theano.config.floatX)

    # reservoir weights, rescaled so that the largest absolute eigenvalue
    # equals target_spectral
    initM = initWeights(np.zeros((reservoir_size, reservoir_size)), num_max_W)
    max_eig = np.absolute(linalg.eigvals(initM)).max()
    if max_eig != 0:
        initM = initM * target_spectral / max_eig
    self.W = initM.astype(theano.config.floatX)

    #These are the weights that would be tuned
    self.W_out = theano.shared(
        np.zeros((y_size, reservoir_size + u_size + 1)).astype(
            theano.config.floatX))

    #for a sequence u get x
    def recurrence(u_t, prevX):
        # [1, u_t] is built by stacking a 1x1 one on top of the column u_t
        # and taking the single column back out as a vector
        x_t = (1-self.alpha)*prevX + self.alpha*T.tanh(
            T.dot(self.W_in,
                  T.vertical_stack(
                      T.as_tensor_variable(
                          np.ones((1,1)).astype(theano.config.floatX)),
                      u_t[:,np.newaxis])[:,0])
            + T.dot(self.W, prevX))
        return x_t

    u = T.fmatrix()
    x, _ = theano.scan(
        fn=recurrence,
        sequences=u,
        outputs_info=[
            T.zeros((reservoir_size)).astype(theano.config.floatX)])

    # timesteps must equal the number of rows of u so that the row of
    # ones can be stacked on top of u.T and x.T
    timesteps = T.iscalar()
    y = T.dot(
        self.W_out,
        T.vertical_stack(
            T.ones((1, timesteps)).astype(theano.config.floatX), u.T, x.T))
    self.predict = theano.function(inputs=[u, timesteps], outputs=y)

    #the true labels
    y0 = T.fmatrix()
    # sum of squared errors, minimised by plain gradient descent on W_out
    cost = T.sum((y.T - y0)**2)
    g = T.grad(cost, self.W_out)
    lr = T.scalar()
    updates = OrderedDict([(self.W_out, self.W_out - lr*g)])
    self.train = theano.function(inputs=[u, y0, lr, timesteps],
                                 outputs=cost,
                                 updates=updates,
                                 on_unused_input='warn')
def atData(input, left, right):
    """
    Build an attention-weighted sentence representation around an entity pair.

    ``input[0]`` is the sentence (one row per token). ``left`` and
    ``right`` are the two entity positions, in either order;
    ``_N_PAD_HEAD`` is added to both before indexing. The two entity rows
    are removed from the sentence, every remaining row is scored against
    the concatenated entity pair, and the softmax-weighted sum of the rows
    is projected with ``linearW``. The projection is fed back as the next
    entity-pair vector for ``NUMBER_DATA`` scan steps; the last step's
    output is returned.
    """
    sentence = input[0]

    # order the two positions and shift them past the head padding
    left_is_smaller = T.lt(left, right)
    first = T.switch(left_is_smaller, left, right)
    second = T.switch(left_is_smaller, right, left)
    first_pos = first + _N_PAD_HEAD
    second_pos = second + _N_PAD_HEAD

    # sentence with the two entity rows removed
    # (original note: 86x60)
    newSentence = T.vertical_stack(
        sentence[:first_pos],
        sentence[(first_pos + 1):second_pos],
        sentence[(second_pos + 1):])

    leftEntity = sentence[first_pos]
    rightEntity = sentence[second_pos]
    LRConnect = T.concatenate([leftEntity, rightEntity])

    def AtLayerData(LRConnect, newSentenceCon):
        # append the entity-pair vector to every word of the sentence
        # (original note: 86x180)
        sentenceAfAdd, _ = theano.scan(
            lambda word: T.concatenate([word, LRConnect]),
            sequences=newSentenceCon)

        # one score per word, normalised with a softmax
        eForWord = T.dot(sentenceAfAdd, WForATData)
        aForWord = T.nnet.softmax(eForWord)[0]

        # scale every word by its weight (original note: 86x60) ...
        newSRep, _ = theano.scan(
            lambda word, weight: word * weight,
            sequences=[newSentence, aForWord])
        # ... and sum over the words (original note: 1x60)
        finalSRep = T.sum(newSRep, axis=0)
        return T.dot(finalSRep, linearW)

    finalSRep, _ = theano.scan(
        AtLayerData,
        outputs_info=LRConnect,
        non_sequences=newSentence,
        n_steps=NUMBER_DATA)
    return finalSRep[-1]
def __init__(self, input=None, n_visible=16, n_hidden=20,
             W=None, hbias=None, vbias=None,
             numpy_rng = None, theano_rng=None,
             batch_size=0, t_batch_size=1,
             n_beta=10, beta_lbound=0., tau=None):
    """
    Set up the RBM parameters and the bookkeeping used for tempering.

    :param input: symbolic input; a fresh matrix is created when None
    :param n_visible: number of visible units
    :param n_hidden: number of hidden units
    :param W: shared weight matrix; created from 0.01 * randn when None
    :param hbias: shared hidden bias; zeros when None
    :param vbias: shared visible bias; zeros when None
    :param numpy_rng: numpy RandomState; seeded with 1234 when None
    :param theano_rng: RandomStreams; seeded from numpy_rng when None
    :param batch_size: size of the T=1 minibatch
    :param t_batch_size: size of the tempered minibatch
    :param n_beta: number of temperatures
    :param beta_lbound: last value of the linearly spaced inverse
        temperatures (the first one is 1)
    :param tau: fixed time constant; when falsy, 1/avg_rtime is used
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden
    self.t_batch_size = t_batch_size   # size of tempered minibatch
    self.batch_size = batch_size       # size of T=1 minibatch

    # random number generators (the theano seed is drawn from numpy_rng)
    if numpy_rng is None:
        numpy_rng = numpy.random.RandomState(1234)
    if theano_rng is None:
        theano_rng = RandomStreams(numpy_rng.randint(2**30))
    self.rng = numpy_rng
    self.theano_rng = theano_rng

    # model parameters
    if W is None:
        initial_W = numpy.asarray(
            0.01 * numpy_rng.randn(n_visible, n_hidden),
            dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W', borrow=True)
    self.W = W

    if hbias is None:
        hbias = sharedX(numpy.zeros(n_hidden), 'hbias')
    self.hbias = hbias

    if vbias is None:
        vbias = sharedX(numpy.zeros(n_visible), 'vbias')
    self.vbias = vbias

    if input is None:
        input = T.matrix('input')
    self.input = input

    #########################################################################
    # Fields indexed by batch_size + mixstat:    buffer, E
    # Fields indexed by mixstat:    beta, labels, rtime
    # Fields indexed by temp index: mixstat, fup_target, nup, ndown, swapstat
    #########################################################################

    ### initialize tempering stuff ###
    n_chain = t_batch_size * n_beta
    n_rows = batch_size + 2 * n_chain    # allocated rows (twice the chains)
    self.n_chain = theano.shared(n_chain, name='n_chain')  # active chains in buffer array
    self.n_beta = theano.shared(n_beta, name='n_beta')     # temperatures in system
    self.n_chain_total = batch_size + self.n_chain

    # buffers for negative particles; only the first n_chain_total rows
    # are exposed through the public views
    _buffer = self.rng.randint(0, 2, size=(n_rows, n_visible))
    self._buffer = sharedX(_buffer, name='buffer')
    self.buffer = self._buffer[:self.n_chain_total]
    # buffer used to store mean-field activation
    self.mf_buffer = sharedX(numpy.zeros_like(_buffer), name='mf_buffer')

    # energy of current negative particles (at T=1)
    self._E = sharedX(numpy.zeros(n_rows), name='E')
    self.E = self._E[:self.n_chain_total]

    # inverse temperatures: the same linear ramp from 1 to beta_lbound is
    # repeated once per tempered minibatch element
    beta = numpy.zeros(2 * n_chain)
    beta[:n_chain] = numpy.tile(
        numpy.linspace(1, beta_lbound, n_beta), t_batch_size)
    self._beta = sharedX(beta, name='beta')
    self.beta = self._beta[:self.n_chain]

    # Used to multiply the rows of "W x + b"
    self.beta_matrix = T.vertical_stack(
        T.alloc(1.0, batch_size, 1),
        self.beta.dimshuffle([0, 'x']))

    # mixstat maps (tempered batch element, temperature) to a chain row
    mixstat = numpy.zeros((t_batch_size, 2 * n_beta), dtype='int32')
    mixstat[:, :n_beta] = numpy.arange(n_chain).reshape(t_batch_size, n_beta)
    self._mixstat = theano.shared(mixstat, name='mixstat')
    self.mixstat = self._mixstat[:, :self.n_beta]

    ### Initialize particle properties ###

    # labels: 1 means going up in temperature, 0 going down in temperature
    labels = LBL_NONE * numpy.ones(2 * n_chain, dtype='int32')
    labels[mixstat[:, 0]] = LBL_UP
    self.labels = theano.shared(labels, name='labels')

    # return time
    rtime = numpy.zeros(2 * n_chain, dtype='int32')
    self.rtime = theano.shared(rtime, name='rtime')
    self.avg_rtime = sharedX(rtime_deo(0.4, n_beta), name='avg_rtime')

    ### Initialize temperature properties ###

    def padded(values):
        # vector of length 2*n_beta whose first n_beta entries are `values`
        padded_values = numpy.zeros(2 * n_beta)
        padded_values[:n_beta] = values
        return padded_values

    # fup target for each chain (this shouldn't change very often)
    self._fup_target = sharedX(
        padded(numpy.linspace(1, 0, n_beta)), name='fup_target')
    self.fup_target = self._fup_target[:self.n_beta]

    # histogram of up moving particles
    self._nup = sharedX(padded(numpy.linspace(1, 0, n_beta)), name='nup')
    self.nup = self._nup[:self.n_beta]

    # histogram of down moving particles
    self._ndown = sharedX(
        padded(numpy.linspace(0, 1, n_beta)), name='ndown')
    self.ndown = self._ndown[:self.n_beta]

    # use return time as the time constant for all moving averages,
    # unless a fixed tau was supplied
    self.tau = T.as_tensor(tau) if tau else 1. / self.avg_rtime
    self.get_tau = theano.function([], self.tau)

    # create PT Op
    self._swapstat = sharedX(numpy.zeros(2 * n_beta), name='swapstat')
    self.swapstat = self._swapstat[:self.n_beta]

    self.pt_swaps = PT_Swaps(rng=self.rng)
    self.pt_swap_t1_sample = PT_SwapT1Sample(
        rng=self.rng, batch_size=self.batch_size)
def recurrence(u_t, prevX):
    """
    Compute the next reservoir state from input ``u_t`` and state ``prevX``.

    The new state is ``(1 - alpha) * prevX + alpha * tanh(W_in . [1, u_t]
    + W . prevX)``, where ``[1, u_t]`` is the input with a constant 1
    prepended.
    """
    # 1x1 constant stacked on top of u_t (as a column), then flattened
    # back to a vector by taking the only column
    one = T.as_tensor_variable(np.ones((1, 1)).astype(theano.config.floatX))
    augmented_input = T.vertical_stack(one, u_t[:, np.newaxis])[:, 0]

    activation = T.tanh(
        T.dot(self.W_in, augmented_input) + T.dot(self.W, prevX))
    return (1 - self.alpha) * prevX + self.alpha * activation
def __init__(self, u_size, y_size, reservoir_size, alpha, num_max_W, memory,
             target_spectral):
    """
    Build the reservoir weights and compile ``predict`` / ``train``.

    Args:
        u_size: number of input features.
        y_size: number of outputs.
        reservoir_size: number of reservoir units; when ``None`` it
            defaults to ``u_size * memory``.
        alpha: mixing factor between the previous state and the new
            ``tanh`` activation in the recurrence.
        num_max_W: number of random column indices drawn per weight row
            (duplicates are possible, so this is an upper bound on the
            entries written).
        memory: multiplier used only for the default reservoir size.
        target_spectral: value the largest absolute eigenvalue of the
            reservoir matrix is rescaled to (skipped when it is 0).

    Sets ``self.W_in`` and ``self.W`` (fixed arrays), ``self.W_out``
    (shared variable, the only trained weights), ``self.predict(u,
    timesteps)`` and ``self.train(u, y0, lr, timesteps)``.
    """
    #U is features x timesteps
    #W is the matrix for weights within reservoir x
    #W_in is matrix from input u
    #W_out is matrix from x to output y
    if reservoir_size is None:
        reservoir_size = u_size * memory
    self.alpha = alpha

    def initWeights(M, numEntries):
        # Overwrite numEntries randomly chosen columns of every row with
        # standard-normal values.
        # NOTE(review): `random` is presumably numpy.random (it offers
        # randn/rand); its randint excludes the upper bound, so the bound
        # must be the column count itself. The former `M.shape[1] - 1`
        # never selected the last column and raised for single-column
        # matrices.
        for i in range(M.shape[0]):
            indices = random.randint(0, M.shape[1], numEntries)
            M[i, indices] = random.randn(1, numEntries)
        return M

    # input weights: one extra column for the constant 1 prepended to u
    self.W_in = initWeights(random.rand(reservoir_size, u_size + 1),
                            num_max_W).astype(theano.config.floatX)

    # reservoir weights, rescaled so that the largest absolute eigenvalue
    # equals target_spectral
    initM = initWeights(np.zeros((reservoir_size, reservoir_size)),
                        num_max_W)
    max_eig = np.absolute(linalg.eigvals(initM)).max()
    if max_eig != 0:
        initM = initM * target_spectral / max_eig
    self.W = initM.astype(theano.config.floatX)

    #These are the weights that would be tuned
    self.W_out = theano.shared(
        np.zeros(
            (y_size,
             reservoir_size + u_size + 1)).astype(theano.config.floatX))

    #for a sequence u get x
    def recurrence(u_t, prevX):
        # [1, u_t] is built by stacking a 1x1 one on top of the column u_t
        # and taking the single column back out as a vector
        x_t = (1-self.alpha)*prevX + self.alpha*T.tanh(
            T.dot(self.W_in,
                  T.vertical_stack(
                      T.as_tensor_variable(
                          np.ones((1,1)).astype(theano.config.floatX)),
                      u_t[:,np.newaxis])[:,0])
            + T.dot(self.W, prevX))
        return x_t

    u = T.fmatrix()
    x, _ = theano.scan(fn=recurrence,
                       sequences=u,
                       outputs_info=[
                           T.zeros((reservoir_size)).astype(
                               theano.config.floatX)
                       ])

    # timesteps must equal the number of rows of u so that the row of
    # ones can be stacked on top of u.T and x.T
    timesteps = T.iscalar()
    y = T.dot(
        self.W_out,
        T.vertical_stack(
            T.ones((1, timesteps)).astype(theano.config.floatX), u.T, x.T))
    self.predict = theano.function(inputs=[u, timesteps], outputs=y)

    #the true labels
    y0 = T.fmatrix()
    # sum of squared errors, minimised by plain gradient descent on W_out
    cost = T.sum((y.T - y0)**2)
    g = T.grad(cost, self.W_out)
    lr = T.scalar()
    updates = OrderedDict([(self.W_out, self.W_out - lr * g)])
    self.train = theano.function(inputs=[u, y0, lr, timesteps],
                                 outputs=cost,
                                 updates=updates,
                                 on_unused_input='warn')
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
        i_net=None, g_net=None, d_net=None, chain_len=None, \
        data_dim=None, prior_dim=None, params=None):
    """
    Build the self-looped VAE chain, its discriminator, and all costs,
    gradients, updates and the joint training function.

    Args:
        rng: numpy-style random generator (must provide ``randint``).
        Xd: symbolic var for samples that initialize the VAE chain.
        Xc: symbolic var for controlling subsets of the state variables.
        Xm: symbolic var for masking subsets of the state variables.
        Xt: symbolic var for samples from the target distribution.
        i_net: inference network handed to OneStageModel as q_z_given_x.
        g_net: generator network handed to OneStageModel as p_x_given_z.
        d_net: discriminator network; a shared-parameter clone is made.
        chain_len: integer number of times to cycle the VAE loop.
        data_dim: dimension of the observations.
        prior_dim: dimension of the latent prior.
        params: option mapping. ``'x_type'`` ('bernoulli' or 'gaussian')
            and ``'lam_l2d'`` are read unconditionally; ``'cost_decay'``
            (0.1), ``'chain_type'`` ('walkout'), ``'xt_transform'``
            ('sigmoid') and ``'logvar_bound'`` (10) are optional with the
            defaults shown.
    """
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # ---- options -------------------------------------------------------
    self.params = {} if params is None else params
    opts = self.params

    self.cost_decay = opts['cost_decay'] if 'cost_decay' in opts else 0.1

    if 'chain_type' in opts:
        assert opts['chain_type'] in ('walkback', 'walkout')
        self.chain_type = opts['chain_type']
    else:
        self.chain_type = 'walkout'

    # transform applied to the generator mean before it is fed back in
    if 'xt_transform' in opts:
        assert opts['xt_transform'] in ('sigmoid', 'none')
        squash = (opts['xt_transform'] == 'sigmoid')
    else:
        squash = True
    if squash:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x

    self.logvar_bound = opts['logvar_bound'] if 'logvar_bound' in opts else 10

    # x_type: this tells if we're using bernoulli or gaussian model for
    # the observations
    self.x_type = opts['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # ---- symbolic inputs -----------------------------------------------
    self.Xd = Xd
    self.Xm = Xm
    self.Xc = Xc
    self.Xt = Xt
    self.chain_len = chain_len
    # indices for the data inputs, then for the noise/generated inputs
    self.It = T.arange(self.Xt.shape[0])
    self.Id = T.arange(
        self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # ---- base VAE ------------------------------------------------------
    self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm,
                             p_x_given_z=g_net, q_z_given_x=i_net,
                             x_dim=self.data_dim, z_dim=self.prior_dim,
                             params=self.params)
    self.IN = self.OSM.q_z_given_x
    self.GN = self.OSM.p_x_given_z
    self.transform_x_to_z = self.OSM.transform_x_to_z
    self.transform_z_to_x = self.OSM.transform_z_to_x
    self.bounded_logvar = self.OSM.bounded_logvar

    # ---- unrolled chain ------------------------------------------------
    # Every link is a shared-parameter clone of the inferencer/generator
    # pair. All links share the same Xc and Xm; the output of one link
    # (after xt_transform) is the input of the next.
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    link_input = self.Xd
    print("Unrolling chain...")
    for step in range(self.chain_len):
        masked_input = apply_mask(Xd=link_input, Xc=self.Xc, Xm=self.Xm)
        link_in = self.IN.shared_param_clone(
            rng=rng, Xd=masked_input, build_funcs=False)
        link_gn = self.GN.shared_param_clone(
            rng=rng, Xd=link_in.output, build_funcs=False)
        link_input = self.xt_transform(link_gn.output_mean)
        self.IN_chain.append(link_in)
        self.GN_chain.append(link_gn)
        self.Xg_chain.append(link_input)
        print(" step {}...".format(step))

    # The discriminator clone sees the target samples stacked on top of
    # every generated sample of the chain.
    self.DN = d_net.shared_param_clone(
        rng=rng, Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    # ---- shared hyper-parameters ---------------------------------------
    zero_ary = np.zeros((1, )).astype(theano.config.floatX)

    def zero_shared(name):
        # one-element shared variable initialised from zero_ary
        return theano.shared(value=zero_ary, name=name)

    # weight of the nll of data given posterior sample
    self.lam_chain_nll = zero_shared('vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # weight of the posterior KL-div from prior
    self.lam_chain_kld = zero_shared('vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # l2 regularization on params
    self.lam_l2w = zero_shared('vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # learning rates for all networks
    self.lr_dn = zero_shared('vcg_lr_dn')
    self.lr_gn = zero_shared('vcg_lr_gn')
    self.lr_in = zero_shared('vcg_lr_in')
    # momentum parameters and iteration count shared by all networks
    self.mom_1 = zero_shared('vcg_mom_1')
    self.mom_2 = zero_shared('vcg_mom_2')
    self.it_count = zero_shared('vcg_it_count')
    # weights for the adversarial classification objective
    self.dw_dn = zero_shared('vcg_dw_dn')
    self.dw_gn = zero_shared('vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    self.set_disc_weights()  # init adversarial cost weights for GN/DN
    # regularizer on the output of the discriminator
    self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']),
                                 name='vcg_lam_l2d')

    # ---- optimizable parameters ----------------------------------------
    # The final layer of every proto-network in the discriminator is left
    # out: it is bypassed in favor of the binary classification layers
    # built by _construct_disc_layers below.
    self.dn_params = []
    for proto_net in self.DN.proto_nets:
        for layer in proto_net[0:-1]:
            self.dn_params.extend(layer.params)
    self.in_params = list(self.IN.mlp_params)
    self.in_params.append(self.OSM.output_logvar)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # Binary discriminator layer for each proto-net in the discriminator
    # network.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # ---- costs ---------------------------------------------------------
    # adversarial binary classification costs for DN and GN
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # cost optimized by the discriminator network
    self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
            self.disc_reg_cost

    # costs optimized by the generator and inferencer networks
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.other_reg_cost
    # total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    # ---- gradients -----------------------------------------------------
    # Each parameter group is differentiated against its own cost.
    self.joint_grads = OrderedDict()
    grad_jobs = (
        ("Computing VCGLoop DN cost gradients...",
         self.dn_cost, self.dn_params),
        ("Computing VCGLoop IN cost gradients...",
         self.osm_cost, self.in_params),
        ("Computing VCGLoop GN cost gradients...",
         self.osm_cost, self.gn_params),
    )
    for message, cost, wrt in grad_jobs:
        print(message)
        grad_list = T.grad(cost, wrt, disconnected_inputs='warn')
        for p, g in zip(wrt, grad_list):
            self.joint_grads[p] = g

    # ---- updates -------------------------------------------------------
    # All networks share the same first/second moment momentum and
    # iteration count; each has its own learning rate, which lets you
    # turn their learning on/off.
    def updates_for(param_list, learn_rate):
        return get_param_updates(params=param_list,
                grads=self.joint_grads, alpha=learn_rate,
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

    self.dn_updates = updates_for(self.dn_params, self.lr_dn)
    self.gn_updates = updates_for(self.gn_params, self.lr_gn)
    self.in_updates = updates_for(self.in_params, self.lr_in)

    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for group in (self.dn_updates, self.gn_updates, self.in_updates):
        for k in group:
            self.joint_updates[k] = group[k]

    # moving average (0.98 / 0.02) of the mean KL divergence of the
    # approximate posteriors, averaged over the chain
    new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
        sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
    self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

    # construct the function for training on training data
    print("Compiling VCGLoop theano functions....")
    self.train_joint = self._construct_train_joint()
    return
def recurrence(u_t,prevX):
    """
    Compute the next reservoir state from input ``u_t`` and state ``prevX``.

    The new state is ``(1 - alpha) * prevX + alpha * tanh(W_in . [1, u_t]
    + W . prevX)``, where ``[1, u_t]`` is the input with a constant 1
    prepended.
    """
    # constant 1 (as a 1x1 matrix) placed above the column version of u_t;
    # selecting column 0 turns the stack back into a vector
    bias_entry = T.as_tensor_variable(
        np.ones((1, 1)).astype(theano.config.floatX))
    input_with_bias = T.vertical_stack(bias_entry, u_t[:, np.newaxis])[:, 0]

    pre_activation = T.dot(self.W_in, input_with_bias) + T.dot(self.W, prevX)
    x_t = (1 - self.alpha) * prevX + self.alpha * T.tanh(pre_activation)
    return x_t
def cov_gradients(self, verbose=0):
    """
    Create covariance function for the gradients.

    Args:
        verbose (int): when greater than 1 the graph of the result is
            written to ``graphs/<method name>.png``. Independently,
            ``self.verbose`` is searched for ``'sed_dips_dips'`` and for
            the method name to attach theano ``Print`` ops.

    Returns:
        theano.tensor.matrix: covariance of the gradients. Shape number of
        points in dip_pos x number of points in dip_pos
    """
    a_T = self.a_T
    c_o_T = self.c_o_T
    dips = self.dips_position

    # Distances between dips, as returned by the class helper
    sed_dips_dips = self.squared_euclidean_distances(
        self.dips_position_tiled, self.dips_position_tiled)
    if 'sed_dips_dips' in self.verbose:
        sed_dips_dips = theano.printing.Print('sed_dips_dips')(
            sed_dips_dips)

    def tiled_axis_difference(axis):
        # pairwise differences of one coordinate, tiled n_dimensions times
        coord = dips[:, axis]
        return T.tile(
            coord - coord.reshape((coord.shape[0], 1)),
            self.n_dimensions)

    # Cartesian distances between dips positions, and their transpose
    h_u = T.vertical_stack(
        *[tiled_axis_difference(axis) for axis in range(3)])
    h_v = h_u.T

    # Perpendicularity matrix: ones on the three diagonal blocks (x, y, z),
    # zeros elsewhere, to separate each direction's own covariance from
    # the cross-covariances.
    n_dips = dips.shape[0]
    perpendicularity_matrix = T.zeros_like(sed_dips_dips)
    diagonal_blocks = (
        (0, n_dips),
        (n_dips, n_dips * 2),
        (n_dips * 2, n_dips * 3),
    )
    for start, stop in diagonal_blocks:
        perpendicularity_matrix = T.set_subtensor(
            perpendicularity_matrix[start:stop, start:stop], 1)

    # mask of entries whose distance value is below a_T
    in_range = sed_dips_dips < a_T

    # expression shared by both "first derivative" terms
    first_poly = ((-14 / a_T**2) + 105 / 4 * sed_dips_dips / a_T**3 -
                  35 / 2 * sed_dips_dips**3 / a_T**5 +
                  21 / 4 * sed_dips_dips**5 / a_T**7)

    first_derivative = in_range * (-c_o_T * first_poly)
    second_derivative = (
        in_range * c_o_T * 7 *
        (9 * sed_dips_dips**5 - 20 * a_T**2 * sed_dips_dips**3 +
         15 * a_T**4 * sed_dips_dips - 4 * a_T**5) / (2 * a_T**7))

    # Covariance matrix for gradients at every xyz direction and their
    # cross-covariances; entries with zero distance are set to 0 instead
    # of evaluating the division by sed_dips_dips**2.
    C_G = T.switch(
        T.eq(sed_dips_dips, 0),
        0,
        ((h_u * h_v / sed_dips_dips**2) *
         (first_derivative + second_derivative) -
         (perpendicularity_matrix * in_range * c_o_T * first_poly)))

    # Setting nugget effect of the gradients
    # TODO: This function can be substituted by simply adding the nugget
    # effect to the diag if the condition above is removed
    C_G += T.eye(C_G.shape[0]) * self.nugget_effect_grad_T

    # Add name to the theano node
    C_G.name = 'Covariance Gradient'

    method_name = sys._getframe().f_code.co_name
    if verbose > 1:
        theano.printing.pydotprint(
            C_G,
            outfile="graphs/" + method_name + ".png",
            var_with_name_simple=True)

    if str(method_name) in self.verbose:
        C_G = theano.printing.Print('Cov Gradients')(C_G)

    return C_G
def __init__(self, input=None, n_visible=784, n_hidden=500, \
    W=None, hbias=None, vbias=None, seed = None, theano_rng=None,
    batch_size=0, t_batch_size=1, n_beta=10, beta_lbound=0., tau=None):
    """
    RBM constructor. Defines the parameters of the model along with
    basic operations for inferring hidden from visible (and vice-versa),
    as well as for performing CD updates.

    Besides the usual RBM parameters, this constructor allocates all the
    shared buffers used for tempering: particle buffers, energies, inverse
    temperatures, the temperature <-> particle map (``mixstat``), particle
    labels, return times and per-temperature statistics.

    :param input: None for standalone RBMs or symbolic variable if RBM is
    part of a larger graph.

    :param n_visible: number of visible units

    :param n_hidden: number of hidden units

    :param W: None for standalone RBMs or symbolic variable pointing to a
    shared weight matrix in case RBM is part of a DBN network; in a DBN,
    the weights are shared between RBMs and layers of a MLP

    :param hbias: None for standalone RBMs or symbolic variable pointing
    to a shared hidden units bias vector in case RBM is part of a
    different network

    :param vbias: None for standalone RBMs or a symbolic variable
    pointing to a shared visible units bias

    :param seed: seed for the numpy RandomState; 123 is used when None

    :param theano_rng: optional RandomStreams instance; when None one is
    created and seeded from the numpy RandomState

    :param batch_size: size of the T=1 minibatch

    :param t_batch_size: size of the tempered minibatch; must divide
    batch_size when positive

    :param n_beta: number of temperatures in the system

    :param beta_lbound: last value of the linearly spaced inverse
    temperatures (the first one is 1)

    :param tau: optional fixed time constant (overrides return time)
    """
    # Either tempering is on (several temperatures, non-empty tempered
    # batch) or it is fully off (single temperature, empty tempered batch).
    assert (n_beta > 1 and t_batch_size > 0) or (n_beta==1 and t_batch_size==0)
    if t_batch_size > 0: assert batch_size%t_batch_size==0

    self.n_visible = n_visible
    self.n_hidden = n_hidden
    self.t_batch_size = t_batch_size    # size of tempered minibatch
    self.batch_size = batch_size        # size of T=1 minibatch

    # deal with random number generation
    if seed is None:
        rng = numpy.random.RandomState(123)
    else:
        rng = numpy.random.RandomState(seed)
    if theano_rng is None:
        # NOTE: this consumes one draw from `rng` before any parameter is
        # initialized, so the statement order below affects reproducibility.
        theano_rng = RandomStreams(rng.randint(2**30))
    self.rng = rng
    self.theano_rng = theano_rng

    if W is None :
        # W is initialized with `initial_W`, drawn from a zero-mean
        # Gaussian (`randn`) and scaled by 0.01. `sharedX` is a project
        # helper -- presumably it casts to theano.config.floatX so the
        # code is runnable on GPU (confirm against its definition).
        initial_W = 0.01 * self.rng.randn(n_visible, n_hidden)
        # theano shared variables for weights and biases
        W = sharedX(initial_W, 'W')
    self.W = W
    if hbias is None :
        # create shared variable for hidden units bias
        hbias = sharedX(numpy.zeros(n_hidden), 'hbias')
    self.hbias = hbias
    if vbias is None :
        # create shared variable for visible units bias
        vbias = sharedX(numpy.zeros(n_visible), 'vbias')
    self.vbias = vbias

    # initialize input layer for standalone RBM or layer0 of DBN
    if input is None:
        input = T.matrix('input')
    self.input = input

    #########################################################################
    # Fields indexed by batch_size + mixstat:    buffer, E
    # Fields indexed by mixstat:    beta, labels, rtime
    # Fields indexed by temp index: mixstat, fup_target, nup, ndown, swapstat
    #########################################################################
    # NOTE: every underscore-prefixed shared array below is allocated at
    # twice the currently needed size (2*n_chain or 2*n_beta); the public
    # attribute is a symbolic slice restricted to the active part.

    ### initialize tempering stuff ###
    n_chain = t_batch_size * n_beta
    self.n_chain = theano.shared(n_chain, name='n_chain') # number of active chains in buffer array
    self.n_beta = theano.shared(n_beta, name='n_beta') # number of temperatures in system
    # symbolic: python int plus a shared scalar
    self.n_chain_total = batch_size + self.n_chain

    # configure buffers for negative particles (random binary init)
    _buffer = self.rng.randint(0,2,size=(batch_size + 2*n_chain, n_visible))
    self._buffer = sharedX(_buffer, name='buffer')
    self.buffer = self._buffer[:self.n_chain_total]
    # buffer used to store mean-field activation
    self.mf_buffer = sharedX(numpy.zeros_like(_buffer), name='mf_buffer')

    # vectors containing energy of current negative particles (at T=1)
    self._E = sharedX(numpy.zeros(batch_size + 2*n_chain), name='E')
    self.E = self._E[:self.n_chain_total]

    # Space out inverse temperature parameters linearly in [1,beta_lbound] range.
    # The same ladder of n_beta values is repeated for each of the
    # t_batch_size tempered chains.
    beta = numpy.zeros(2*n_chain)
    for bi in range(t_batch_size):
        base_idx = n_beta*bi
        beta[base_idx:base_idx+n_beta] = numpy.linspace(1, beta_lbound, n_beta)
    self._beta = sharedX(beta, name='beta')
    self.beta = self._beta[:self.n_chain]

    # Used to multiply the rows of "W x + b": a column of ones for the
    # batch_size T=1 rows followed by the per-chain betas as a column.
    self.beta_matrix = T.vertical_stack(
            T.alloc(1.0, batch_size, 1),
            self.beta.dimshuffle([0,'x']))

    # initialize data structure to map nhid/nvis rows to a given temperature
    # mixstat stores pointers to self.nvis array
    mixstat = numpy.zeros((t_batch_size, 2*n_beta), dtype='int32')
    mixstat[:, :n_beta] = numpy.arange(n_chain).reshape(t_batch_size, n_beta)
    self._mixstat = theano.shared(mixstat, name='mixstat')
    self.mixstat = self._mixstat[:, :self.n_beta]

    ### Initialize particle properties ###

    # labels: 1 means going up in temperature, 0 going down in temperature
    # (per the original author; the LBL_* constants are defined elsewhere).
    # Every particle starts unlabeled except those sitting at the first
    # temperature of each chain, which are marked LBL_UP.
    labels = LBL_NONE * numpy.ones(2*n_chain, dtype='int32')
    labels[mixstat[:,0]] = LBL_UP
    self.labels = theano.shared(labels, name='labels')

    # return time
    rtime = numpy.zeros(2*n_chain, dtype='int32')
    self.rtime = theano.shared(rtime, name='rtime')
    # initial average return time comes from the project helper rtime_deo
    self.avg_rtime = sharedX(rtime_deo(0.4,n_beta), name='avg_rtime')

    ### Initialize temperature properties ###

    # configure fup target for each chain (this shouldn't change very often)
    _fup_target = numpy.zeros(2*n_beta)
    _fup_target[:n_beta] = numpy.linspace(1,0,n_beta)
    self._fup_target = sharedX(_fup_target, name='fup_target')
    self.fup_target = self._fup_target[:self.n_beta]

    # configure histogram of up moving particles
    _nup = numpy.zeros(2*n_beta)
    _nup[:n_beta] = numpy.linspace(1,0,n_beta)
    self._nup = sharedX(_nup, name='nup')
    self.nup = self._nup[:self.n_beta]

    # configure histogram of down moving particles
    _ndown = numpy.zeros(2*n_beta)
    _ndown[:n_beta] = numpy.linspace(0,1,n_beta)
    self._ndown = sharedX(_ndown, name='ndown')
    self.ndown = self._ndown[:self.n_beta]

    # use return time as the time constant for all moving averages
    # (any falsy tau, including 0, selects this default)
    if not tau:
        self.tau = 1./self.avg_rtime
    else:
        self.tau = T.as_tensor(tau)
    self.get_tau = theano.function([], self.tau)

    # create PT Op
    self._swapstat = sharedX(numpy.zeros(2*n_beta), name='swapstat')
    self.swapstat = self._swapstat[:self.n_beta]

    self.pt_swaps = PT_Swaps(rng=self.rng)
    self.pt_swap_t1_sample = PT_SwapT1Sample(rng=self.rng, batch_size=self.batch_size)
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
        i_net=None, g_net=None, d_net=None, chain_len=None, \
        data_dim=None, prior_dim=None, params=None):
    """
    Build the full symbolic graph, costs, gradients and updates for a
    self-looped VAE chain trained jointly with a discriminator.

    Args:
        rng: numpy-style random state; must provide ``randint`` (the
            default of None is not usable).
        Xd: symbolic var with samples for initializing the VAE chain.
        Xc: symbolic var for controlling subsets of the state variables.
        Xm: symbolic var for masking subsets of the state variables.
        Xt: symbolic var with samples from the target distribution.
        i_net: inferencer network, passed to OneStageModel as q_z_given_x.
        g_net: generator network, passed to OneStageModel as p_x_given_z.
        d_net: discriminator network; must offer ``shared_param_clone``.
        chain_len: integer number of times to cycle the VAE loop.
        data_dim: dimension of the observations (OneStageModel x_dim).
        prior_dim: dimension of the latent space (OneStageModel z_dim).
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
            'lam_l2d' are read unconditionally; 'cost_decay',
            'chain_type', 'xt_transform' and 'logvar_bound' are optional.
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_mean = 0.0
    self.prior_logvar = 0.0
    if params is None:
        self.params = {}
    else:
        self.params = params
    # optional settings, each with a fallback default
    if 'cost_decay' in self.params:
        self.cost_decay = self.params['cost_decay']
    else:
        self.cost_decay = 0.1
    if 'chain_type' in self.params:
        assert((self.params['chain_type'] == 'walkback') or \
            (self.params['chain_type'] == 'walkout'))
        self.chain_type = self.params['chain_type']
    else:
        self.chain_type = 'walkout'
    if 'xt_transform' in self.params:
        assert((self.params['xt_transform'] == 'sigmoid') or \
                (self.params['xt_transform'] == 'none'))
        if self.params['xt_transform'] == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
    else:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    if 'logvar_bound' in self.params:
        self.logvar_bound = self.params['logvar_bound']
    else:
        self.logvar_bound = 10
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    #         the observations
    # NOTE: this key is mandatory -- with params=None the lookup below
    # raises KeyError.
    #
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

    # symbolic var for inputting samples for initializing the VAE chain
    self.Xd = Xd
    # symbolic var for masking subsets of the state variables
    self.Xm = Xm
    # symbolic var for controlling subsets of the state variables
    self.Xc = Xc
    # symbolic var for inputting samples from the target distribution
    self.Xt = Xt
    # integer number of times to cycle the VAE loop
    self.chain_len = chain_len
    # symbolic vector of row indices for data inputs
    self.It = T.arange(self.Xt.shape[0])
    # symbolic vector of row indices for noise/generated inputs; these
    # start right after the Xt rows
    self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # get a clone of the desired VAE, for easy access
    self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
            p_x_given_z=g_net, q_z_given_x=i_net, x_dim=self.data_dim, \
            z_dim=self.prior_dim, params=self.params)
    self.IN = self.OSM.q_z_given_x
    self.GN = self.OSM.p_x_given_z
    self.transform_x_to_z = self.OSM.transform_x_to_z
    self.transform_z_to_x = self.OSM.transform_z_to_x
    self.bounded_logvar = self.OSM.bounded_logvar
    # self-loop some clones of the main VAE into a chain.
    #  ** All VAEs in the chain share the same Xc and Xm, which are the
    #     symbolic inputs for providing the observed portion of the input
    #     and a mask indicating which part of the input is "observed".
    #     These inputs are used for training "reconstruction" policies.
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    _Xd = self.Xd
    print("Unrolling chain...")
    for i in range(self.chain_len):
        # create a VAE infer/generate pair with _Xd as input and with
        # masking variables shared by all VAEs in this chain
        _IN = self.IN.shared_param_clone(rng=rng, \
                Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm), \
                build_funcs=False)
        _GN = self.GN.shared_param_clone(rng=rng, Xd=_IN.output, \
                build_funcs=False)
        # the transformed generator mean becomes the next step's input
        _Xd = self.xt_transform(_GN.output_mean)
        self.IN_chain.append(_IN)
        self.GN_chain.append(_GN)
        self.Xg_chain.append(_Xd)
        print(" step {}...".format(i))

    # make a clone of the desired discriminator network, which will try
    # to discriminate between samples from the training data and samples
    # generated by the self-looped VAE chain. Its input stacks the target
    # samples first, followed by every chain step's output.
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    # one-element floatX array used as the initial value of every scalar
    # hyper-parameter shared variable below
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    # init shared var for weighting nll of data given posterior sample
    self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # init shared var for weighting posterior KL-div from prior
    self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # shared var learning rates for all networks
    self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
    self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
    # shared var momentum parameters for all networks
    self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='vcg_it_count')
    # shared var weights for adversarial classification objective
    self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()

    self.set_disc_weights()  # init adversarial cost weights for GN/DN
    # set a shared var for regularizing the output of the discriminator
    # NOTE(review): this indexes the raw `params` argument rather than
    # self.params, so 'lam_l2d' is mandatory and params=None cannot work.
    self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
            name='vcg_lam_l2d')

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because the VCGair requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this VCGair.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.in_params = [p for p in self.IN.mlp_params]
    # the OSM output log-variance is optimized along with the inferencer
    self.in_params.append(self.OSM.output_logvar)
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on adversarial binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # first, build the cost to be optimized by the discriminator network,
    # in general this will be treated somewhat independently of the
    # optimization of the generator and inferencer networks.
    self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
            self.disc_reg_cost

    # construct costs relevant to the optimization of the generator and
    # inferencer networks (osm_cost is what IN and GN are differentiated
    # against below)
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.other_reg_cost
    # compute total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    # Get the gradient of the relevant cost for all optimizable
    # parameters: dn_cost for the discriminator, osm_cost for IN and GN
    self.joint_grads = OrderedDict()
    print("Computing VCGLoop DN cost gradients...")
    grad_list = T.grad(self.dn_cost, self.dn_params, disconnected_inputs='warn')
    for i, p in enumerate(self.dn_params):
        self.joint_grads[p] = grad_list[i]
    print("Computing VCGLoop IN cost gradients...")
    grad_list = T.grad(self.osm_cost, self.in_params, disconnected_inputs='warn')
    for i, p in enumerate(self.in_params):
        self.joint_grads[p] = grad_list[i]
    print("Computing VCGLoop GN cost gradients...")
    grad_list = T.grad(self.osm_cost, self.gn_params, disconnected_inputs='warn')
    for i, p in enumerate(self.gn_params):
        self.joint_grads[p] = grad_list[i]

    # construct the updates for the discriminator, generator and
    # inferencer networks. all networks share the same first/second
    # moment momentum and iteration count. the networks each have their
    # own learning rates, which lets you turn their learning on/off.
    self.dn_updates = get_param_updates(params=self.dn_params, \
            grads=self.joint_grads, alpha=self.lr_dn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.gn_updates = get_param_updates(params=self.gn_params, \
            grads=self.joint_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.in_updates = get_param_updates(params=self.in_params, \
            grads=self.joint_grads, alpha=self.lr_in, \
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]
    for k in self.in_updates:
        self.joint_updates[k] = self.in_updates[k]
    # construct an update for tracking the mean KL divergence of
    # approximate posteriors for this chain: an exponential moving
    # average (0.98 / 0.02) of the per-step mean KL, averaged over steps
    new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
            sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
    self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

    # construct the function for training on training data
    print("Compiling VCGLoop theano functions....")
    self.train_joint = self._construct_train_joint()
    return
def raw_activation_fast(sl, e1, e2):
    """Return the raw activation for slice ``sl`` given inputs ``e1``, ``e2``.

    The result is the sum of two terms, which are also stored on the
    enclosing object as ``self.Wact`` and ``self.Vact``:

    * ``Wact``: ``e1`` multiplied by ``W[:, :, sl]``, then combined with
      ``e2`` through a batched dot product.
    * ``Vact``: ``V[:, sl]`` (reshaped to a row) applied to the vertical
      stack of ``e1.T`` and ``e2.T``, scaled by 1e-4, plus the bias
      ``b[sl]`` (the bias itself is not scaled).

    ``self`` is taken from the enclosing scope.
    # assumes e1 and e2 are 2-D with one example per row -- TODO confirm
    """
    self.Wact = T.batched_dot(theano.dot(e1, self.W[:, :, sl]), e2)
    # The 1e-4 factor reflects a change of scale.
    # (The original carried this note as a C-style `//` comment after a
    # backslash continuation, which Python rejects as a syntax error.)
    self.Vact = (
        1e-4 * theano.dot(T.reshape(self.V[:, sl], (1, -1)),
                          T.vertical_stack(e1.T, e2.T))
        + self.b[sl]  # Bias part
    )
    return self.Wact + self.Vact
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
        i_net=None, g_net=None, d_net=None, chain_len=None, \
        data_dim=None, prior_dim=None, params=None):
    """
    Build the symbolic graph, costs and per-network updates for a
    self-looped generator/inferencer chain trained with a discriminator.

    Args:
        rng: numpy-style random state; must provide ``randint`` (the
            default of None is not usable).
        Xd: symbolic var with samples for initializing the VAE chain.
        Xc: symbolic var for controlling subsets of the state variables.
        Xm: symbolic var for masking subsets of the state variables.
        Xt: symbolic var with samples from the target distribution.
        i_net: inferencer network handed to GIPair.
        g_net: generator network handed to GIPair.
        d_net: discriminator network; must offer ``shared_param_clone``.
        chain_len: integer number of times to cycle the VAE loop.
        data_dim: dimension of the observations.
        prior_dim: dimension of the latent space.
        params: dict of options; only 'lam_l2d' is read here, and it is
            mandatory (params=None fails on that lookup).
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim

    # symbolic var for inputting samples for initializing the VAE chain
    self.Xd = Xd
    # symbolic var for masking subsets of the state variables
    self.Xm = Xm
    # symbolic var for controlling subsets of the state variables
    self.Xc = Xc
    # symbolic var for inputting samples from the target distribution
    self.Xt = Xt
    # integer number of times to cycle the VAE loop
    self.chain_len = chain_len
    # symbolic vector of row indices for data inputs
    self.It = T.arange(self.Xt.shape[0])
    # symbolic vector of row indices for noise inputs; these start right
    # after the Xt rows
    self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # get a clone of the desired VAE, for easy access
    # NOTE: GIPair is always built with params=None, independently of the
    # `params` argument of this constructor.
    self.GIP = GIPair(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
            g_net=g_net, i_net=i_net, data_dim=self.data_dim, \
            prior_dim=self.prior_dim, params=None, shared_param_dicts=None)
    self.IN = self.GIP.IN
    self.GN = self.GIP.GN
    # self-loop some clones of the main VAE into a chain
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    _Xd = self.Xd
    for i in range(self.chain_len):
        # NOTE(review): both branches below execute exactly the same
        # statements; the split only documents where _Xd comes from.
        if (i == 0):
            # start the chain with data provided by user
            _IN = self.IN.shared_param_clone(rng=rng, Xd=_Xd, \
                    Xc=self.Xc, Xm=self.Xm)
            _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
        else:
            # continue the chain with samples from previous VAE
            _IN = self.IN.shared_param_clone(rng=rng, Xd=_Xd, \
                    Xc=self.Xc, Xm=self.Xm)
            _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
        # the generator output feeds the next step
        _Xd = _GN.output
        self.IN_chain.append(_IN)
        self.GN_chain.append(_GN)
        self.Xg_chain.append(_Xd)

    #Xg_stack = T.vertical_stack(*self.Xg_chain)
    #self.Xg = Xg_stack + (0.1 * self.rng.normal(size=Xg_stack.shape, avg=0.0, \
    #        std=1.0, dtype=theano.config.floatX))

    # make a clone of the desired discriminator network, which will try
    # to discriminate between samples from the training data and samples
    # generated by the self-looped VAE chain. Its input stacks the target
    # samples first, followed by every chain step's output.
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    # one-element floatX array used as the initial value of every scalar
    # hyper-parameter shared variable below
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    # init shared var for weighting nll of data given posterior sample
    self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # init shared var for weighting posterior KL-div from prior
    self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # init shared var for weighting chain diffusion rate (a.k.a. velocity)
    self.lam_chain_vel = theano.shared(value=zero_ary, name='vcg_lam_chain_vel')
    self.set_lam_chain_vel(lam_chain_vel=1.0)
    # init shared var for weighting the masked-reconstruction nll term
    self.lam_mask_nll = theano.shared(value=zero_ary, name='vcg_lam_mask_nll')
    self.set_lam_mask_nll(lam_mask_nll=0.0)
    # init shared var for weighting the masked-reconstruction KL term
    self.lam_mask_kld = theano.shared(value=zero_ary, name='vcg_lam_mask_kld')
    self.set_lam_mask_kld(lam_mask_kld=0.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # shared var learning rates for discriminator, generator, inferencer
    self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
    self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
    # shared var momentum parameters, one per network
    self.mo_dn = theano.shared(value=zero_ary, name='vcg_mo_dn')
    self.mo_gn = theano.shared(value=zero_ary, name='vcg_mo_gn')
    self.mo_in = theano.shared(value=zero_ary, name='vcg_mo_in')
    # shared var weights for adversarial classification objective
    self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_dn_sgd_params() # init SGD rate/momentum for DN
    self.set_gn_sgd_params() # init SGD rate/momentum for GN
    self.set_in_sgd_params() # init SGD rate/momentum for IN

    self.set_disc_weights()  # init adversarial cost weights for GN/DN
    # shared var for regularizing the output of the discriminator
    # NOTE(review): `params` is indexed directly; params=None raises here.
    self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
            name='vcg_lam_l2d')

    # per-step weights for the masked nll cost: linearly increasing from
    # 0 and normalized to sum to one
    # NOTE(review): with chain_len == 1 the only weight is 0, so the
    # normalization divides by zero -- confirm chain_len > 1 is required.
    nll_weights = np.linspace(0.0, 5.0, num=self.chain_len)
    nll_weights = nll_weights / np.sum(nll_weights)
    nll_weights = nll_weights.astype(theano.config.floatX)
    self.mask_nll_weights = theano.shared(value=nll_weights, \
            name='vcg_mask_nll_weights')

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because the VCGair requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this VCGair.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.in_params = [p for p in self.IN.mlp_params]
    self.gn_params = [p for p in self.GN.mlp_params]

    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on adversarial binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # first, build the cost to be optimized by the discriminator network,
    # in general this will be treated somewhat independently of the
    # optimization of the generator and inferencer networks.
    self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
            self.disc_reg_cost

    # construct costs relevant to the optimization of the generator and
    # inferencer networks (gip_cost is what IN and GN are differentiated
    # against below)
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(data_weight=0.9)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(data_weight=0.9)
    self.chain_vel_cost = self.lam_chain_vel[0] * \
            self._construct_chain_vel_cost()
    self.mask_nll_cost = self.lam_mask_nll[0] * \
            self._construct_mask_nll_cost()
    self.mask_kld_cost = self.lam_mask_kld[0] * \
            self._construct_mask_kld_cost()
    self.other_reg_cost = self._construct_other_reg_cost()
    self.gip_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.chain_vel_cost + \
            self.mask_nll_cost + self.mask_kld_cost + \
            self.other_reg_cost
    # compute total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.gip_cost

    # Initialize momentums for mini-batch SGD updates. All parameters need
    # to be safely nestled in their lists by now. Each momentum buffer has
    # the shape of its parameter and starts filled with 5.0.
    self.joint_moms = OrderedDict()
    self.dn_moms = OrderedDict()
    self.in_moms = OrderedDict()
    self.gn_moms = OrderedDict()
    for p in self.dn_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.dn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.dn_moms[p]
    for p in self.in_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.in_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.in_moms[p]
    for p in self.gn_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.gn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.gn_moms[p]

    # Construct the updates for the generator, inferencer and
    # discriminator networks. In every loop below the "momentum" is a
    # running average of the squared, clipped gradient, and the parameter
    # step is the gradient divided by sqrt(momentum + 1e-3), scaled by the
    # network's learning rate.
    self.joint_updates = OrderedDict()
    self.dn_updates = OrderedDict()
    self.in_updates = OrderedDict()
    self.gn_updates = OrderedDict()
    ###########################################
    # Construct updates for the discriminator #
    ###########################################
    for var in self.dn_params:
        # these updates are for trainable params in the discriminator net...
        # first, get gradient of cost w.r.t. var (clipped to [-0.1, 0.1])
        var_grad = T.grad(self.dn_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
        # get the momentum for this var
        var_mom = self.dn_moms[var]
        # update the momentum for this var using its grad
        self.dn_updates[var_mom] = (self.mo_dn[0] * var_mom) + \
                ((1.0 - self.mo_dn[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.dn_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_dn[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
        self.dn_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.dn_updates[var]
    ########################################
    # Construct updates for the inferencer #
    ########################################
    for var in self.in_params:
        # these updates are for trainable params in the inferencer net...
        # first, get gradient of cost w.r.t. var (clipped to [-0.1, 0.1])
        var_grad = T.grad(self.gip_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
        # get the momentum for this var
        var_mom = self.in_moms[var]
        # update the momentum for this var using its grad
        self.in_updates[var_mom] = (self.mo_in[0] * var_mom) + \
                ((1.0 - self.mo_in[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.in_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_in[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
        self.in_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.in_updates[var]
    #######################################
    # Construct updates for the generator #
    #######################################
    for var in self.gn_params:
        # these updates are for trainable params in the generator net...
        # first, get gradient of cost w.r.t. var (clipped to [-0.1, 0.1])
        var_grad = T.grad(self.gip_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
        # get the momentum for this var
        var_mom = self.gn_moms[var]
        # update the momentum for this var using its grad
        self.gn_updates[var_mom] = (self.mo_gn[0] * var_mom) + \
                ((1.0 - self.mo_gn[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.gn_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_gn[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
        self.gn_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.gn_updates[var]

    # Construct the function for training on training data
    self.train_joint = self._construct_train_joint()

    # Expose the GIPair's sampler for running the chain from a batch of
    # data under this object's name.
    self.sample_chain_from_data = self.GIP.sample_gil_from_data
    return
def __init__(self, rng=None, x_d=None, x_t=None, \
        i_net=None, g_net=None, d_net=None, \
        chain_len=None, data_dim=None, z_dim=None, \
        params=None):
    """
    Build the symbolic graph, costs, gradients and updates for a VAE
    self-looped through ``theano.scan`` and trained with a discriminator.

    Args:
        rng: numpy-style random state; must provide ``randint`` (the
            default of None is not usable).
        x_d: symbolic var with the initial input for starting the chain.
        x_t: symbolic var with samples from the target distribution.
        i_net: inferencer network, passed to OneStageModel as q_z_given_x.
        g_net: generator network, passed to OneStageModel as p_x_given_z.
        d_net: discriminator network; must offer ``shared_param_clone``.
        chain_len: integer number of steps for chain unrolling.
        data_dim: dimension of the observations (OneStageModel x_dim).
        z_dim: dimension of the latent space (OneStageModel z_dim).
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
            'lam_l2d' are read unconditionally; 'cost_decay',
            'chain_type', 'xt_transform' and 'logvar_bound' are optional.
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.z_dim = z_dim
    self.p_z_mean = 0.0
    self.p_z_logvar = 0.0
    if params is None:
        self.params = {}
    else:
        self.params = params
    # optional settings, each with a fallback default
    if 'cost_decay' in self.params:
        self.cost_decay = self.params['cost_decay']
    else:
        self.cost_decay = 0.1
    if 'chain_type' in self.params:
        assert((self.params['chain_type'] == 'walkback') or \
            (self.params['chain_type'] == 'walkout'))
        self.chain_type = self.params['chain_type']
    else:
        self.chain_type = 'walkout'
    if 'xt_transform' in self.params:
        assert((self.params['xt_transform'] == 'sigmoid') or \
                (self.params['xt_transform'] == 'none'))
        if self.params['xt_transform'] == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x
    else:
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    if 'logvar_bound' in self.params:
        self.logvar_bound = self.params['logvar_bound']
    else:
        self.logvar_bound = 10
    #
    # x_type: this tells if we're using bernoulli or gaussian model for
    #         the observations
    # NOTE: this key is mandatory -- with params=None the lookup below
    # raises KeyError.
    #
    self.x_type = self.params['x_type']
    assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

    # grab symbolic input variables
    self.x_d = x_d             # initial input for starting the chain
    self.x_t = x_t             # samples from target distribution
    self.z_zmuv = T.tensor3()  # ZMUV gaussian samples for use in scan
    # get the number of steps for chain unrolling
    self.chain_len = chain_len
    # symbolic vector of row indices for inputs from target distribution
    self.It = T.arange(self.x_t.shape[0])
    # symbolic vector of row indices for noise/generated inputs; these
    # start right after the x_t rows
    self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + self.x_t.shape[0]

    # get a clone of the desired VAE, for easy access
    self.OSM = OneStageModel(rng=rng, x_in=self.x_d, \
                             p_x_given_z=g_net, q_z_given_x=i_net, \
                             x_dim=self.data_dim, z_dim=self.z_dim, \
                             params=self.params)
    self.IN = self.OSM.q_z_given_x
    self.GN = self.OSM.p_x_given_z
    self.transform_x_to_z = self.OSM.transform_x_to_z
    self.transform_z_to_x = self.OSM.transform_z_to_x
    self.bounded_logvar = self.OSM.bounded_logvar

    ##################################################
    # self-loop the VAE into a multi-step Markov chain.
    #  ** All VAEs in the chain share the same Xc and Xm, which are the
    #     symbolic inputs for providing the observed portion of the input
    #     and a mask indicating which part of the input is "observed".
    #     These inputs are used for training "reconstruction" policies.
    # NOTE(review): this constructor takes no Xc/Xm arguments, so the
    # remark above looks inherited from another variant -- confirm.
    ##################################################
    # Setup the iterative generation loop using scan #
    ##################################################
    def chain_step_func(zi_zmuv, xim1):
        # One chain step: zi_zmuv is this step's slice of self.z_zmuv,
        # xim1 is the previous step's generated x (self.x_d at step 0).
        # get mean and logvar of z samples for this step
        zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
        # transform ZMUV samples to get desired samples
        zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
        # get the next generated xi (pre-transformation)
        outputs = self.GN.apply(zi)
        xti = outputs[-1]
        # apply the observation "mean" transform
        xgi = self.xt_transform(xti)
        # compute NLL for this step: 'walkout' always scores against the
        # chain's initial input, otherwise against the previous step
        if self.chain_type == 'walkout':
            x_true = self.x_d
        else:
            x_true = xim1
        nlli = self._log_prob(x_true, xgi).flatten()
        kldi = T.sum(gaussian_kld(zi_mean, zi_logvar, \
                     self.p_z_mean, self.p_z_logvar), axis=1)
        return xgi, nlli, kldi

    # apply the scan op; only the first output (xgi) is fed back as the
    # recurrent state, the other two are plain per-step outputs
    init_values = [self.x_d, None, None]
    self.scan_results, self.scan_updates = \
            theano.scan(chain_step_func, outputs_info=init_values, \
                        sequences=self.z_zmuv)
    # NOTE: self.scan_updates is stored but not merged into
    # self.joint_updates in this constructor.
    # get the outputs of the scan op
    self.xgi = self.scan_results[0]
    self.nlli = self.scan_results[1]
    self.kldi = self.scan_results[2]
    # one symbolic slice per step; presumes z_zmuv supplies at least
    # chain_len steps at run time
    self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

    # make a clone of the desired discriminator network, which will try
    # to discriminate between samples from the training data and samples
    # generated by the self-looped VAE chain. Its input stacks the target
    # samples first, followed by every chain step's output.
    self.DN = d_net.shared_param_clone(rng=rng, \
            Xd=T.vertical_stack(self.x_t, *self.xgi_list))

    # one-element floatX array used as the initial value of every scalar
    # hyper-parameter shared variable below
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    # init shared var for weighting nll of data given posterior sample
    self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # init shared var for weighting posterior KL-div from prior
    self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # init shared var for controlling l2 regularization on params
    self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # shared var learning rates for all networks
    self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
    self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
    # shared var momentum parameters for all networks
    self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
    # shared var weights for adversarial classification objective
    self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
    # init parameters for controlling learning dynamics
    self.set_all_sgd_params()
    # init adversarial cost weights for GN/DN
    self.set_disc_weights()
    # set a shared var for regularizing the output of the discriminator
    # NOTE(review): this indexes the raw `params` argument rather than
    # self.params, so 'lam_l2d' is mandatory and params=None cannot work.
    self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
            name='vcg_lam_l2d')

    # Grab the full set of "optimizable" parameters from the generator
    # and discriminator networks that we'll be working with. We need to
    # ignore parameters in the final layers of the proto-networks in the
    # discriminator network (a generalized pseudo-ensemble). We ignore them
    # because the VCGair requires that they be "bypassed" in favor of some
    # binary classification layers that will be managed by this VCGair.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.in_params = [p for p in self.IN.mlp_params]
    self.gn_params = [p for p in self.GN.mlp_params]
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # Now construct a binary discriminator layer for each proto-net in the
    # discriminator network. And, add their params to optimization list.
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
            T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # Construct costs for the generator and discriminator networks based
    # on adversarial binary classification
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # first, build the cost to be optimized by the discriminator network,
    # in general this will be treated somewhat independently of the
    # optimization of the generator and inferencer networks.
    self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

    # construct costs relevant to the optimization of the generator and
    # inferencer networks (osm_cost is what IN and GN are differentiated
    # against below)
    self.chain_nll_cost = self.lam_chain_nll[0] * \
            self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
            self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
            self.chain_kld_cost + self.other_reg_cost
    # compute total cost on the discriminator and VB generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    print("Computing VCGLoop joint_grad...")
    # grab the gradients for all parameters to optimize: dn_cost for the
    # discriminator, osm_cost for the inferencer and generator
    self.joint_grads = OrderedDict()
    for p in self.dn_params:
        self.joint_grads[p] = T.grad(self.dn_cost, p)
    for p in self.in_params:
        self.joint_grads[p] = T.grad(self.osm_cost, p)
    for p in self.gn_params:
        self.joint_grads[p] = T.grad(self.osm_cost, p)

    # construct the updates for the discriminator, generator and
    # inferencer networks. all networks share the same first/second
    # moment momentum. the networks each have their own learning rates,
    # which lets you turn their learning on/off.
    self.dn_updates = get_adam_updates(params=self.dn_params, \
            grads=self.joint_grads, alpha=self.lr_dn, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    self.in_updates = get_adam_updates(params=self.in_params, \
            grads=self.joint_grads, alpha=self.lr_in, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
    self.gn_updates = get_adam_updates(params=self.gn_params, \
            grads=self.joint_grads, alpha=self.lr_gn, \
            beta1=self.mom_1, beta2=self.mom_2, \
            mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

    # bag up all the updates required for training
    self.joint_updates = OrderedDict()
    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.in_updates:
        self.joint_updates[k] = self.in_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]

    print("Compiling VCGLoop train_joint...")
    # construct the function for training on training data
    self.train_joint = self._construct_train_joint()
    return
def atData(input, left, right, Slen):
    """Return the word rows of a sentence with the two entity rows removed.

    Args:
        input: Indexable whose first element is the (padded) sentence matrix.
        left: Position of one entity, before head padding is added.
        right: Position of the other entity, before head padding is added.
            ``left`` and ``right`` may be given in either order.
        Slen: Sentence length used to bound the returned slice.

    Returns:
        Rows ``4:Slen + 2`` of the sentence after both entity rows have been
        cut out (``(Slen - 2) x 60`` according to the original author's note).
    """
    sentence = input[0]

    # Order the two entity positions symbolically and shift them past the
    # head padding. The names deliberately avoid shadowing builtins min/max.
    left_is_first = T.lt(left, right)
    first = T.switch(left_is_first, left, right) + _N_PAD_HEAD
    second = T.switch(left_is_first, right, left) + _N_PAD_HEAD

    # Everything before the first entity, between the entities, and after
    # the second entity (86 x 60 according to the original author's note).
    without_entities = T.vertical_stack(
        sentence[:first],
        sentence[first + 1:second],
        sentence[second + 1:],
    )

    # NOTE: an earlier attention-based variant that concatenated the two
    # entity rows onto every word was disabled here; only the plain word rows
    # are returned.
    return without_entities[4:Slen + 2]
def vstack(tensors):
    """Vertically stack every tensor in the iterable ``tensors``."""
    stacked = T.vertical_stack(*tensors)
    return stacked
def __init__(self, rng=None,
             Xd=None, Yd=None, Xc=None, Xm=None,
             g_net=None, i_net=None, p_net=None,
             data_dim=None, prior_dim=None, label_dim=None,
             params=None):
    """Chain an inferencer, a generator and a label predictor for joint training.

    Each symbolic input is stacked on top of itself, fed through
    shared-parameter clones of the three networks, and the resulting costs,
    gradients and Adam updates are assembled into ``self.train_joint``.

    Args:
        rng: Random source; must provide ``randint``.
        Xd, Yd, Xc, Xm: Symbolic inputs for the computation graph.
        g_net: Generator network (must not use a decoder).
        i_net: Inferencer network (must not use an encoder).
        p_net: Label predictor with exactly one proto-net and one spawn-net.
        data_dim: Dimensionality of the data.
        prior_dim: Dimensionality of the latent variable.
        label_dim: Number of categories predicted by ``p_net``.
        params: Not read by this constructor.
    """
    # TODO: refactor for use with "encoded" inferencer/generator
    assert not (i_net.use_encoder or g_net.use_decoder)

    # private random stream for this GIStack
    self.rng = RandStream(rng.randint(100000))

    # symbolic inputs, plus a doubled copy of each (input stacked on itself)
    self.Xd, self.Yd, self.Xc, self.Xm = Xd, Yd, Xc, Xm
    self.Xd2 = T.vertical_stack(self.Xd, self.Xd)
    self.Yd2 = T.vertical_stack(self.Yd, self.Yd)
    self.Xc2 = T.vertical_stack(self.Xc, self.Xc)
    self.Xm2 = T.vertical_stack(self.Xm, self.Xm)
    self.obs_count = T.cast(self.Xd2.shape[0], 'floatX')

    # dimensions handled by this GIStack
    self.data_dim = data_dim
    self.label_dim = label_dim
    self.prior_dim = prior_dim

    # inferencer clone -> latent samples
    self.IN2 = i_net.shared_param_clone(
        rng=rng, Xd=self.Xd2, Xc=self.Xc2, Xm=self.Xm2)
    self.Xp2 = self.IN2.output
    # latent samples -> generator clone -> generated observations
    self.GN2 = g_net.shared_param_clone(rng=rng, Xp=self.Xp2)
    self.Xg2 = self.GN2.output
    # latent samples -> label-generator clone -> noisy / noise-free predictions
    self.PN2 = p_net.shared_param_clone(rng=rng, Xd=self.Xp2)
    self.Yp2 = self.PN2.output_spawn[0]
    self.Yp2_proto = self.PN2.output_proto

    # the PeaNet must have exactly one proto-net and one spawn-net
    assert len(self.PN2.proto_nets) == 1
    assert len(self.PN2.spawn_nets) == 1
    # every network must agree on the latent dimension
    assert self.prior_dim == self.IN2.mu_layers[-1].out_dim
    assert self.prior_dim == self.IN2.sigma_layers[-1].out_dim
    assert self.prior_dim == self.GN2.mlp_layers[0].in_dim
    assert self.prior_dim == self.PN2.proto_nets[0][0].in_dim
    # and the label cardinality must match the predictor's output layer
    assert self.label_dim == self.PN2.proto_nets[0][-1].out_dim

    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def shared_scalar(name):
        # one-element shared variable initialised to zero
        return theano.shared(value=zero_ary, name=name)

    # per-network learning rates
    self.lr_gn = shared_scalar('gis_lr_gn')
    self.lr_in = shared_scalar('gis_lr_in')
    self.lr_pn = shared_scalar('gis_lr_pn')
    # momentum parameters and iteration counter shared by all networks
    self.mom_1 = shared_scalar('gis_mom_1')
    self.mom_2 = shared_scalar('gis_mom_2')
    self.it_count = shared_scalar('gis_it_count')
    self.set_all_sgd_params()

    # weight on the NLL of data given a posterior sample
    self.lam_nll = shared_scalar('gis_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weight on the posterior KL-divergence from the prior
    self.lam_kld = shared_scalar('gis_lam_kld')
    self.set_lam_kld(lam_kld=1.0)
    # weight on semi-supervised classification
    self.lam_cat = shared_scalar('gis_lam_cat')
    self.set_lam_cat(lam_cat=0.0)
    # weights on the PEA cost for supervised / unsupervised inputs
    self.lam_pea_su = shared_scalar('gis_lam_pea_su')
    self.lam_pea_un = shared_scalar('gis_lam_pea_un')
    self.set_lam_pea(lam_pea_su=1.0, lam_pea_un=1.0)
    # weight on l2 regularization of the parameters
    self.lam_l2w = shared_scalar('gis_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-3)

    # optimizable parameters of the three cloned networks
    self.gn_params = list(self.GN2.mlp_params)
    self.in_params = list(self.IN2.mlp_params)
    self.pn_params = list(self.PN2.proto_params)
    self.joint_params = self.pn_params + self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    pea_cost_su, pea_cost_un = self._construct_post_pea_costs()
    self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
    self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
    self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
    self.post_pea_cost = (self.lam_pea_su[0] * pea_cost_su) + \
        (self.lam_pea_un[0] * pea_cost_un)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
        self.post_cat_cost + self.post_pea_cost + self.other_reg_cost

    # element-wise clipped gradient of the joint cost for every parameter
    self.joint_grads = OrderedDict(
        (p, T.grad(self.joint_cost, p).clip(-0.1, 0.1))
        for p in self.joint_params)

    # Adam updates per network; only the parameter list and the learning
    # rate differ between the three calls.
    # (An Adadelta variant via get_adadelta_updates with beta1=0.98 was
    # previously tried here.)
    adam_kwargs = dict(
        grads=self.joint_grads,
        beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
        mom2_init=1e-3, smoothing=1e-8)
    self.gn_updates = get_adam_updates(
        params=self.gn_params, alpha=self.lr_gn, **adam_kwargs)
    self.in_updates = get_adam_updates(
        params=self.in_params, alpha=self.lr_in, **adam_kwargs)
    self.pn_updates = get_adam_updates(
        params=self.pn_params, alpha=self.lr_pn, **adam_kwargs)

    # collect every update needed for joint training
    self.joint_updates = OrderedDict()
    for net_updates in (self.gn_updates, self.in_updates, self.pn_updates):
        for k in net_updates:
            self.joint_updates[k] = net_updates[k]

    # One training function for all parameters; training of an individual
    # network can be switched on/off through its learning rate.
    self.train_joint = self._construct_train_joint()
def raw_activation_fast(sl, e1, e2):
    """Return the pre-activation for slice ``sl`` given inputs ``e1`` and ``e2``.

    The result is the sum of three terms: a batched product through
    ``self.W[:, :, sl]``, a linear term through ``self.V[:, sl]`` applied to
    the vertically stacked transposes of ``e1`` and ``e2``, and the bias
    ``self.b[sl]``.
    """
    # e1 projected through the sl-th slice of W, then batched-dotted with e2
    bilinear_term = T.batched_dot(theano.dot(e1, self.W[:, :, sl]), e2)
    # sl-th column of V as a row vector, applied to [e1.T; e2.T]
    v_row = T.reshape(self.V[:, sl], (1, -1))
    linear_term = theano.dot(v_row, T.vertical_stack(e1.T, e2.T))
    # bias part
    bias_term = self.b[sl]
    return bilinear_term + linear_term + bias_term
def __init__(self, d, V, r, nc, nf, pairwise_constraint=False,
             embeddings=None, fix_embeddings=False):
    """Create the tree-RNN parameters and compile its Theano functions.

    Compiles ``self.states``, ``self.classify`` and ``self.cost_and_grad``.

    Args:
        d: Dimensionality of embeddings.
        V: Size of vocabulary.
        r: Number of dependency relations.
        nc: Number of classes for classification.
        nf: Number of fine-grained features. Not read while the feature
            weights (``beta``) are disabled.
        pairwise_constraint: When true, ``cost_and_grad`` is built over two
            event pairs (four trees) with a similarity term weighted by
            ``lambda_e``; otherwise over a single pair.
        embeddings: Optional initial |V| x d embedding matrix; drawn at
            random when ``None``.
        fix_embeddings: When true, the embeddings are left out of
            ``self.params``.
    """
    floatX = theano.config.floatX

    def uniform_init(shape):
        # Cast the numpy VALUE to floatX. Calling ``.astype`` on the shared
        # variable instead yields a symbolic cast node (not a trainable
        # shared parameter) whenever the dtypes differ.
        return (0.2 * np.random.uniform(-1.0, 1.0, shape)).astype(floatX)

    # |V| x d embedding matrix
    if embeddings is None:
        self.We = theano.shared(name='embeddings', value=uniform_init((V, d)))
    else:
        self.We = theano.shared(
            name='embeddings', value=np.asarray(embeddings, dtype=floatX))

    # r x d x d tensor (one matrix per dependency relation)
    self.Wr = theano.shared(name='dependencies', value=uniform_init((r, d, d)))
    # d x d map from embedding to hidden vector
    self.Wv = theano.shared(name='Wv', value=uniform_init((d, d)))
    # d-long bias vector
    self.b = theano.shared(name='b', value=np.zeros(d, dtype=floatX))

    # Low-dimensional approximation to the classification parameters:
    # three d-vectors per class.
    self.a = []
    for cls_idx in range(nc):
        cls_vectors = []
        for vec_idx in range(3):
            cls_vectors.append(theano.shared(
                name='a_{}_{}'.format(cls_idx, vec_idx),
                value=uniform_init(d)))
        self.a.append(cls_vectors)

    self.pairwise_constraint = pairwise_constraint

    class_vectors = [vec for cls_vectors in self.a for vec in cls_vectors]
    if fix_embeddings:
        self.params = [self.Wr, self.Wv, self.b] + class_vectors
    else:
        self.params = [self.We, self.Wr, self.Wv, self.b] + class_vectors

    self.descender = Adagrad(self.params)
    self.f = normalized_tanh

    def recurrence(n, hidden_states, hidden_sums, x, r, p):
        # At each node n in the tree, calculate
        # Wr(p, n) . f(Wv . We_word(n) + b + sum_n) and add it to sum_p.
        h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])
        sum_n = T.dot(r[n], h_n)
        return (T.set_subtensor(hidden_states[n], h_n),
                T.inc_subtensor(hidden_sums[p[n]], sum_n))

    num_events = 4 if pairwise_constraint else 2

    idxs = []
    rel_idxs = []
    parents = []
    h = []
    for _ in range(num_events):
        word_idx = T.ivector('idxs')
        rel_idx = T.ivector('rel_idxs')
        parent_idx = T.ivector('parents')
        word_vecs = self.We[word_idx]
        rel_mats = self.Wr[rel_idx]
        init_states = T.zeros((word_idx.shape[0], d), dtype=floatX)
        # needs to be sent_length + 1 to store the final sum
        init_sums = T.zeros((word_idx.shape[0] + 1, d), dtype=floatX)
        (event_states, _event_sums), _ = theano.scan(
            fn=recurrence,
            sequences=T.arange(word_vecs.shape[0]),
            outputs_info=[init_states, init_sums],
            non_sequences=[word_vecs, rel_mats, parent_idx])
        idxs.append(word_idx)
        rel_idxs.append(rel_idx)
        parents.append(parent_idx)
        h.append(event_states)

    # One d x d matrix per class: outer(a0, a1) + diag(a2).
    A_stack = [
        T.dot(cls_vectors[0].reshape((d, 1)), cls_vectors[1].reshape((1, d)))
        + T.nlinalg.diag(cls_vectors[2])
        for cls_vectors in self.a
    ]
    # NOTE(review): stacking gives (nc * d, d); the reshape to (d, d, nc)
    # reinterprets memory, so A[:, :, k] is not A_stack[k]. Left unchanged
    # because altering it would change the model parameterisation.
    A = T.vertical_stack(*A_stack).reshape((d, d, nc))

    pair_inputs = [idxs[0], rel_idxs[0], parents[0],
                   idxs[1], rel_idxs[1], parents[1]]

    self.states = theano.function(inputs=pair_inputs, outputs=[h[0], h[1]])

    def class_probs(left_states, right_states):
        # softmax over classes from the final hidden states of two trees
        return T.nnet.softmax(
            T.dot(left_states[-1, -1], A).T.dot(right_states[-1, -1]))

    p_y_given_x = class_probs(h[0], h[1])
    y_pred = T.argmax(p_y_given_x, axis=1)
    self.classify = theano.function(inputs=pair_inputs, outputs=y_pred)

    y = T.iscalar('y')
    if not pairwise_constraint:
        sentence_nll = -(T.log(p_y_given_x)[0, y])
        grad = T.grad(sentence_nll, self.params)
        self.cost_and_grad = theano.function(
            inputs=pair_inputs + [y],
            outputs=[sentence_nll] + grad)
    else:
        lambda_e = T.scalar('lambda_e')
        # The fine-grained feature weights (beta) are disabled throughout
        # this constructor, so the feature vectors do not enter the cost.
        # This branch previously referenced the undefined ``phi`` and
        # ``self.beta`` and failed at construction. ``phi``/``phi2`` stay in
        # the input list, ignored, so the positional call signature of
        # ``cost_and_grad`` is the one originally written.
        phi = T.vector('phi')
        phi2 = T.vector('phi2')
        p_y_given_x2 = class_probs(h[2], h[3])
        sentence_nll = -(T.log(p_y_given_x)[0, y]) - (
            T.log(p_y_given_x2)[0, y])
        # add constraint that events should be maximally similar
        cost = (sentence_nll
                - lambda_e * T.dot(h[0][-1, -1], h[2][-1, -1])
                - lambda_e * T.dot(h[1][-1, -1], h[3][-1, -1]))
        grad = T.grad(cost, self.params)
        self.cost_and_grad = theano.function(
            inputs=[
                idxs[0], rel_idxs[0], parents[0],
                idxs[1], rel_idxs[1], parents[1], phi,
                idxs[2], rel_idxs[2], parents[2],
                idxs[3], rel_idxs[3], parents[3], phi2,
                y, theano.In(lambda_e, value=1)
            ],
            outputs=[cost] + grad,
            on_unused_input='ignore')
def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None,
             data_dim=None, params=None):
    """Pair a generator with a discriminator and build their training functions.

    Args:
        rng: Random source; must provide ``randint``.
        Xd: Symbolic variable for samples from the data distribution.
        Xp: Symbolic variable for samples from the generator's prior.
        d_net: Discriminator network; cloned to read ``Xd`` stacked on top of
            the generator's output.
        g_net: Generator network; cloned to read ``Xp``.
        data_dim: Dimensionality of the data.
        params: Mapping that must contain ``lam_l2d``, ``mom_mix_rate``,
            ``mom_match_weight``, ``target_mean`` and ``target_cov``, and may
            contain ``mom_match_proj``.
    """
    floatX = theano.config.floatX

    self.rng = theano.tensor.shared_randomstreams.RandomStreams(
        rng.randint(100000))
    self.data_dim = data_dim
    # symbolic inputs: data samples and prior (noise) samples
    self.Xd = Xd
    self.Xp = Xp
    # symbolic index vectors for data inputs and noise inputs
    self.Id = T.lvector(name='gcp_Id')
    self.In = T.lvector(name='gcp_In')

    # Clone the generator onto Xp, then clone the discriminator onto the
    # data samples stacked above the generator's output.
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.Xp)
    disc_input = T.vertical_stack(Xd, self.GN.output)
    self.DN = d_net.shared_param_clone(rng=rng, Xd=disc_input)

    # one-element shared variables: learning rates, momentum parameters and
    # collaborative-classification weights for generator / discriminator
    zero_ary = np.zeros((1,)).astype(floatX)
    self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
    self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
    self.mo_gn = theano.shared(value=zero_ary, name='gcp_mo_gn')
    self.mo_dn = theano.shared(value=zero_ary, name='gcp_mo_dn')
    self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
    self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
    self.set_gn_sgd_params()  # SGD rate/momentum for GN
    self.set_dn_sgd_params()  # SGD rate/momentum for DN
    self.set_disc_weights()   # collaborative cost weights for GN/DN
    self.lam_l2d = theano.shared(
        value=(zero_ary + params['lam_l2d']), name='gcp_lam_l2d')

    #######################################################
    # Moment matching                                     #
    #######################################################
    # The moment matching cost compares exponentially-decaying estimates of
    # the mean and covariance of the generator's output distribution with a
    # target mean and covariance. Matching may be done on the raw generator
    # output or on a linear projection P of it; the target must live in the
    # space where matching is done. Ignoring the running averages, the
    # computation is:
    #
    #   Xm = X - np.mean(X, axis=0)
    #   XmP = np.dot(Xm, P)
    #   C = np.dot(XmP.T, XmP)
    #
    # self.mom_mix_rate sets how much the current batch contributes to the
    # running estimates: the mean estimate is updated first, the batch is
    # centred with the updated mean, and the covariance estimate is then
    # updated from the centred batch. self.mom_match_weight holds the
    # configured weight of the cost, self.target_mean / self.target_cov the
    # target moments, and self.mom_match_proj the optional projection.
    unit_zero = np.zeros((1,))
    mix_rate = unit_zero + params['mom_mix_rate']
    self.mom_mix_rate = theano.shared(
        name='gcp_mom_mix_rate', value=mix_rate.astype(floatX))
    match_weight = unit_zero + params['mom_match_weight']
    self.mom_match_weight = theano.shared(
        name='gcp_mom_match_weight', value=match_weight.astype(floatX))

    targ_mean = params['target_mean'].astype(floatX)
    targ_cov = params['target_cov'].astype(floatX)
    assert targ_mean.size == targ_cov.shape[0]  # mean and cov use same dim
    assert targ_cov.shape[0] == targ_cov.shape[1]  # cov must be square
    self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
    self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')

    # identity projection unless the caller supplied one
    if 'mom_match_proj' in params:
        proj = params['mom_match_proj']
    else:
        proj = np.identity(targ_cov.shape[0])
    assert proj.shape[0] == self.data_dim  # transform matches data dim
    assert proj.shape[1] == targ_cov.shape[0]  # and matches mean/cov dims
    self.mom_match_proj = theano.shared(
        value=proj.astype(floatX), name='gcp_mom_map_proj')

    # moment matching cost plus updates for the running mean/covariance
    self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()

    # Optimizable parameters. The final layer of every proto-network in the
    # discriminator is skipped because it is bypassed in favour of binary
    # classification layers managed by this object.
    self.dn_params = []
    for proto_net in self.DN.proto_nets:
        for layer in proto_net[0:-1]:
            self.dn_params.extend(layer.params)
    self.gn_params = list(self.GN.mlp_params)

    # Build a binary discriminator layer per proto-net (per the original
    # comment, this also adds their params to the optimization list).
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
        T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # collaborative binary classification costs
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # Discriminator cost: classification cost plus its regularizers.
    # Generator cost: classification cost, moment matching cost and the
    # generator's activation regularizer.
    self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + self.disc_reg_cost
    self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.GN.act_reg_cost
    self.joint_cost = self.dn_cost + self.gn_cost

    # Per-parameter running squared-gradient accumulators, started at 2.0.
    # All parameters must already be in their lists at this point.
    self.joint_moms = OrderedDict()
    self.dn_moms = OrderedDict()
    self.gn_moms = OrderedDict()

    def init_moms(param_list, net_moms):
        for p in param_list:
            start = np.zeros(p.get_value(borrow=True).shape) + 2.0
            net_moms[p] = theano.shared(value=start.astype(floatX))
            self.joint_moms[p] = net_moms[p]

    init_moms(self.gn_params, self.gn_moms)
    init_moms(self.dn_params, self.dn_moms)

    self.joint_updates = OrderedDict()
    self.dn_updates = OrderedDict()
    self.gn_updates = OrderedDict()

    def add_param_updates(cost, param_list, net_moms, decay, rate, net_updates):
        # For each parameter: refresh its squared-gradient accumulator, then
        # step along the gradient scaled by the accumulator's square root.
        # Every update is recorded for the network and in the joint updates.
        for var in param_list:
            var_grad = T.grad(
                cost, var,
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov])
            var_mom = net_moms[var]
            net_updates[var_mom] = (decay[0] * var_mom) + \
                ((1.0 - decay[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = net_updates[var_mom]
            net_updates[var] = var - \
                (rate[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.joint_updates[var] = net_updates[var]

    # discriminator parameters
    add_param_updates(self.dn_cost, self.dn_params, self.dn_moms,
                      self.mo_dn, self.lr_dn, self.dn_updates)

    # running first/second-order moment estimates of the generator's
    # distribution
    for var in self.mom_updates:
        self.gn_updates[var] = self.mom_updates[var]
        self.joint_updates[var] = self.gn_updates[var]

    # generator parameters
    add_param_updates(self.gn_cost, self.gn_params, self.gn_moms,
                      self.mo_gn, self.lr_gn, self.gn_updates)

    # batch-based training functions: generator, discriminator and joint
    self.train_gn = self._construct_train_gn()
    self.train_dn = self._construct_train_dn()
    self.train_joint = self._construct_train_joint()

    # sampler for generator outputs given a batch of noise
    self.sample_from_gn = self.GN.sample_from_model
def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None,
             obs_dim=None, z_dim=None, params=None):
    """Pair a generator with a discriminator and build Adam-based training.

    Args:
        rng: Random source; must provide ``randint``.
        Xd: Symbolic variable for samples from the data distribution.
        Xp: Symbolic variable for samples from the generator's prior.
        d_net: Discriminator network; cloned to read ``Xd`` stacked on top of
            the transformed generator samples.
        g_net: Generator network; its first shared layer must take
            ``z_dim`` inputs.
        obs_dim: Dimensionality of the observations.
        z_dim: Dimensionality of the generator's input.
        params: Mapping that must contain ``lam_l2d``, ``mom_mix_rate``,
            ``mom_match_weight``, ``target_mean`` and ``target_cov``, and may
            contain ``obs_transform`` (``'sigmoid'`` or ``'none'``) and
            ``mom_match_proj``.
    """
    self.rng = RandStream(rng.randint(100000))
    self.obs_dim = obs_dim
    self.z_dim = z_dim
    self.params = params

    # check that z_dim agrees with input dim for g_net
    assert self.z_dim == g_net.shared_layers[0].in_dim

    # transform applied to the generator's raw output (sigmoid by default)
    if 'obs_transform' in self.params:
        assert ((self.params['obs_transform'] == 'sigmoid') or
                (self.params['obs_transform'] == 'none'))
        if self.params['obs_transform'] == 'sigmoid':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
    else:
        self.obs_transform = lambda x: T.nnet.sigmoid(x)

    # symbolic inputs: data samples and prior (noise) samples
    self.Xd = Xd
    self.Xp = Xp
    # symbolic index vectors for data inputs and noise inputs
    self.Id = T.lvector(name='gcp_Id')
    self.In = T.lvector(name='gcp_In')

    # Clone the generator onto Xp, sample from it, and clone the
    # discriminator onto the data stacked above the transformed samples.
    self.GN = g_net.shared_param_clone(rng=rng, Xd=self.Xp)
    self.out_mean, self.out_logvar, self.out_samples = \
        self.GN.apply(self.Xp, do_samples=True)
    self.Xg = self.obs_transform(self.out_samples)
    self.DN = d_net.shared_param_clone(
        rng=rng, Xd=T.vertical_stack(self.Xd, self.Xg))

    # one-element shared variables: learning rates, Adam momentum parameters
    # and iteration count, and collaborative-classification weights
    zero_ary = to_fX(np.zeros((1,)))
    self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
    self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
    self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
    self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
    self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
    self.set_sgd_params()    # SGD rate/momentum
    self.set_disc_weights()  # collaborative cost weights for GN/DN
    self.lam_l2d = theano.shared(
        value=(zero_ary + self.params['lam_l2d']), name='gcp_lam_l2d')

    #######################################################
    # Moment matching                                     #
    #######################################################
    # The moment matching cost compares exponentially-decaying estimates of
    # the mean and covariance of the generator's output distribution with a
    # target mean and covariance. Matching may be done on the raw generator
    # output or on a linear projection P of it; the target must live in the
    # space where matching is done. Ignoring the running averages, the
    # computation is:
    #
    #   Xm = X - np.mean(X, axis=0)
    #   XmP = np.dot(Xm, P)
    #   C = np.dot(XmP.T, XmP)
    #
    # self.mom_mix_rate sets how much the current batch contributes to the
    # running estimates: the mean estimate is updated first, the batch is
    # centred with the updated mean, and the covariance estimate is then
    # updated from the centred batch. self.mom_match_weight holds the
    # configured weight of the cost, self.target_mean / self.target_cov the
    # target moments, and self.mom_match_proj the optional projection.
    C_init = to_fX(np.zeros((self.obs_dim, self.obs_dim)))
    m_init = to_fX(np.zeros((self.obs_dim,)))
    self.dist_cov = theano.shared(C_init, name='gcp_dist_cov')
    self.dist_mean = theano.shared(m_init, name='gcp_dist_mean')

    zero_ary = np.zeros((1,))
    mmr = zero_ary + self.params['mom_mix_rate']
    self.mom_mix_rate = theano.shared(
        name='gcp_mom_mix_rate', value=to_fX(mmr))
    mmw = zero_ary + self.params['mom_match_weight']
    self.mom_match_weight = theano.shared(
        name='gcp_mom_match_weight', value=to_fX(mmw))

    targ_mean = to_fX(self.params['target_mean'])
    targ_cov = to_fX(self.params['target_cov'])
    assert targ_mean.size == targ_cov.shape[0]  # mean and cov use same dim
    assert targ_cov.shape[0] == targ_cov.shape[1]  # cov must be square
    self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
    self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')

    mmp = np.identity(targ_cov.shape[0])  # default to identity transform
    if 'mom_match_proj' in self.params:
        mmp = self.params['mom_match_proj']  # use a user-specified transform
    assert mmp.shape[0] == self.obs_dim  # transform matches data dim
    assert mmp.shape[1] == targ_cov.shape[0]  # and matches mean/cov dims
    mmp = to_fX(mmp)
    self.mom_match_proj = theano.shared(value=mmp, name='gcp_mom_map_proj')

    # moment matching cost plus updates for the running mean/covariance
    self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()

    # Optimizable parameters. The final layer of every proto-network in the
    # discriminator is skipped because it is bypassed in favour of binary
    # classification layers managed by this object.
    self.dn_params = []
    for pn in self.DN.proto_nets:
        for pnl in pn[0:-1]:
            self.dn_params.extend(pnl.params)
    self.gn_params = [p for p in self.GN.mlp_params]
    # NOTE(review): joint_params is a new list built before
    # _construct_disc_layers runs, so anything that call appends to
    # dn_params will not appear here -- confirm this is intended.
    self.joint_params = self.dn_params + self.gn_params

    # Build a binary discriminator layer per proto-net (per the original
    # comment, this also adds their params to the optimization list).
    self._construct_disc_layers(rng)
    self.disc_reg_cost = self.lam_l2d[0] * \
        T.sum([dl.act_l2_sum for dl in self.disc_layers])

    # collaborative binary classification costs
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # small l2 penalty on params
    self.dn_l2_cost = constFX(1e-4) * \
        T.sum([T.sum(p**2.0) for p in self.dn_params])
    self.gn_l2_cost = constFX(1e-4) * \
        T.sum([T.sum(p**2.0) for p in self.gn_params])

    # Discriminator cost: classification cost plus its regularizers.
    # Generator cost: classification cost, moment matching cost and its
    # l2 penalty.
    self.dn_cost = self.disc_cost_dn + self.disc_reg_cost + self.dn_l2_cost
    self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.gn_l2_cost
    self.joint_cost = self.dn_cost + self.gn_cost

    # gradients on generator and discriminator parameters
    print("Computing gradients on generator...")
    self.gn_grads = OrderedDict(
        zip(self.gn_params, T.grad(self.gn_cost, self.gn_params)))
    print("Computing gradients on discriminator...")
    self.dn_grads = OrderedDict(
        zip(self.dn_params, T.grad(self.dn_cost, self.dn_params)))

    # Joint updates start with the running first/second-order moment
    # estimates of the generator's distribution.
    self.joint_updates = OrderedDict()
    for var in self.mom_updates:
        self.joint_updates[var] = self.mom_updates[var]

    # Adam updates for the discriminator and the generator
    self.dn_updates = get_adam_updates(
        params=self.dn_params, grads=self.dn_grads, alpha=self.lr_dn,
        beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
        mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    gn_adam_updates = get_adam_updates(
        params=self.gn_params, grads=self.gn_grads, alpha=self.lr_gn,
        beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
        mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

    # Generator updates = moment-tracking updates + Adam updates. The
    # moment-tracking entries used to be written into self.gn_updates and
    # then discarded when the Adam result was assigned over the same
    # attribute, leaving them only in self.joint_updates.
    self.gn_updates = OrderedDict()
    for var in self.mom_updates:
        self.gn_updates[var] = self.mom_updates[var]
    for k in gn_adam_updates:
        self.gn_updates[k] = gn_adam_updates[k]

    for k in self.dn_updates:
        self.joint_updates[k] = self.dn_updates[k]
    for k in self.gn_updates:
        self.joint_updates[k] = self.gn_updates[k]

    # batch-based training functions: generator, discriminator and joint
    print("Compiling generator training function...")
    self.train_gn = self._construct_train_gn()
    print("Compiling discriminator training function...")
    self.train_dn = self._construct_train_dn()
    print("Compiling joint training function...")
    self.train_joint = self._construct_train_joint()

    # sampler for generator outputs given a batch of noise
    self.sample_from_gn = self._construct_model_sampler()
    return