def createObjectiveFunction(self):
    '''
    @description: initialize objective function and minimization function
    @X,y: data matrix/vector
    @u: random noise for simulator
    @v: standard normal for reparametrization trick
    '''
    X, u = T.dmatrices("X", "u")
    f, y, v = T.dcols("f", "y", "v")

    mu = self.params[0]
    logSigma = self.params[1]
    logLambda = sharedX(np.log(self.sigma_e), name='logLambda')
    #logLambda = self.params[2]

    negKL = 0.5 * self.dimTheta + 0.5 * T.sum(2 * logSigma - mu ** 2 - T.exp(logSigma) ** 2)
    f = self.regression_simulator(X, u, v, mu, logSigma)

    logLike = (-self.m * (0.5 * np.log(2 * np.pi) + logLambda)
               - 0.5 * T.sum((y - f) ** 2) / (T.exp(logLambda) ** 2) / self.Lu)

    elbo = negKL + logLike
    obj = -elbo

    self.lowerboundfunction = th.function([X, y, u, v], obj, on_unused_input='ignore')
    derivatives = T.grad(obj, self.params)
    self.gradientfunction = th.function([X, y, u, v], derivatives, on_unused_input='ignore')
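# Usage sketch (not part of the original code): one way the compiled functions
# above could be driven in a plain SGD loop. The names `model`, `X_batch`,
# `y_batch` and `learning_rate` are hypothetical, as are the noise shapes; it
# assumes self.params holds Theano shared variables (as the T.grad call over
# them implies) and that fresh u/v draws are made on every step.
import numpy as np

def sgd_step(model, X_batch, y_batch, learning_rate=1e-3):
    # y_batch is expected as a column vector (shape (m, 1)); u and v are the
    # simulator noise and the reparameterization draw, respectively.
    u = np.random.randn(X_batch.shape[0], 1)
    v = np.random.randn(model.dimTheta, 1)
    grads = model.gradientfunction(X_batch, y_batch, u, v)
    for param, grad in zip(model.params, grads):
        # cast keeps the shared variable's dtype
        param.set_value((param.get_value() - learning_rate * grad).astype(param.dtype))
    # the compiled objective is the negative ELBO, so smaller is better
    return model.lowerboundfunction(X_batch, y_batch, u, v)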
def createGradientFunctions(self):
    #Create the Theano variables
    X = T.dmatrices("X")
    mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f", "R")
    # mu, logSigma and logLambd are immediately re-bound below: mu and logSigma
    # become shared variables and logLambd a broadcastable symbolic matrix.
    mu = sharedX(np.random.normal(10, 10, (self.dimTheta, 1)), name='mu')
    logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)), name='logSigma')
    logLambd = sharedX(np.matrix(np.random.uniform(0, 10)), name='logLambd')
    logLambd = T.patternbroadcast(T.dmatrix("logLambd"), [1, 1])

    negKL = 0.5 * T.sum(1 + 2 * logSigma - mu ** 2 - T.exp(logSigma) ** 2)
    theta = mu + T.exp(logSigma) * v
    W = theta
    y = X[:, 0]
    X_sim = X[:, 1:]
    f = (T.dot(X_sim, W) + u).flatten()

    gradvariables = [mu, logSigma, logLambd]
    logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 * ((y - f) / T.exp(logLambd)) ** 2)

    logp = (negKL + logLike) / self.m
    optimizer = -logp

    # NOTE: mu and logSigma are shared variables; Theano rejects shared variables
    # as explicit inputs, so compiling these functions as written requires either
    # dropping them from the input lists or substituting plain symbolic variables
    # via `givens`.
    self.negKL = th.function([mu, logSigma], negKL, on_unused_input='ignore')
    self.f = th.function(gradvariables + [X, u, v], f, on_unused_input='ignore')
    self.logLike = th.function(gradvariables + [X, u, v], logLike, on_unused_input='ignore')

    derivatives = T.grad(logp, gradvariables)
    derivatives.append(logp)

    self.gradientfunction = th.function(gradvariables + [X, u, v], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [X, u, v], logp, on_unused_input='ignore')
    self.optimizer = BatchGradientDescent(objective=optimizer, params=gradvariables,
                                          inputs=[X, u, v], conjugate=True, max_iter=1)
def createGradientFunctions(self):
    #Create the Theano variables
    W1, W2, W3, W4, W5, W6, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W6", "x", "eps")
    #Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b6 = T.dcols("b1", "b2", "b3", "b4", "b5", "b6")
    z1 = T.col("z1")

    if self.continuous:
        #convolve x
        # no_filters = 100, stride = 4, filter_size = 50
        h_encoder = T.tanh(T.dot(W1, x) + b1)
        #h_encoder = T.dot(W1,x) + b1
    else:
        h_encoder = T.tanh(T.dot(W1, x) + b1)

    mu_encoder = T.dot(W2, h_encoder) + b2
    log_sigma_encoder = 0.5 * (T.dot(W3, h_encoder) + b3)

    #Find the hidden variable z
    z = mu_encoder + T.exp(log_sigma_encoder) * eps

    prior = 0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder ** 2 - T.exp(2 * log_sigma_encoder))

    #Set up decoding layer
    if self.continuous:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        h_dec = T.nnet.softplus(T.dot(W4, z1) + b4)
        #h_decoder = T.dot(W4,z) + b4
        #h_dec = T.dot(W4,z1) + b4
        mu_decoder = T.tanh(T.dot(W5, h_decoder) + b5)
        mu_dec = T.tanh(T.dot(W5, h_dec) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W6, h_decoder) + b6)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2)
        gradvariables = [W1, W2, W3, W4, W5, W6, b1, b2, b3, b4, b5, b6]
    else:
        h_decoder = T.tanh(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5]

    logp = logpxz + prior

    #Compute all the gradients
    derivatives = T.grad(logp, gradvariables)
    #Add the lowerbound so we can keep track of results
    derivatives.append(logp)

    self.get_z = th.function(gradvariables + [x, eps], z, on_unused_input='ignore')
    if self.continuous:
        # mu_dec and mu_decoder only exist in the continuous branch above
        self.generate = th.function(gradvariables + [z1, x, eps], mu_dec, on_unused_input='ignore')
        self.predict = th.function(gradvariables + [x, eps], mu_decoder, on_unused_input='ignore')
    self.gradientfunction = th.function(gradvariables + [x, eps], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps], logp, on_unused_input='ignore')
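# Usage sketch (hypothetical, not from the original code): decoding from a
# chosen latent point with the compiled `generate` function above. `params`
# must be numpy arrays ordered exactly as gradvariables, and `z_point` a
# float64 column vector matching the latent size (z1 is a T.col); this only
# applies when self.continuous is True, since that is when `generate` exists.
import numpy as np

def decode_latent(model, params, z_point):
    # x and eps do not feed the mu_dec graph, so any 2-D float64 array
    # satisfies their input signature (on_unused_input='ignore').
    dummy = np.zeros((1, 1))
    return model.generate(*(params + [z_point, dummy, dummy]))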
def createGradientFunctions(self):
    #Create the Theano variables
    W1, W2, W3, W4, W5, W6, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W6", "x", "eps")
    #Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b6 = T.dcols("b1", "b2", "b3", "b4", "b5", "b6")

    if self.continuous:
        h_encoder = T.nnet.softplus(T.dot(W1, x) + b1)
    else:
        h_encoder = T.tanh(T.dot(W1, x) + b1)

    mu_encoder = T.dot(W2, h_encoder) + b2
    log_sigma_encoder = 0.5 * (T.dot(W3, h_encoder) + b3)

    #Find the hidden variable z
    z = mu_encoder + T.exp(log_sigma_encoder) * eps

    prior = 0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder ** 2 - T.exp(2 * log_sigma_encoder))

    #Set up decoding layer
    if self.continuous:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        mu_decoder = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W6, h_decoder) + b6)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2)
        gradvariables = [W1, W2, W3, W4, W5, W6, b1, b2, b3, b4, b5, b6]
    else:
        h_decoder = T.tanh(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5]

    logp = logpxz + prior

    #Compute all the gradients
    derivatives = T.grad(logp, gradvariables)
    #Add the lowerbound so we can keep track of results
    derivatives.append(logp)

    self.gradientfunction = th.function(gradvariables + [x, eps], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps], logp, on_unused_input='ignore')
    self.zfunction = th.function(gradvariables + [x, eps], z, on_unused_input='ignore')
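# Usage sketch (not from the original code): because the weights here are
# symbolic inputs rather than shared variables, the caller keeps them as a
# list of numpy arrays ordered as gradvariables and passes them with every
# call. `params`, `minibatch` (features x examples) and `learning_rate` are
# hypothetical names; the latent size used for eps is an assumption.
import numpy as np

def ascent_step(model, params, minibatch, learning_rate=0.01):
    eps = np.random.randn(model.dimZ, minibatch.shape[1])   # one draw per example
    outputs = model.gradientfunction(*(params + [minibatch, eps]))
    gradients, lowerbound = outputs[:-1], outputs[-1]       # logp was appended last
    for p, g in zip(params, gradients):
        p += learning_rate * g                              # gradient ascent on the bound
    return lowerbound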
def createGradientFunctions(self):
    #Create the Theano variables
    W1, W2, W3, W4, W5, W6, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W6", "x", "eps")
    #Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b6, pi = T.dcols("b1", "b2", "b3", "b4", "b5", "b6", "pi")

    if self.continuous:
        h_encoder = T.nnet.softplus(T.dot(W1, x) + b1)
    else:
        h_encoder = T.tanh(T.dot(W1, x) + b1)

    print(type(pi))

    #Sample a mixture component index with probabilities softmax(pi)
    rng = T.shared_randomstreams.RandomStreams(seed=124)
    i = rng.choice(size=(1,), a=self.num_model, p=T.nnet.softmax(pi.T).T.flatten())

    #Slice out the encoder rows belonging to the sampled component
    mu_encoder = (T.dot(W2[i[0] * self.dimZ:(1 + i[0]) * self.dimZ], h_encoder)
                  + b2[i[0] * self.dimZ:(1 + i[0]) * self.dimZ])
    log_sigma_encoder = (0.5 * T.dot(W3[i[0] * self.dimZ:(1 + i[0]) * self.dimZ], h_encoder)
                         + b3[i[0] * self.dimZ:(1 + i[0]) * self.dimZ])

    z = mu_encoder + T.exp(log_sigma_encoder) * eps

    #Softmax-weighted sum of the per-component KL terms
    prior = 0
    for j in range(self.num_model):
        prior += T.exp(pi[j][0]) * 0.5 * T.sum(
            1 + 2 * log_sigma_encoder[j * self.dimZ:(1 + j) * self.dimZ]
            - mu_encoder[j * self.dimZ:(1 + j) * self.dimZ] ** 2
            - T.exp(2 * log_sigma_encoder[j * self.dimZ:(1 + j) * self.dimZ]))
    prior /= T.sum(T.exp(pi))

    #Set up decoding layer
    if self.continuous:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        mu_decoder = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W6, h_decoder) + b6)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2)
        gradvariables = [W1, W2, W3, W4, W5, W6, b1, b2, b3, b4, b5, b6, pi]
    else:
        h_decoder = T.tanh(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5, pi]

    logp = logpxz + prior

    #Compute all the gradients
    derivatives = T.grad(logp, gradvariables)
    #Add the lowerbound so we can keep track of results
    derivatives.append(logpxz)

    self.gradientfunction = th.function(gradvariables + [x, eps], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps], logp, on_unused_input='ignore')
    self.hiddenstatefunction = th.function(gradvariables + [x, eps], z, on_unused_input='ignore')
def createGradientFunctions(self):
    #Create the Theano variables
    W1, W2, W3, W4, W5, W7, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W7", "x", "eps")
    #Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b7 = T.dcols("b1", "b2", "b3", "b4", "b5", "b7")
    # NOTE: this snippet also relies on symbols that are not created here
    # (log_L_u, K_fu, sf2, ell, X, X_u, m_u, eps_u, eps_z, sigma_z, W6, W8,
    # b6, b8); they are assumed to be defined elsewhere in the class/module.

    if self.continuous_data:
        h_encoder = T.nnet.softplus(T.dot(W1, x) + b1)
    else:
        h_encoder = T.tanh(T.dot(W1, x) + b1)

    mu_encoder = T.dot(W2, h_encoder) + b2
    log_sigma_encoder = 0.5 * (T.dot(W3, h_encoder) + b3)

    #Exponentiate the diagonal of log_L_u to obtain a valid Cholesky factor
    L_u = T.tril(log_L_u - T.diag(T.diag(log_L_u)) + T.diag(T.exp(T.diag(log_L_u))))
    # To do: better ways of parameterising the covariance
    # (see: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.31.494&rep=rep1&type=pdf)

    #Compute GP objects
    K_ff = self.ker.RBF(sf2, ell, X)
    K_uu = self.ker.RBF(sf2, ell, X_u)
    K_uu_inv = nlinalg.matrix_inverse(K_uu)
    L_f = slinalg.cholesky(K_ff - T.dot(K_fu, T.dot(K_uu_inv, K_fu.T)))

    # f_i make up the columns of f, similarly for m_u_i
    u = m_u + T.dot(L_u, eps_u)  #n_induce iid pseudo inducing sets
    f = T.dot(K_fu, T.dot(K_uu_inv, u)) + T.dot(L_f, X)

    #Find the hidden variable z
    # log_sigma_lhood = 0.5*(T.dot(W9,f) + b9)  # the var GP maps to both mean *and* covariance
    sigma_var_lhood = sigma_z ** 2 * T.eye(self.dimZ)
    L_z = slinalg.cholesky(sigma_var_lhood)
    z = f + T.dot(L_z, eps_z)
    # z = mu_encoder + T.exp(log_sigma_encoder)*eps

    prior = 0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder ** 2 - T.exp(2 * log_sigma_encoder))

    #Set up decoding layer
    if self.continuous_data:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        mu_decoder = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W7, h_decoder) + b7)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2)
        gradvariables = [W1, W2, W3, W4, W5, W7, b1, b2, b3, b4, b5, b7,
                         sf2, ell, X_u, m_u, L_u]
    else:
        h_decoder = T.tanh(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5,
                         sf2, ell, X_u, m_u, L_u]

    #Set up auxiliary layer
    if self.continuous_data:
        h_auxiliary = T.nnet.softplus(T.dot(W6, [x, z]) + b6)
        mu_auxiliary = T.nnet.sigmoid(T.dot(W7, h_auxiliary) + b7)
        log_sigma_auxiliary = 0.5 * (T.dot(W8, h_auxiliary) + b8)
    else:
        pass  #to do

    logp = logpxz + prior

    #Compute KL terms
    # KL_qp = -0.5*T.sum(1.0 + 2*log_sigma_lhood - f**2 - T.exp(2*log_sigma_lhood))
    KL_qp = 0.5 * (T.dot(f.T, f)
                   + T.trace(sigma_var_lhood + T.log(T.eye(self.dimZ)) - T.log(sigma_var_lhood))
                   - self.dimZ)
    KL_qr = 0.5 * (T.dot((mu_auxiliary - mu_encoder).T,
                         T.dot(T.diag(1.0 / T.exp(log_sigma_auxiliary)), mu_auxiliary - mu_decoder))
                   + T.trace(T.dot(T.diag(1.0 / T.exp(log_sigma_auxiliary)), T.dot(L_u, L_u.T))
                             + log_sigma_auxiliary - log_sigma_encoder)
                   - self.dimXf - self.dimf)

    #Compute bound and all the gradients
    stoch_bound = logpxz - KL_qp - KL_qr
    derivatives = T.grad(stoch_bound, gradvariables)
    #Add the lowerbound so we can keep track of results
    derivatives.append(stoch_bound)

    self.gradientfunction = th.function(gradvariables + [x, eps_u, eps_z, X],
                                        derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps_u, eps_z, X],
                                          stoch_bound, on_unused_input='ignore')
    self.zfunction = th.function(gradvariables + [x, eps_u, eps_z, X],
                                 z, on_unused_input='ignore')
def createGradientFunctions(self):
    #Create the Theano variables
    W1, W2, W3, W4, W5, W6, x, eps = T.dmatrices("W1", "W2", "W3", "W4", "W5", "W6", "x", "eps")
    #Create biases as cols so they can be broadcasted for minibatches
    b1, b2, b3, b4, b5, b6 = T.dcols("b1", "b2", "b3", "b4", "b5", "b6")

    def relu(x):
        return 0.5 * (x + abs(x))

    if self.continuous:
        h_encoder = T.nnet.softplus(T.dot(W1, x) + b1)
    else:
        # h_encoder = T.tanh(T.dot(W1,x) + b1)
        h_encoder = relu(T.dot(W1, x) + b1)

    mu_encoder = T.dot(W2, h_encoder) + b2
    log_sigma_encoder = 0.5 * (T.dot(W3, h_encoder) + b3)

    #Find the hidden variable z
    z = mu_encoder + T.exp(log_sigma_encoder) * eps

    KL_divergence = 0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder ** 2 - T.exp(2 * log_sigma_encoder))

    #Set up decoding layer
    if self.continuous:
        h_decoder = T.nnet.softplus(T.dot(W4, z) + b4)
        mu_decoder = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        log_sigma_decoder = 0.5 * (T.dot(W6, h_decoder) + b6)
        logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                       - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2)
        logpxz_individual = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                                  - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder)) ** 2, axis=0)
        gradvariables = [W1, W2, W3, W4, W5, W6, b1, b2, b3, b4, b5, b6]
    else:
        # h_decoder = T.tanh(T.dot(W4,z) + b4)
        h_decoder = relu(T.dot(W4, z) + b4)
        y = T.nnet.sigmoid(T.dot(W5, h_decoder) + b5)
        logpxz_individual = -T.nnet.binary_crossentropy(y, x).sum(axis=0)
        logpxz = -T.nnet.binary_crossentropy(y, x).sum()
        gradvariables = [W1, W2, W3, W4, W5, b1, b2, b3, b4, b5]

    logp = logpxz + KL_divergence

    #Compute all the gradients
    derivatives = T.grad(logp, gradvariables)
    #Add the lowerbound so we can keep track of results
    # derivatives.append(logp)  ## This is more confusing

    self.gradientfunction = th.function(gradvariables + [x, eps], derivatives, on_unused_input='ignore')
    self.lowerboundfunction = th.function(gradvariables + [x, eps], logp, on_unused_input='ignore')
    self.latentvarfunction = th.function(gradvariables + [x], [mu_encoder.T, log_sigma_encoder.T],
                                         on_unused_input='ignore')

    # if self.continuous:
    #     self.reconstructionfunction = th.function(gradvariables + [x, eps],
    #                                               [mu_decoder.T, log_sigma_decoder.T], on_unused_input='ignore')
    # else:
    #     self.reconstructionfunction = th.function(gradvariables + [x, eps],
    #                                               logpxz_individual.T, on_unused_input='ignore')
    if not self.continuous:
        self.recon_mnist = th.function(gradvariables + [x, eps], y, on_unused_input='ignore')
    self.reconstructionfunction = th.function(gradvariables + [x, eps], logpxz_individual.T,
                                              on_unused_input='ignore')
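# Usage sketch (hypothetical, not from the original code): pulling the latent
# means and log-sigmas out of the compiled latentvarfunction above, e.g. for
# plotting a 2-D latent space. `params` (numpy arrays ordered as gradvariables)
# and `data` (features x examples, float64) are assumed names.
def encode(model, params, data):
    mu_T, log_sigma_T = model.latentvarfunction(*(params + [data]))
    return mu_T, log_sigma_T   # transposed: one row per example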