def get_model(self, X, Y, x_test):
    '''
    Gaussian Process Regression model.
    Reference: C.E. Rasmussen, "Gaussian Processes for Machine Learning",
    MIT Press 2006

    Args:
        X: tensor matrix, training data
        Y: tensor matrix, training target
        x_test: tensor matrix, testing data

    Returns:
        K: prior cov matrix
        Ks: prior joint cov matrix
        Kss: prior cov matrix for testing data

        Posterior Distribution:
            alpha: alpha = inv(K)*(mu-m)
            sW: vector containing diagonal of sqrt(W)
            L: L = chol(sW*K*sW+eye(n))

        y_test_mu: predictive mean
        y_test_var: predictive variance
        fs2: predictive latent variance

    Note: the cov matrix inverse is computed through Cholesky factorization
    https://makarandtapaswi.wordpress.com/2011/07/08/cholesky-decomposition-for-matrix-inversion/
    '''
    # Compute GP prior distribution: mean and covariance matrices (eq 2.13, 2.14)
    K = self.covFunc(X, X, 'K')            # prior cov
    #m = T.mean(Y)*T.ones_like(Y)          # prior mean
    m = self.mean * T.ones_like(Y)         # prior mean

    # Compute GP joint prior distribution between training and test (eq 2.18)
    Ks = self.covFunc(X, x_test, 'Ks')
    # Pay attention!! here is the self test cov matrix.
    Kss = self.covFunc(x_test, x_test, 'Kss', mode='self_test')

    # Compute posterior distribution with noise: L, alpha, sW, and log_likelihood.
    sn2 = T.exp(2 * self.sigma_n)          # noise variance of likGauss
    L = sT.cholesky(K / sn2 + T.identity_like(K))
    sl = sn2
    alpha = T.dot(sT.matrix_inverse(L.T),
                  T.dot(sT.matrix_inverse(L), (Y - m))) / sl
    sW = T.ones_like(T.sum(K, axis=1)).reshape((K.shape[0], 1)) / T.sqrt(sl)
    log_likelihood = T.sum(-0.5 * (T.dot((Y - m).T, alpha))
                           - T.sum(T.log(T.diag(L)))
                           - X.shape[0] / 2 * T.log(2. * np.pi * sl))

    # Compute predictive distribution using the computed posterior distribution.
    fmu = m + T.dot(Ks.T, alpha)           # predictive mean fs|f, eq 2.25
    V = T.dot(sT.matrix_inverse(L),
              T.extra_ops.repeat(sW, x_test.shape[0], axis=1) * Ks)
    fs2 = Kss - (T.sum(V * V, axis=0)).reshape((1, V.shape[1])).T  # predictive variance, eq 2.26
    fs2 = T.maximum(fs2, 0)                # remove negative variance noise
    #fs2 = T.sum(fs2,axis=1)               # in case x has multiple dimensions

    y_test_mu = fmu
    y_test_var = fs2 + sn2

    return K, Ks, Kss, y_test_mu, y_test_var, log_likelihood, L, alpha, V, fs2, sW
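# The snippet above never inverts K directly: it factorizes K/sn2 + I with a
# Cholesky decomposition and back-substitutes. For reference, a minimal
# standalone NumPy sketch of the same predictive equations (RW eqs. 2.25-2.26);
# the rbf kernel and the sigma_n default are illustrative assumptions, not part
# of the original class.
import numpy as np

def rbf(A, B, ell=1.0):
    # Assumed squared-exponential kernel, for illustration only.
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / ell**2)

def gp_predict(X, y, x_test, sigma_n=0.1):
    K, Ks, Kss = rbf(X, X), rbf(X, x_test), rbf(x_test, x_test)
    L = np.linalg.cholesky(K + sigma_n**2 * np.eye(len(X)))  # K + sn2*I = L L.T
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y))      # eq 2.25
    v = np.linalg.solve(L, Ks)
    mu = Ks.T @ alpha                                        # predictive mean
    var = np.diag(Kss) - (v ** 2).sum(axis=0) + sigma_n**2   # eq 2.26 plus noise
    return mu, var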
def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi, K_MM):
    cholSigInv = sT.cholesky(EPhiTPhi + sn_trf * T.identity_like(K_MM))
    cholK_MM = sT.cholesky(K_MM + 1e-6 * T.identity_like(K_MM))
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    invCholK_MM = sT.matrix_inverse(cholK_MM)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    InvK_MM = invCholK_MM.T.dot(invCholK_MM)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv, cholK_MM, InvK_MM
def __init__(self, mu, sigma, random_state=None):
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self.make_(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)             # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu), axis=1))).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self.make_(self.nll_, "nll")

    # self.rvs_
    self.make_(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def get_opt_A(self, tau, EPhiTPhi, YT_EPhi):
    SigInv = EPhiTPhi + (tau**-1 + 1e-4) * T.identity_like(EPhiTPhi)
    cholTauSigInv = tau**0.5 * sT.cholesky(SigInv)
    invCholTauSigInv = sT.matrix_inverse(cholTauSigInv)
    tauInvSig = invCholTauSigInv.T.dot(invCholTauSigInv)
    Sig_EPhiT_Y = tau * tauInvSig.dot(YT_EPhi.T)
    return Sig_EPhiT_Y, tauInvSig, cholTauSigInv
def expected_new_y(self, x, y, new_x):
    assert new_x.ndim == 0
    # T.alloc takes the shape as scalar arguments, not as a tuple.
    beta = alloc_diag(T.alloc(1., x.shape[0]) * self.beta)
    C = self.kernel.gram_matrix(x) + beta
    C_inv = matrix_inverse(C)
    k = self.kernel(x, new_x)
    return T.dot(k, T.dot(C_inv, y))
def l2ls_learn_basis_dual(X, S, c):
    tX = T.matrix('X')
    tS = T.matrix('S')
    tc = T.scalar('c')
    tlambdas = T.vector('lambdas')

    tXST = T.dot(tX, tS.T)
    tSSTetc = la.matrix_inverse(T.dot(tS, tS.T) + T.diag(tlambdas))

    objective = -(T.dot(tX, tX.T).trace()
                  - reduce(T.dot, [tXST, tSSTetc, tXST.T]).trace()
                  - tc * tlambdas.sum())

    objective_fn = theano.function([tlambdas], objective,
                                   givens={tX: X, tS: S, tc: c})
    objective_grad_fn = theano.function([tlambdas], T.grad(objective, tlambdas),
                                        givens={tX: X, tS: S, tc: c})

    initial_lambdas = 10 * np.abs(np.random.random((S.shape[0], 1)))
    output = scipy.optimize.fmin_cg(f=objective_fn, fprime=objective_grad_fn,
                                    x0=initial_lambdas, maxiter=100,
                                    full_output=True)
    logging.debug("optimizer stats %s" % (output[1:],))
    logging.debug("optimizer lambdas %s" % output[0])

    lambdas = output[0]
    B = np.dot(np.linalg.inv(np.dot(S, S.T) + np.diag(lambdas)),
               np.dot(S, X.T)).T
    return B
def logp(self, X):
    v = self.v
    p = self.p
    S = self.S
    Z = self.Z
    result = (-Z + log(det(X)) * -(v + p + 1.) / 2.
              - trace(S.dot(matrix_inverse(X))) / 2.)
    return ifelse(gt(v, p - 1), result, self.invalid)
def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi):
    SigInv = EPhiTPhi + (sn_trf + 1e-6) * T.identity_like(EPhiTPhi)
    cholSigInv = sT.cholesky(SigInv)
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv
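# All of the get_opt_A variants above rely on the same identity: if A = L L.T
# is a Cholesky factorization, then inv(A) = inv(L).T @ inv(L). A quick NumPy
# check of that identity (a standalone sketch, not part of the original classes).
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((5, 5))
A = B @ B.T + 1e-6 * np.eye(5)   # SPD matrix, with the same jitter idea
L = np.linalg.cholesky(A)        # A = L @ L.T, L lower-triangular
L_inv = np.linalg.inv(L)
A_inv = L_inv.T @ L_inv          # inv(A) = inv(L).T @ inv(L)
assert np.allclose(A_inv, np.linalg.inv(A))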
def __init__(self, mu, sigma, random_state=None):
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma,
                                             random_state=random_state,
                                             optimizer=None)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self.make_(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)             # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu), axis=1))).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nnlf_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self.make_(self.nnlf_, "nnlf")

    # self.rvs_
    self.make_(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def logp(X):
    IXI = det(X)
    IVI = det(V)
    return bound(
        ((n - p - 1) * log(IXI) - trace(matrix_inverse(V).dot(X))
         - n * p * log(2) - n * log(IVI) - 2 * multigammaln(p, n / 2)) / 2,
        all(n > p - 1))
def s_variance(self, x):
    """Gaussian Process variance at points x"""
    K, y, var_y, N = self.kyn()
    rK = psd(K + var_y * tensor.eye(N))
    K_x = self.K_fn(self.x, x)
    var_x = 1 - diag(dots(K_x.T, matrix_inverse(rK), K_x))
    if var_x.dtype != self.dtype:
        raise TypeError('var_x dtype', var_x.dtype)
    return var_x
def build_theano_models(self, algo, algo_params):
    epsilon = 1e-6
    kl = lambda mu, sig: sig + mu**2 - TT.log(sig)
    X, y = TT.dmatrices('X', 'y')
    params = TT.dvector('params')
    a, b, c, l_F, F, l_FC, FC = self.unpack_params(params)
    sig2_n, sig_f = TT.exp(2 * a), TT.exp(b)
    l_FF = TT.dot(X, l_F) + l_FC
    FF = TT.concatenate((l_FF, TT.dot(X, F) + FC), 1)
    Phi = TT.concatenate((TT.cos(FF), TT.sin(FF)), 1)
    Phi = sig_f * TT.sqrt(2. / self.M) * Phi
    noise = TT.log(1 + TT.exp(c))
    PhiTPhi = TT.dot(Phi.T, Phi)
    A = PhiTPhi + (sig2_n + epsilon) * TT.identity_like(PhiTPhi)
    L = Tlin.cholesky(A)
    Li = Tlin.matrix_inverse(L)
    PhiTy = Phi.T.dot(y)
    beta = TT.dot(Li, PhiTy)
    alpha = TT.dot(Li.T, beta)
    mu_f = TT.dot(Phi, alpha)
    var_f = (TT.dot(Phi, Li.T)**2).sum(1)[:, None]
    dsp = noise * (var_f + 1)
    mu_l = TT.sum(TT.mean(l_F, axis=1))
    sig_l = TT.sum(TT.std(l_F, axis=1))
    mu_w = TT.sum(TT.mean(F, axis=1))
    sig_w = TT.sum(TT.std(F, axis=1))
    hermgauss = np.polynomial.hermite.hermgauss(30)
    herm_x = Ts(hermgauss[0])[None, None, :]
    herm_w = Ts(hermgauss[1] / np.sqrt(np.pi))[None, None, :]
    herm_f = TT.sqrt(2 * var_f[:, :, None]) * herm_x + mu_f[:, :, None]
    nlk = ((0.5 * herm_f**2. - y[:, :, None] * herm_f) / dsp[:, :, None]
           + 0.5 * (TT.log(2 * np.pi * dsp[:, :, None])
                    + y[:, :, None]**2 / dsp[:, :, None]))
    enll = herm_w * nlk
    nlml = (2 * TT.log(TT.diagonal(L)).sum() + 2 * enll.sum()
            + 1. / sig2_n * ((y**2).sum() - (beta**2).sum())
            + 2 * (X.shape[0] - self.M) * a)
    penalty = (kl(mu_w, sig_w) * self.M + kl(mu_l, sig_l) * self.S) / (self.S + self.M)
    cost = (nlml + penalty) / X.shape[0]
    grads = TT.grad(cost, params)
    updates = getattr(OPT, algo)(self.params, grads, **algo_params)
    updates = getattr(OPT, 'apply_nesterov_momentum')(updates, momentum=0.9)
    train_inputs = [X, y]
    train_outputs = [cost, alpha, Li]
    self.train_func = Tf(train_inputs, train_outputs,
                         givens=[(params, self.params)])
    self.train_iter_func = Tf(train_inputs, train_outputs,
                              givens=[(params, self.params)], updates=updates)
    Xs, Li, alpha = TT.dmatrices('Xs', 'Li', 'alpha')
    l_FFs = TT.dot(Xs, l_F) + l_FC
    FFs = TT.concatenate((l_FFs, TT.dot(Xs, F) + FC), 1)
    Phis = TT.concatenate((TT.cos(FFs), TT.sin(FFs)), 1)
    Phis = sig_f * TT.sqrt(2. / self.M) * Phis
    mu_pred = TT.dot(Phis, alpha)
    std_pred = (noise * (1 + (TT.dot(Phis, Li.T)**2).sum(1)))**0.5
    pred_inputs = [Xs, alpha, Li]
    pred_outputs = [mu_pred, std_pred]
    self.pred_func = Tf(pred_inputs, pred_outputs,
                        givens=[(params, self.params)])
def s_mean(self, x):
    """Gaussian Process mean at points x"""
    K, y, var_y, N = self.kyn()
    rK = psd(K + var_y * tensor.eye(N))
    alpha = tensor.dot(matrix_inverse(rK), y)
    K_x = self.K_fn(self.x, x)
    y_x = tensor.dot(alpha, K_x)
    if y_x.dtype != self.dtype:
        raise TypeError('y_x dtype', y_x.dtype)
    return y_x
def step(visible, filtered_hidden_mean_m1, filtered_hidden_cov_m1):
    A, B = transition, emission                  # (h, h), (h, v)
    # Shortcuts for the filtered mean and covariance from the previous
    # time step.
    f_m1 = filtered_hidden_mean_m1               # (n, h)
    F_m1 = filtered_hidden_cov_m1                # (n, h, h)

    # Calculate mean of joint.
    hidden_mean = T.dot(f_m1, A) + hnm           # (n, h)
    visible_mean = T.dot(hidden_mean, B) + vnm   # (n, v)

    # Calculate covariance of joint.
    hidden_cov = stacked_dot(A.T, stacked_dot(F_m1, A))  # (n, h, h)
    hidden_cov += hnc
    visible_cov = stacked_dot(                   # (n, v, v)
        B.T, stacked_dot(hidden_cov, B))
    visible_cov += vnc
    visible_hidden_cov = stacked_dot(hidden_cov, B)      # (n, h, v)

    visible_error = visible - visible_mean       # (n, v)
    inv_visible_cov, _ = theano.map(
        lambda x: matrix_inverse(x), visible_cov)        # (n, v, v)

    # I don't know a better name for this monster.
    visible_hidden_cov_T = visible_hidden_cov.dimshuffle(0, 2, 1)  # (n, v, h)
    D = stacked_dot(inv_visible_cov, visible_hidden_cov_T)

    f = (D * visible_error.dimshuffle(0, 1, 'x')         # (n, h)
         ).sum(axis=1)
    f += hidden_mean

    F = hidden_cov
    F -= stacked_dot(visible_hidden_cov, D)

    log_l = (inv_visible_cov *                           # (n,)
             visible_error.dimshuffle(0, 1, 'x') *
             visible_error.dimshuffle(0, 'x', 1)).sum(axis=(1, 2))
    log_l *= -.5
    dets, _ = theano.map(lambda x: det(x), visible_cov)
    log_l -= 0.5 * T.log(dets)
    log_l -= np.log(2 * np.pi)

    return f, F, log_l
def __init__(self, v, S, *args, **kwargs):
    super(Wishart, self).__init__(*args, **kwargs)
    self.v = v
    self.S = S
    self.p = p = S.shape[0]
    self.inv_S = matrix_inverse(S)

    'TODO: We should pre-compute the following if the parameters are fixed'
    self.invalid = theano.tensor.fill(S, nan)  # Invalid result, if v < p
    self.Z = (log(2.) * (v * p / 2.) + multigammaln(p, v / 2.)
              - log(det(S)) * v / 2.)
    self.mean = ifelse(gt(v, p - 1), S / (v - p - 1), self.invalid)
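# The normalizer Z above, together with the logp defined earlier
# (-Z - (v+p+1)/2 * log|X| - tr(S X^{-1})/2) and the mean S/(v-p-1), matches
# the inverse-Wishart density with scale S and v degrees of freedom. A
# standalone SciPy cross-check with illustrative values (a sketch, not part
# of the original class):
import numpy as np
from scipy.stats import invwishart
from scipy.special import multigammaln

v, p = 7, 3                      # degrees of freedom and dimension, v > p - 1
S = np.eye(p)                    # scale matrix (illustrative)
X = invwishart.rvs(df=v, scale=S, random_state=0)
Z = ((v * p / 2) * np.log(2) + multigammaln(v / 2, p)
     - (v / 2) * np.linalg.slogdet(S)[1])
logp = (-Z - (v + p + 1) / 2 * np.linalg.slogdet(X)[1]
        - np.trace(S @ np.linalg.inv(X)) / 2)
assert np.isclose(logp, invwishart.logpdf(X, df=v, scale=S))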
def KLD_U(self, m, L_scaled, Kmm):
    # KL divergence between N(u|m, S) and N(u|0, Kmm), with S = L*L.T
    # (pass in the Cholesky factor L_scaled).
    M = m.shape[0]
    D = m.shape[1]
    KmmInv = sT.matrix_inverse(Kmm)

    KL_U = D * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T))
                - M
                - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                + 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm)))))
    KL_U += T.sum(T.dot(KmmInv, m) * m)

    return 0.5 * KL_U
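# KLD_U is the closed-form KL(N(m_d, S) || N(0, Kmm)) summed over the D output
# columns, with S supplied via its Cholesky factor. A hedged NumPy sketch of
# the single-column formula it implements (standalone, for checking only):
import numpy as np

def kl_to_zero_mean_gauss(m, L, Kmm):
    # KL(N(m, L L^T) || N(0, Kmm)) for one column m of shape (M,).
    M = len(m)
    Kinv = np.linalg.inv(Kmm)
    S = L @ L.T
    return 0.5 * (np.trace(Kinv @ S)            # T.sum(KmmInv.T * L L^T)
                  + m @ Kinv @ m                # T.sum(T.dot(KmmInv, m) * m)
                  - M
                  + np.linalg.slogdet(Kmm)[1]   # 2*sum(log(diag(chol(Kmm))))
                  - np.linalg.slogdet(S)[1])    # 2*sum(log(diag(L)))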
def s_nll(self):
    """
    Marginal negative log likelihood of model

    :note: See RW.pdf page 37, Eq. 2.30.
    """
    K, y, var_y, N = self.kyn()
    rK = psd(K + var_y * tensor.eye(N))
    nll = (0.5 * dots(y, matrix_inverse(rK), y)
           + 0.5 * tensor.log(det(rK))
           + N / 2.0 * tensor.log(2 * numpy.pi))
    if nll.dtype != self.dtype:
        raise TypeError('nll dtype', nll.dtype)
    return nll
def compile_theano_funcs(self, opt_algo, opt_params, dropout):
    self.compiled_funcs = {}
    # Compile Train & Optimization Function
    eps = 1e-5
    params = Tt.vector('params')
    X, Y = Tt.matrix('X'), Tt.matrix('Y')
    sig2, F, M, V = self.feature_maps(X, params)
    EPhi = F[-1]
    EPhiPhiT = Tt.dot(EPhi, Tt.transpose(EPhi))
    A = EPhiPhiT + (sig2 + eps) * Tt.identity_like(EPhiPhiT)
    L = Tlin.cholesky(A)
    Linv = Tlin.matrix_inverse(L)
    YPhiT = Tt.dot(Y, Tt.transpose(EPhi))
    beta = Tt.dot(YPhiT, Tt.transpose(Linv))
    alpha = Tt.dot(beta, Linv)
    mu_F = Tt.dot(alpha, EPhi)
    GOF = .5 / sig2 * Tt.sum(Tt.sum(Tt.dot(Y, (Y - mu_F).T)))
    REG = Tt.sum(Tt.log(Tt.diagonal(L))) + (self.N - self.D[-2]) / 2 * Tt.log(sig2)
    REG *= self.D[-1]
    KL = 0
    for h in range(self.H):
        KL += Tt.sum(Tt.sum(M[h]**2) + Tt.sum(V[h] - Tt.log(V[h] + eps)))
        KL -= self.D[h + 1] * self.D[h + 2] // 2
    obj = debug('obj', GOF + REG + KL)
    self.compiled_funcs['debug'] = Tf([X, Y], [obj],
                                      givens=[(params, self.params)])
    grads = Tt.grad(obj, params)
    updates = {self.params: grads}
    updates = getattr(Optimizer, opt_algo)(updates, **opt_params)
    updates = getattr(Optimizer, 'nesterov')(updates, momentum=0.9)
    train_inputs = [X, Y]
    train_outputs = [obj, alpha, Linv, mu_F]
    self.compiled_funcs['opt'] = Tf(train_inputs, train_outputs,
                                    givens=[(params, self.params)],
                                    updates=updates)
    self.compiled_funcs['train'] = Tf(train_inputs, train_outputs,
                                      givens=[(params, self.params)])
    # Compile Predict Function
    Linv, alpha = Tt.matrix('Linv'), Tt.matrix('alpha')
    Xs = Tt.matrix('Xs')
    sig2, Fs, _, _ = self.feature_maps(Xs, params)
    EPhis = Fs[-1]
    mu_Fs = Tt.dot(alpha, EPhis)
    std_Fs = ((sig2 * (1 + (Tt.dot(Linv, EPhis)**2).sum(0)))**0.5)[:, None]
    pred_inputs = [Xs, alpha, Linv]
    pred_outputs = [mu_Fs, std_Fs]
    self.compiled_funcs['pred'] = Tf(pred_inputs, pred_outputs,
                                     givens=[(params, self.params)])
def logp(self, X):
    n = self.n
    p = self.p
    V = self.V
    IXI = det(X)
    IVI = det(V)
    return bound(
        ((n - p - 1) * log(IXI) - trace(matrix_inverse(V).dot(X))
         - n * p * log(2) - n * log(IVI) - 2 * multigammaln(p, n / 2)) / 2,
        n > (p - 1))
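# For sanity-checking the Wishart log-density above, a hedged standalone
# sketch comparing the same closed form against scipy.stats.wishart.logpdf.
# Note SciPy's multigammaln(a, d) argument order differs from the
# multigammaln(p, n/2) helper used in the snippet.
import numpy as np
from scipy.stats import wishart
from scipy.special import multigammaln

n, p = 7, 3                     # degrees of freedom and dimension, n > p - 1
V = np.eye(p)                   # scale matrix (illustrative)
X = wishart.rvs(df=n, scale=V, random_state=0)
logdet_x = np.linalg.slogdet(X)[1]
logdet_v = np.linalg.slogdet(V)[1]
logp = ((n - p - 1) * logdet_x - np.trace(np.linalg.inv(V) @ X)
        - n * p * np.log(2) - n * logdet_v - 2 * multigammaln(n / 2, p)) / 2
assert np.isclose(logp, wishart.logpdf(X, df=n, scale=V))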
def s_deg_of_freedom(self):
    """
    Degrees of freedom aka "effective number of parameters" of kernel
    smoother.

    Defined pg. 25 of Rasmussen & Williams.
    """
    K, y, var_y, N = self.kyn()
    rK = psd(K + var_y * tensor.eye(N))
    dof = trace(tensor.dot(K, matrix_inverse(rK)))
    if dof.dtype != self.dtype:
        raise TypeError('dof dtype', dof.dtype)
    return dof
def get_model(self, X, Y, X_test):
    #initial_params = {'m':m,'S_b':S_b,'mu':mu,'Sigma_b':Sigma_b,'Z':Z,'lhyp':lhyp,'ls':ls}
    (M, D), N, Q = self.Z.shape, X.shape[0], X.shape[1]

    # Constrain the variables to positive values.
    beta, sf2, l = T.exp(self.ls), T.exp(self.lhyp[0]), T.exp(self.lhyp[1:])
    S = T.exp(self.S_b)
    #Sigma = T.exp(self.Sigma_b)

    # For x the covariance is diagonal, so no square root is needed.
    # For u it is not diagonal, so build a triangular matrix (as in a
    # Cholesky factorization).
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = sf2**0.5 * self.mu, sf2**0.5 * Sigma

    # Random draws for the reparameterization trick.
    srng = T.shared_randomstreams.RandomStreams(234)
    eps_NQ = srng.normal(self.m.shape)
    eps_M = srng.normal(self.mu.shape)

    # Generate samples; with mini-batches a single MC sample suffices.
    Xtilda = self.m + S * eps_NQ
    U = mu_scaled + Sigma_scaled * eps_M

    Kmm = self.ker.RBF(sf2, l, self.Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet = theano.sandbox.linalg.det(Kmm)

    Kmn = self.ker.RBF(sf2, l, self.Z, Xtilda)
    Knn = self.ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta
    LL = self.log_mvn(X, mean_U, Covariance) - 0.5 * beta * T.sum(T.eye(N) * Ktilda)

    #KL_X = -0.5 * (-T.sum(T.log(T.sum(Sigma,0))) + T.dot(m.T,T.dot(KmmInv,m)).squeeze() + T.sum((Sigma*KmmInv)) - M) - 0.5*T.log(KmmDet)
    KL_X = self.KLD_X(self.m, S)
    KL_U = self.KLD_U(mu_scaled, Sigma_scaled, Kmm)

    return KL_X, KL_U, LL
def LNLEP(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
          STA=Th.dvector('STA'), STC=Th.dmatrix('STC'),
          N_spike=Th.dscalar('N_spike'), **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M,
    without any barriers or priors.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    ldet = logdet(ImM)  # Th.log(det(ImM))
    return -0.5 * N_spike * (ldet
                             - Th.sum(Th.dot(matrix_inverse(ImM), theta) * theta)
                             + 2. * Th.sum(theta * STA)
                             + Th.sum(M * (STC + Th.outer(STA, STA))))
def quadratic_Poisson(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
                      STA=Th.dvector('STA'), STC=Th.dmatrix('STC'),
                      N_spike=Th.dscalar('N_spike'), logprior=0, **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M,
    with a barrier on the log-det term and a prior.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    ldet = logdet(ImM)  # Th.log(det(ImM))
    return -0.5 * N_spike * (ldet + logprior
                             - 1. / (ldet + 250.)**2.
                             - Th.sum(Th.dot(matrix_inverse(ImM), theta) * theta)
                             + 2. * Th.sum(theta * STA)
                             + Th.sum(M * (STC + Th.outer(STA, STA))))
def LQLEP(theta=Th.dvector(), M=Th.dmatrix(), STA=Th.dvector(),
          STC=Th.dmatrix(), N_spike=Th.dscalar(), Cm1=Th.dmatrix(), **other):
    '''
    The actual Linear-Quadratic-Exponential-Poisson log-likelihood,
    as a function of theta and M, without any barriers or priors.
    '''
    # ImM = Th.identity_like(M) - (M + M.T) / 2
    ImM = Cm1 - (M + M.T) / 2
    ldet = logdet(ImM)
    LQLEP = -0.5 * N_spike * (ldet - logdet(Cm1)
                              - Th.sum(Th.dot(matrix_inverse(ImM), theta) * theta)
                              + 2. * Th.sum(theta * STA)
                              + Th.sum(M * (STC + Th.outer(STA, STA))))
    other.update(locals())
    return named(**other)
def get_rbfnet_learning_func(f_name):
    assert f_name == 'euclidean'
    X_matrix = T.dmatrix('X')
    W_matrix = T.dmatrix('W')
    b = T.scalar('b')
    C_scalar = T.scalar('C')
    y_vector = T.dvector('y')
    H_matrix = metric_theano[f_name](X_matrix, W_matrix)
    # Use the symbolic T.exp here; H_matrix is a Theano expression.
    H_rbf = T.exp(T.power(H_matrix, 2) * (-b))
    beta_matrix = T.dot(
        matrix_inverse(T.dot(H_rbf.T, H_rbf)
                       + 1.0 / C_scalar * T.eye(H_rbf.shape[1])),
        T.dot(H_rbf.T, y_vector).T)
    # beta_function = theano.function([H_matrix, C_scalar, y_vector], beta_matrix)
    rbfnet_learning_function = theano.function(
        [X_matrix, W_matrix, C_scalar, b, y_vector], beta_matrix)
    return rbfnet_learning_function
def step(filtered_mean, filtered_cov, smoothed_mean_p1, smoothed_cov_p1):
    f, F = filtered_mean, filtered_cov                   # (n, h), (n, h, h)
    hidden_mean = T.dot(f, A) + hidden_noise_mean        # (n, h)
    hidden_cov = stacked_dot(A.T, stacked_dot(F, A))     # (n, h, h)
    hidden_cov += hidden_noise_cov

    hidden_p1_hidden_cov = stacked_dot(A.T, F)           # (n, h, h)
    hidden_p1_hidden_cov_T = hidden_p1_hidden_cov.dimshuffle(0, 2, 1)
    inv_hidden_cov, _ = theano.map(
        lambda x: matrix_inverse(x), hidden_cov)         # (n, h, h)

    cov_rev = F - stacked_dot(
        stacked_dot(hidden_p1_hidden_cov_T, inv_hidden_cov),
        hidden_p1_hidden_cov)                            # (n, h, h)
    trans_rev = stacked_dot(hidden_p1_hidden_cov_T,      # (n, h, h)
                            inv_hidden_cov)
    mean_rev = f
    mean_rev -= (hidden_mean.dimshuffle(0, 'x', 1) * trans_rev  # (n, h)
                 ).sum(axis=2)

    # Turn these into matrices so they work with stacked_dot.
    smoothed_mean_p1 = smoothed_mean_p1.dimshuffle(0, 'x', 1)
    trans_rev_T = trans_rev.dimshuffle(0, 2, 1)
    smoothed_mean = stacked_dot(smoothed_mean_p1, trans_rev_T)
    smoothed_mean = smoothed_mean[0, :, :]
    smoothed_mean += mean_rev

    smoothed_cov = stacked_dot(trans_rev,
                               stacked_dot(smoothed_cov_p1, trans_rev_T))
    smoothed_cov += cov_rev

    return smoothed_mean, smoothed_cov
def s_nll(K, y, var_y, prior_var):
    """
    Marginal negative log likelihood of model

    K - gram matrix (matrix-like)
    y - the training targets (vector-like)
    var_y - the variance of uncertainty about y (vector-like)

    :note: See RW.pdf page 37, Eq. 2.30.
    """
    n = y.shape[0]
    rK = psd(prior_var * K + var_y * TT.eye(n))
    fit = .5 * dots(y, matrix_inverse(rK), y)
    complexity = 0.5 * TT.log(det(rK))
    normalization = n / 2.0 * TT.log(2 * np.pi)
    nll = fit + complexity + normalization
    return nll
def normal(X, m, C):
    """
    Evaluates the density of a normal distribution.

    @type  X: C{TensorVariable}
    @param X: matrix storing data points column-wise

    @type  m: C{ndarray}/C{TensorVariable}
    @param m: column vector representing the mean of the Gaussian

    @type  C: C{ndarray}/C{TensorVariable}
    @param C: covariance matrix

    @rtype: C{TensorVariable}
    @return: density of a Gaussian distribution evaluated at C{X}
    """
    Z = X - m
    return tt.exp(-tt.sum(Z * tt.dot(tl.matrix_inverse(C), Z), 0) / 2.
                  - tt.log(tl.det(C)) / 2.
                  - m.size / 2. * np.log(2. * np.pi))
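# A hedged NumPy/SciPy cross-check of the column-wise density above
# (standalone sketch; the example mean and covariance are arbitrary).
import numpy as np
from scipy.stats import multivariate_normal

m = np.array([1.0, -2.0])                             # assumed example mean
C = np.array([[2.0, 0.3], [0.3, 1.0]])                # assumed example covariance
X = np.random.default_rng(0).standard_normal((2, 5))  # points stored column-wise
Z = X - m[:, None]
dens = np.exp(-np.sum(Z * (np.linalg.inv(C) @ Z), axis=0) / 2.
              - np.log(np.linalg.det(C)) / 2.
              - m.size / 2. * np.log(2. * np.pi))
assert np.allclose(dens, multivariate_normal(m, C).pdf(X.T))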
def get_eem_learning_function(metric_name):
    W = T.dmatrix('W')
    X = T.dmatrix('X')
    H = metric_theano[metric_name](X, W)
    H_func = theano.function([X, W], H)

    C = T.scalar('C')
    H_plus = T.dmatrix('H_plus')
    H_minus = T.dmatrix('H_minus')
    sigma_plus = T.dmatrix('sigma_plus')
    sigma_minus = T.dmatrix('sigma_minus')
    sigma_plus_reg = sigma_plus + T.eye(sigma_plus.shape[1]) / 2 * C
    sigma_minus_reg = sigma_minus + T.eye(sigma_minus.shape[1]) / 2 * C
    m_plus = H_plus.mean(axis=0).T
    m_minus = H_minus.mean(axis=0).T
    mean_diff = m_plus - m_minus
    beta = (2 * T.dot(matrix_inverse(sigma_plus_reg + sigma_minus_reg), mean_diff)
            / mean_diff.norm(L=2))
    func = theano.function([H_plus, H_minus, sigma_plus, sigma_minus, C],
                           [beta, sigma_plus_reg, sigma_minus_reg, m_plus, m_minus])

    def eem_learning_function(X, W, y, C):
        the_H = H_func(X, W)
        the_H_plus = the_H[y == 1]
        the_H_minus = the_H[y == -1]
        the_sigma_plus = LedoitWolf(store_precision=False).fit(the_H_plus).covariance_
        the_sigma_minus = LedoitWolf(store_precision=False).fit(the_H_minus).covariance_
        if C is None:
            C = 0
        return func(the_H_plus, the_H_minus, the_sigma_plus, the_sigma_minus, C)

    return eem_learning_function
def _build_graph(self):
    """Sets up the gaussian process's tensor variables."""
    X = self.X
    Y = self.Y
    x = self.x
    reg = self.reg

    if self._normalize_y:
        Y_mean = T.mean(Y, axis=0)
        Y_std = T.std(Y, axis=0)
        Y = (Y - Y_mean) / Y_std

    # Kernel functions.
    K_ss = self._kernel(x, x)
    K_s = self._kernel(x, X)
    K = self._kernel(X, X) + self._sigma_n**2 * T.eye(X.shape[0])

    # Guarantee positive definiteness.
    K = 0.5 * (K + K.T) + reg * T.eye(K.shape[0])

    # Mean and variance functions.
    K_inv = sT.matrix_inverse(K)
    mu = T.dot(K_s, T.dot(K_inv, self.Y))  # Non-normalized Y for scale.
    var = K_ss - T.dot(K_s, T.dot(K_inv, K_s.T))

    # Compute the standard deviation.
    L = sT.cholesky(K)
    L_k = T.slinalg.solve_lower_triangular(L, K_s.T)
    std = T.sqrt(T.diag(K_ss) - T.sum(L_k**2, axis=0)).reshape((-1, 1))

    # Compute the log likelihood.
    log_likelihood_dims = -0.5 * T.dot(Y.T, T.dot(K_inv, Y)).sum(axis=0)
    log_likelihood_dims -= T.log(T.diag(L)).sum()
    log_likelihood_dims -= L.shape[0] / 2 * T.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(axis=-1)

    self._mu = mu
    self._var = var
    self._std = std
    self._log_likelihood = log_likelihood
def generate_optimize_basis():
    # original solution
    tx0 = partial.x
    # optimized solution
    tx1 = T.dot(tl.matrix_inverse(T.dot(partial.A.T, partial.A)),
                T.dot(partial.A.T, y) - gamma / 2 * partial.theta)

    # investigate zero crossings between tx0 and tx1
    tbetas = tx0 / (tx0 - tx1)
    # investigate tx1
    tbetas = T.concatenate([tbetas, [1.0]])
    # only between tx0 and inclusively tx1
    tbetas = tbetas[(T.lt(0, tbetas) * T.le(tbetas, 1)).nonzero()]

    txbs, _ = theano.map(lambda b: (1 - b) * tx0 + b * tx1, [tbetas])
    tlosses, _ = theano.map(loss, [txbs])

    # select the optimum
    txb = txbs[T.argmin(tlosses)]

    return theano.function([tpart, full.x, full.theta],
                           [T.set_subtensor(partial.x, txb),
                            T.set_subtensor(partial.theta, T.sgn(txb))])
def get_xelm_learning_function(f_name):
    # global xelm_learning_function
    X_matrix = T.dmatrix('X')
    W_matrix = T.dmatrix('W')
    # b_vector = T.dvector('b')
    w_vector = T.dvector('w')
    C_scalar = T.scalar('C')
    y_vector = T.dvector('y')
    H_matrix = metric_theano[f_name](X_matrix, W_matrix)
    Hw_matrix = H_matrix * w_vector.reshape((-1, 1))
    yw_vector = (y_vector * w_vector)
    beta_matrix = T.dot(
        matrix_inverse(T.dot(Hw_matrix.T, Hw_matrix)
                       + 1.0 / C_scalar * T.eye(Hw_matrix.shape[1])),
        T.dot(Hw_matrix.T, yw_vector).T)
    # beta_function = theano.function([H_matrix, C_scalar, y_vector], beta_matrix)
    xelm_learning_function = theano.function(
        [X_matrix, W_matrix, w_vector, C_scalar, y_vector], beta_matrix)
    return xelm_learning_function
def __init__(self, mu, sigma):
    """Constructor.

    Parameters
    ----------
    * `mu` [1d array]:
        The means.

    * `sigma` [2d array]:
        The covariance matrix.
    """
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self._make(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)             # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu), axis=1))).ravel()
    self._make(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self._make(self.nll_, "nll")

    # self.rvs_
    self._make(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def __init__(self, D, M, Q, Domain_number, m, pre_params, Pre_U,
             Hiddenlayerdim1, Hiddenlayerdim2):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    #mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = m[:M]
    self.test = Z_value
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D,
                                     n_out=Hiddenlayerdim1,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                          input=self.hiddenLayer_x.output,
                                          n_in=Hiddenlayerdim1,
                                          n_out=Hiddenlayerdim2,
                                          activation=T.nnet.relu, number='_h')
    self.hiddenLayer_m = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_hidden.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    #self.params.extend(mmd.params)

    self.hyp_params = {}
    for i in [self.mu, self.Sigma_b, self.ls]:
        self.hyp_params[str(i)] = i

    self.Z_params = {}
    for i in [self.Z]:
        self.Z_params[str(i)] = i

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    self.params.extend(self.hiddenLayer_hidden.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    for i, j in pre_params.items():
        self.wrt[i].set_value(j)
    for i, j in Pre_U.items():
        self.wrt[i].set_value(j)

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    # Mean and variance need different random draws, so name them separately.
    eps_NQ = srng.normal((N, Q))
    eps_M = srng.normal((M, D))

    beta = T.exp(self.ls)
    # For u the covariance is not diagonal, so build a triangular matrix
    # (as in a Cholesky factorization).
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    #Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    #Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    #Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, self.U)
    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = (self.log_mvn(self.X, mean_U, Covariance)
               - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, params, correct, samples=20, batch_size=None):
    ker = kernel()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size

    # File for saving the model.
    model_file_name = 'model2' + '.save'

    # If a model has been built before, load it.
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, m, S_b, mu, Sigma_b, Z, eps_NQ, eps_M = \
        T.dmatrices('X', 'Y', 'X_test', 'm', 'S_b', 'mu', 'Sigma_b', 'Z',
                    'eps_NQ', 'eps_M')
    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # Constrain the variables to positive values.
    beta = T.exp(ls[0])
    #beta = T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1 + Q])

    S = T.exp(S_b)
    #Sigma = T.exp(self.Sigma_b)

    # For x the covariance is diagonal, so no square root is needed.
    # For u it is not diagonal, so build a triangular matrix (as in a
    # Cholesky factorization).
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b))
                   + T.diag(T.exp(T.diag(Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet = theano.sandbox.linalg.det(Kmm)
    #KmmInv_cache = sT.matrix_inverse(Kmm)
    #self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    #self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # Note: the above compiles KmmInv_cache as a function of Z and lhyp, i.e.
    # the inverse becomes a function of Z and the hyperparameters.
    #self.update_KmmInv_cache()  # actually computes KmmInv with numeric values
    # Builds the derivative functions of the inverse matrix:
    #self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #               'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Modeling...')
    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta

    LL = (self.log_mvn(X, mean_U, Covariance)
          - 0.5 * beta * T.sum(T.eye(N) * Ktilda)) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print('Compiling model ...')
    inputs = {'X': X, 'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu,
              'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls,
              'eps_M': eps_M, 'eps_NQ': eps_NQ}
    # Solve a bug with derivatives wrt inputs not in the graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n,
                                 on_unused_input='ignore')
              for n, f in zip(['X', 'U', 'LL', 'KL_U', 'KL_X'],
                              [X, U, LL, KL_U, KL_X])}

    wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
           'lhyp': lhyp, 'ls': ls}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def __init__(self, rng, input_m, input_S, n_in, n_out, inducing_number,
             Domain_number=None, liklihood="Gaussian",
             Domain_consideration=True, number="1", kernel_name='X'):
    m = input_m
    self.cal = input_m
    S_0 = input_S
    self.N = m.shape[0]
    D = n_out
    Q = n_in
    M = inducing_number

    # Set initial values.
    ker = kernel(Q, kernel_name)
    self.kern = ker
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    Z_value = np.random.randn(M, Q)
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number,
                                 borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((100, self.N, Q))
    # Mean and variance need different random draws, so name them separately.
    eps_M = srng.normal((100, M, D))
    eps_ND = srng.normal((100, self.N, D))

    self.beta = T.exp(self.ls)
    # For u the covariance is not diagonal, so build a triangular matrix
    # (as in a Cholesky factorization).
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    #Xtilda = m[None,:,:] + S[None,:,:] * eps_NQ
    Xtilda, updates = theano.scan(fn=lambda a: m + S * a, sequences=[eps_NQ])

    #self.U = mu_scaled[None,:,:] + Sigma_scaled[None,:,:].dot(eps_M)
    self.U, updates = theano.scan(fn=lambda a: mu_scaled + Sigma_scaled.dot(a),
                                  sequences=[eps_M])

    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)

    Knn, updates = theano.scan(fn=lambda a: self.kern.RBF(a),
                               sequences=[Xtilda])
    Kmn, updates = theano.scan(fn=lambda a: self.kern.RBF(self.Z, a),
                               sequences=[Xtilda])
    #Kmn = ker.RBF(self.Z, Xtilda)
    #Knn = ker.RBF(Xtilda)

    Ktilda, updates = theano.scan(
        fn=lambda a, b: a - T.dot(b.T, T.dot(KmmInv, b)),
        sequences=[Knn, Kmn])
    #Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    F, updates = theano.scan(
        fn=lambda a, b, c, d: (T.dot(a.T, T.dot(KmmInv, b))
                               + T.dot(T.maximum(c, 1e-16)**0.5, d)),
        sequences=[Kmn, self.U, Ktilda, eps_ND])
    #F = T.dot(Kmn.T, T.dot(KmmInv, self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5, eps_ND)

    #Kinterval = T.dot(KmmInv, Kmn)
    self.mean_U = F
    #mean_U = T.dot(Kinterval.T, self.U)
    #A = Kinterval.T
    #Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
    #mean_tilda = T.dot(A, mu_scaled)
    #self.mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)

    self.output = self.mean_U
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def kreg(xtrain, ytrain, xtest):
    return kernmat(xtest, xtrain, sigma).dot(
        Tlina.matrix_inverse(kernmat(xtrain, xtrain, sigma)
                             + theta * T.eye(xtrain.shape[0])).dot(ytrain))
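# kreg above is the standard kernel ridge predictor k(x*, X)(K + theta*I)^{-1}y.
# A NumPy sketch of the same computation that solves the linear system instead
# of forming the inverse; rbf_kernel is an assumed stand-in for kernmat, which
# is defined elsewhere.
import numpy as np

def rbf_kernel(A, B, sigma=1.0):
    # Assumed Gaussian kernel, for illustration only.
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma**2))

def kreg_np(xtrain, ytrain, xtest, sigma=1.0, theta=1e-3):
    K = rbf_kernel(xtrain, xtrain, sigma) + theta * np.eye(len(xtrain))
    return rbf_kernel(xtest, xtrain, sigma) @ np.linalg.solve(K, ytrain)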
def _problem_MERLiNbp(self, icoh=False):
    '''
    Set up cost function and return the pymanopt problem of the
    MERLiNbp algorithm ([1], Algorithm 4) or the MERLiNbpicoh algorithm
    ([1], Algorithm 5)

    Input (default)
        - icoh (False)
          False = set up MERLiNbp, True = set up MERLiNbpicoh

    Sets/updates self._problem_MERLiNbp_val or self._problem_MERLiNbpicoh_val
    and the shared theano variables self._T_S, self._T_Vi, self._T_Vr,
    self._T_Fi, self._T_Fr, self._T_n
    '''
    if (not icoh and self._problem_MERLiNbp_val is None) or \
            (icoh and self._problem_MERLiNbpicoh_val is None):
        S = self._T_S = TS.shared(self._S)
        Vi = self._T_Vi = TS.shared(self._C[0])
        Vr = self._T_Vr = TS.shared(self._C[1])
        Fi = self._T_Fi = TS.shared(self._F[0].reshape(self._d, -1))
        Fr = self._T_Fr = TS.shared(self._F[1].reshape(self._d, -1))
        n = self._T_n = TS.shared(self._n)
        w = T.matrix()
        m = self._m

        # linear combination
        wFr = T.reshape(w.T.dot(Fr), (m, -1))  # m x n'
        wFi = T.reshape(w.T.dot(Fi), (m, -1))  # m x n'

        # replace zeros, since we're taking logs
        def unzero(x):
            return T.switch(T.eq(x, 0), 1, x)

        # bandpower
        def bp(re, im):
            return T.reshape(
                T.mean(T.log(unzero(T.sqrt(re * re + im * im))) - T.log(n),
                       axis=1), (m, 1))

        wFbp = bp(wFr, wFi)  # m x 1
        vFbp = bp(Vr, Vi)    # m x 1

        # centering matrix
        I = T.eye(m, m)
        H = I - T.mean(I)
        # column-centered data
        X = H.dot(T.concatenate([S, vFbp, wFbp], axis=1))  # m x 3
        # covariance matrix
        S = X.T.dot(X) / (m - 1)
        # precision matrix
        prec = Tlina.matrix_inverse(S)

        # MERLiNbpicoh
        if icoh:
            # complex row-wise vdot
            # (x+yi)(u+vi) = (xu-yv)+(xv+yu)i
            # vdot i.e. -v instead of +v
            def vdot(x, y, u, v):
                return x * u + y * v

            def vdoti(x, y, u, v):
                return -x * v + y * u

            def cross(x, y, u, v):
                return T.sum(vdot(x, y, u, v), axis=0) / m

            def crossi(x, y, u, v):
                return T.sum(vdoti(x, y, u, v), axis=0) / m

            def sqrtcross(x, y):
                return T.sqrt(cross(x, y, x, y) + crossi(x, y, x, y))

            icoherency = crossi(Vr, Vi, wFr, wFi) / (
                sqrtcross(Vr, Vi) * sqrtcross(wFr, wFi))  # n'
            cost = -(T.abs_(T.sum(icoherency)) * T.abs_(prec[1, 2])
                     - T.abs_(prec[0, 2]))
            self._problem_MERLiNbpicoh_val = Problem(manifold=None, cost=cost,
                                                     arg=w, verbosity=0)
        # MERLiNbp
        else:
            cost = -(T.abs_(prec[1, 2]) - T.abs_(prec[0, 2]))
            self._problem_MERLiNbp_val = Problem(manifold=None, cost=cost,
                                                 arg=w, verbosity=0)
    else:
        self._T_S.set_value(self._S)
        self._T_Vi.set_value(self._C[0])
        self._T_Vr.set_value(self._C[1])
        self._T_Fi.set_value(self._F[0].reshape(self._d, -1))
        self._T_Fr.set_value(self._F[1].reshape(self._d, -1))
        self._T_n.set_value(self._n)

    if not icoh:
        return self._problem_MERLiNbp_val
    else:
        return self._problem_MERLiNbpicoh_val
def __init__(self, init_w):
    self.w = sharedX(init_w)
    self.b = sharedX(0.)
    params = [self.w]

    X = T.matrix()
    y = T.vector()
    X.tag.test_value = np.zeros((100, 784), dtype='float32')
    y.tag.test_value = np.zeros((100,), dtype='float32')

    self.cost = function([X, y], self.cost_samples(X, y).mean())

    alpha = T.scalar()
    alpha.tag.test_value = 1.

    cost_samples = self.cost_samples(X, y)
    assert cost_samples.ndim == 1
    cost = cost_samples.mean()
    assert cost.ndim == 0

    updates = {}
    for param in params:
        updates[param] = param - alpha * T.grad(cost, param)
    self.sgd_step = function([X, y, alpha], updates=updates)

    num_samples = cost_samples.shape[0]
    cost_variance = T.sqr(cost_samples - cost).sum() / (num_samples - 1)
    cost_std = T.sqrt(cost_variance)
    assert cost_std.ndim == 0
    caution = -2.
    bound = cost + caution * cost_std / T.sqrt(num_samples)

    updates = {}
    for param in params:
        updates[param] = param - alpha * T.grad(cost, param)
    self.do_step = function([X, y, alpha], updates=updates)

    self.experimental_step = function(
        [X, y, alpha],
        updates={self.w: self.w - alpha * T.grad(bound, param)})

    alphas = T.vector()
    alphas.tag.test_value = np.ones((2,), dtype='float32')

    # also tried using grad of bound instead of cost
    # (got to change it in do_step as well)
    W = self.w.dimshuffle(0, 'x') \
        - T.grad(cost, self.w).dimshuffle(0, 'x') * alphas.dimshuffle('x', 0)
    B = self.b.dimshuffle('x') - T.grad(cost, self.b).dimshuffle('x') * alphas
    Z = T.dot(X, W) + B
    C = y.dimshuffle(0, 'x') * T.nnet.softplus(-Z) \
        + (1 - y.dimshuffle(0, 'x')) * T.nnet.softplus(Z)
    means = C.mean(axis=0)
    variances = T.sqr(C - means).sum(axis=0) / (num_samples - 1)
    stds = T.sqrt(variances)
    bounds = means + caution * stds / T.sqrt(num_samples)
    self.eval_bounds = function([X, y, alphas], bounds)

    W = T.concatenate([self.w.dimshuffle('x', 0)] * batch_size, axis=0)
    z = (X * W).sum(axis=1)
    C = y * T.nnet.softplus(-z) + (1 - y) * T.nnet.softplus(z)
    grad_W = T.grad(C.sum(), W)
    zero_mean = grad_W - grad_W.mean()
    cov = T.dot(zero_mean.T, zero_mean)

    from theano.sandbox.linalg import matrix_inverse
    inv = matrix_inverse(cov + np.identity(784).astype('float32') * .01)
    self.nat_grad_step = function(
        [X, y, alpha],
        updates={self.w: self.w - alpha * T.dot(inv, T.grad(cost, self.w))})
def __init__(self, params, correct, Xinfo, samples=500, batch_size=None):
    ker = kernel()
    mmd = MMD()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size
    self.Xlabel_value = Xinfo["Xlabel_value"]
    self.Weight_value = Xinfo["Weight_value"]

    # File for saving the model.
    model_file_name = 'model_MMD_kernel' + '.save'

    # If a model has been built before, load it.
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, m, S_b, mu, Sigma_b, Z, eps_NQ, eps_M = \
        T.dmatrices('X', 'Y', 'X_test', 'm', 'S_b', 'mu', 'Sigma_b', 'Z',
                    'eps_NQ', 'eps_M')
    Xlabel = T.dmatrix('Xlabel')
    Zlabel = T.dmatrix('Zlabel')
    # Labels are probabilities, so they are positive and normalized.
    Zlabel_T = T.exp(Zlabel) / T.sum(T.exp(Zlabel), 1)[:, None]
    Weight = T.dmatrix('Weight')

    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')
    ga = T.dvector('ga')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # Constrain the variables to positive values.
    beta = T.exp(ls)
    gamma = T.exp(ga[0])
    #beta = T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1 + Q])

    S = T.exp(S_b)
    #Sigma = T.exp(self.Sigma_b)

    # For x the covariance is diagonal, so no square root is needed.
    # For u it is not diagonal, so build a triangular matrix (as in a
    # Cholesky factorization).
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b))
                   + T.diag(T.exp(T.diag(Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z)
    Kmm = mmd.MMD_kenel_Xonly(gamma, Zlabel_T, Kmm, Weight)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet = theano.sandbox.linalg.det(Kmm)
    #KmmInv_cache = sT.matrix_inverse(Kmm)
    #self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    #self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # Note: the above compiles KmmInv_cache as a function of Z and lhyp, i.e.
    # the inverse becomes a function of Z and the hyperparameters.
    #self.update_KmmInv_cache()  # actually computes KmmInv with numeric values
    # Builds the derivative functions of the inverse matrix:
    #self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #               'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Modeling...')
    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(gamma, Zlabel_T, Xlabel, Kmn, Weight)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Knn = mmd.MMD_kenel_Xonly(gamma, Xlabel, Knn, Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, U)
    betaI = T.diag(T.dot(Xlabel, beta))
    Covariance = betaI

    LL = (self.log_mvn(X, mean_U, Covariance)
          - 0.5 * T.sum(T.dot(betaI, Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print('Compiling model ...')
    inputs = {'X': X, 'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
              'lhyp': lhyp, 'ls': ls, 'eps_M': eps_M, 'eps_NQ': eps_NQ,
              'ga': ga, 'Zlabel': Zlabel, 'Weight': Weight, 'Xlabel': Xlabel}
    # Solve a bug with derivatives wrt inputs not in the graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n,
                                 on_unused_input='ignore')
              for n, f in zip(['X', 'U', 'LL', 'KL_U', 'KL_X'],
                              [X, U, LL, KL_U, KL_X])}

    wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b,
           'lhyp': lhyp, 'ls': ls, 'ga': ga, 'Zlabel': Zlabel}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(10000)
        pickle.dump([self.f, self.g], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def invM(M=Th.dmatrix('M'), **result):
    return matrix_inverse(Th.identity_like(M) - (M + M.T) / 2)
def s_variance(K, y, var_y, prior_var, K_new, var_min):
    rK = psd(prior_var * K + var_y * TT.eye(y.shape[0]))
    L = cholesky(rK)
    v = dots(matrix_inverse(L), prior_var * K_new)
    var_x = TT.maximum(prior_var - (v ** 2).sum(axis=0), var_min)
    return var_x
def __init__(self, rng, target, input_m, input_S, n_in, n_out,
             inducing_number, Domain_number, Xlabel, liklihood="Gaussian",
             Domain_consideration=True, number="1"):
    m = input_m
    S_0 = input_S
    N = m.shape[0]
    D = n_out
    Q = n_in
    M = inducing_number

    # Set initial values.
    ker = kernel(Q)
    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))
    Z_value = np.random.randn(M, Q)
    if Domain_consideration:
        ls_value = np.zeros(Domain_number) + np.log(0.1)
    else:
        ls_value = np.zeros(1) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu' + number, borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b' + number,
                                 borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z' + number, borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]
    self.params.extend(ker.params)

    self.hyp_params_list = [self.mu, self.Sigma_b, self.ls]
    self.Z_params_list = [self.Z]
    self.global_params_list = self.params

    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    # Mean and variance need different random draws, so name them separately.
    eps_M = srng.normal((M, D))
    eps_ND = srng.normal((N, D))

    beta = T.exp(self.ls)
    # For u the covariance is not diagonal, so build a triangular matrix
    # (as in a Cholesky factorization).
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b))
                   + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Rescale.
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    KmmInv = sT.matrix_inverse(Kmm)
    Kmn = ker.RBF(self.Z, Xtilda)
    Knn = ker.RBF(Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    #F = T.dot(Kmn.T, T.dot(KmmInv, self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5, eps_ND)
    Kinterval = T.dot(KmmInv, Kmn)

    A = Kinterval.T
    Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
    mean_tilda = T.dot(A, mu_scaled)
    #mean_U = F
    #mean_U = T.dot(Kinterval.T, self.U)
    mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)

    betaI = T.diag(T.dot(Xlabel, beta))
    Covariance = betaI

    self.output = mean_U
    self.LL = self.log_mvn(target, mean_U, Covariance) / N  # - 0.5*T.sum(T.dot(betaI, Ktilda))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
def __init__(self, params, sx2=1, linear_model=False, samples=20, use_hat=False):
    ker, self.samples, self.params, self.KmmInv = kernel(), samples, params, {}
    self.use_hat = use_hat

    model_file_name = 'model' + ('_hat' if use_hat else '') + ('_linear' if linear_model else '') + '.save'

    # Load a previously compiled model if one exists.
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d = obj
            self.update_KmmInv_cache()
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    Y, Z, m, ls, mu, lL, eps_MK, eps_NQ, eps_NK, KmmInv = T.dmatrices(
        'Y', 'Z', 'm', 'ls', 'mu', 'lL', 'eps_MK', 'eps_NQ', 'eps_NK', 'KmmInv')
    lhyp = T.dvector('lhyp')
    (M, K), N, Q = mu.shape, m.shape[0], Z.shape[1]

    # Positive quantities are parameterised on the log scale.
    s, sl2, sf2, l = T.exp(ls), T.exp(lhyp[0]), T.exp(lhyp[1]), T.exp(lhyp[2:2+Q])
    # Build a lower-triangular factor from lL, exponentiating the diagonal to keep it positive.
    L = T.tril(lL - T.diag(T.diag(lL)) + T.diag(T.exp(T.diag(lL))))

    print('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z) if not linear_model else ker.LIN(sl2, Z)
    KmmInv_cache = sT.matrix_inverse(Kmm)
    self.f_Kmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    self.update_KmmInv_cache()
    self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
                   'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Setting up model...')
    if not self.use_hat:
        mu_scaled, L_scaled = sf2**0.5 * mu, sf2**0.5 * L
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)

        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        A = KmmInv.dot(Kmn)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = A.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(KmmInv.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                + K * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T)) - M
                       - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                       + 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0
        #KL_U = -0.5 * T.sum(T.sum(mu_scaled * KmmInv.dot(mu_scaled), 0) + T.sum(KmmInv * L_scaled.dot(L_scaled.T)) - M
        #    - 2.0*T.sum(T.log(T.diag(L_scaled))) + 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm))))) if not linear_model else 0
    else:
        # mu_scaled, L_scaled = mu / sf2**0.5, L / sf2**0.5
        mu_scaled, L_scaled = mu / sf2, L / sf2
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)

        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = Kmn.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(Kmm.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                + K * (T.sum(Kmm.T * L_scaled.dot(L_scaled.T)) - M
                       - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                       - 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0

    KL_X_all = -0.5 * T.sum((m**2.0 + s**2.0) / sx2 - 1.0 - 2.0 * ls + T.log(sx2), 1)
    KL_X = T.sum(KL_X_all)

    print('Compiling...')

    inputs = {'Y': Y, 'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp, 'KmmInv': KmmInv,
              'eps_MK': eps_MK, 'eps_NQ': eps_NQ, 'eps_NK': eps_NK}
    # Work around a bug with derivatives w.r.t. inputs that do not appear in a given graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    f = list(zip(['X', 'U', 'S', 'LS', 'KL_U', 'KL_X', 'KL_X_all'], [X, U, S, LS, KL_U, KL_X, KL_X_all]))
    self.f = {n: theano.function(list(inputs.values()), fv + z, name=n, on_unused_input='ignore')
              for n, fv in f}
    g = list(zip(['LS', 'KL_U', 'KL_X'], [LS, KL_U, KL_X]))
    wrt = {'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp, 'KmmInv': KmmInv}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv), name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore') for gn, gv in g}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
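# update_KmmInv_cache is called above but not shown in this section. A minimal
# sketch, assuming self.params is a dict holding the current numeric values of
# 'Z' and 'lhyp': it re-evaluates the compiled inverse at those values so the
# cached KmmInv stays consistent with the current hyperparameters.
def update_KmmInv_cache(self):
    self.KmmInv = self.f_KmmInv(self.params['Z'], self.params['lhyp'])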
def __init__(self, params, correct, samples=500, batch_size=None):
    ker = kernel()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size

    # File used to cache the compiled model.
    model_file_name = 'model2' + '.save'
    # Load a previously compiled model if one exists.
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, mu, Sigma_b, Z, eps_NQ, eps_M = \
        T.dmatrices('X', 'Y', 'X_test', 'mu', 'Sigma_b', 'Z', 'eps_NQ', 'eps_M')
    Wx, Ws, Wu = T.dmatrices('Wx', 'Ws', 'Wu')
    bx, bs, bu = T.dvectors('bx', 'bs', 'bu')
    gamma_x, beta_x, gamma_u, beta_u, gamma_s, beta_s = \
        T.dvectors('gamma_x', 'beta_x', 'gamma_u', 'beta_u', 'gamma_s', 'beta_s')
    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')

    (M, D), N, Q = Z.shape, X.shape[0], X.shape[1]

    # Constrain parameters to be positive via log-parameterisation.
    beta = T.exp(ls[0])
    #beta = T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1+Q])

    #Sigma = T.exp(self.Sigma_b)
    # For X the covariance is diagonal, so no triangular factor is needed even without
    # a square root; for U it is not diagonal, so build a lower-triangular
    # (Cholesky-like) factor from Sigma_b.
    Sigma = T.tril(Sigma_b - T.diag(T.diag(Sigma_b)) + T.diag(T.exp(T.diag(Sigma_b))))

    # Scale by the kernel signal variance.
    mu_scaled, Sigma_scaled = sf2**0.5 * mu, sf2**0.5 * Sigma

    # Recognition network: hidden layers producing the latent mean and variance.
    out1 = self.neural_net_predict(Wx, bx, gamma_x, beta_x, X)
    m = self.neural_net_predict(Wu, bu, gamma_u, beta_u, out1)
    S = self.neural_net_predict(Ws, bs, gamma_s, beta_s, out1)
    #out1 = T.dot(X, Wx) + bx
    #m = T.dot(out1, Wu) + bu
    #S = T.dot(out1, Ws) + bs
    S = T.exp(S)
    S = T.sqrt(S)

    Xtilda = m + S * eps_NQ
    U = mu_scaled + Sigma_scaled.dot(eps_M)

    print('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet = theano.sandbox.linalg.det(Kmm)
    #KmmInv_cache = sT.matrix_inverse(Kmm)
    #self.fKmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    #self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    # Note: the lines above would compile KmmInv_cache as a function of Z and lhyp,
    # i.e. the inverse becomes a function of Z and the hyperparameters;
    # update_KmmInv_cache() would then evaluate it at the current numeric values.
    #self.update_KmmInv_cache()
    # The lines below would build the derivative functions of the inverse.
    #self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'),
    #               'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')}

    print('Modeling...')
    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)
    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta

    LL = (self.log_mvn(X, mean_U, Covariance) - 0.5 * beta * T.sum((T.eye(N) * Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)

    print('Compiling model ...')

    inputs = {'X': X, 'Z': Z, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls,
              'eps_M': eps_M, 'eps_NQ': eps_NQ,
              'Wx': Wx, 'bx': bx, 'Wu': Wu, 'bu': bu, 'Ws': Ws, 'bs': bs,
              'gamma_x': gamma_x, 'beta_x': beta_x, 'gamma_u': gamma_u, 'beta_u': beta_u,
              'gamma_s': gamma_s, 'beta_s': beta_s}
    # Work around a bug with derivatives w.r.t. inputs that do not appear in a given graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    self.f = {n: theano.function(list(inputs.values()), fv + z, name=n, on_unused_input='ignore')
              for n, fv in zip(['Xtilda', 'U', 'LL', 'KL_U', 'KL_X'], [Xtilda, U, LL, KL_U, KL_X])}
    wrt = {'Z': Z, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls,
           'Wx': Wx, 'bx': bx, 'Wu': Wu, 'bu': bu, 'Ws': Ws, 'bs': bs,
           'gamma_x': gamma_x, 'beta_x': beta_x, 'gamma_u': gamma_u, 'beta_u': beta_u,
           'gamma_s': gamma_s, 'beta_s': beta_s}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv), name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'], [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g], file_handle, protocol=pickle.HIGHEST_PROTOCOL)
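# log_mvn is called above with a scalar precision (Covariance = beta). A minimal
# sketch under that assumption: the log density of N x D Gaussian observations with
# mean mean_U and isotropic precision beta, which is consistent with the
# -0.5*beta*trace(Ktilda) correction term used in LL above. This is an
# illustration, not necessarily the original implementation.
def log_mvn(self, X, mean, beta):
    N = X.shape[0]
    D = X.shape[1]
    return (-0.5 * N * D * T.log(2.0 * np.pi)
            + 0.5 * N * D * T.log(beta)
            - 0.5 * beta * T.sum((X - mean)**2))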
def __init__(self, D, M, Q, Domain_number):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = np.random.randn(M, Q)
    self.test = Z_value
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=20,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=20, n_out=Q,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=20, n_out=Q,
                                     activation=T.nnet.relu, number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    self.params.extend(mmd.params)

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    eps_M = srng.normal((M, D))  # the mean and the variance need independent noise, so they are drawn separately

    beta = T.exp(self.ls)
    # U's covariance is not diagonal, so build a lower-triangular (Cholesky-like)
    # factor from Sigma_b.
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Scale by the kernel signal variance.
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)
    mean_U = T.dot(Kinterval.T, self.U)

    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = (self.log_mvn(self.X, mean_U, Covariance) - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
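# KLD_U is called by every class above but not defined in this section. A minimal
# sketch, assuming q(u_d) = N(mu[:, d], Sigma Sigma^T) for each of the D output
# columns and the prior p(u_d) = N(0, Kmm), with Sigma lower triangular; this is
# the standard Gaussian KL divergence, not necessarily the original implementation.
def KLD_U(self, mu, Sigma, Kmm, KmmInv):
    M = mu.shape[0]
    D = mu.shape[1]
    logdet_Kmm = 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))
    logdet_q = 2.0 * T.sum(T.log(T.diag(Sigma)))          # log|Sigma Sigma^T|
    trace_term = T.sum(KmmInv * T.dot(Sigma, Sigma.T))    # tr(Kmm^-1 Sigma Sigma^T); KmmInv is symmetric
    mahalanobis = T.sum(mu * T.dot(KmmInv, mu))           # sum_d mu_d^T Kmm^-1 mu_d
    return 0.5 * (D * (trace_term - M + logdet_Kmm - logdet_q) + mahalanobis)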