def L_op(self, inputs, outputs, gradients):
    # Modified from theano/tensor/slinalg.py
    # No handling for on_error = 'nan'
    dz = gradients[0]
    chol_x = outputs[0]

    # this is for nan mode
    #
    # ok = ~tensor.any(tensor.isnan(chol_x))
    # chol_x = tensor.switch(ok, chol_x, 1)
    # dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return gpu_solve_upper_triangular(
            outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
    return [grad]
def marginalize_over_v_z(self, h):
    # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})
    # In theory we should use the following line:
    #   energy = (h * self.b).T
    # However, under broadcasting, Theano's element-wise multiplication of
    # np.NaN with 0 yields 0 instead of np.NaN, so we use T.tensordot and
    # T.diagonal as a workaround.
    # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
    energy = T.tensordot(h, self.b, axes=0)
    energy = T.diagonal(energy, axis1=1, axis2=2).T

    if self.penalty == "softplus_bi":
        energy = energy - self.beta * T.log(1 + T.exp(self.b))[:, None]
    elif self.penalty == "softplus0":
        energy = energy - self.beta * T.log(1 + T.exp(0))[:, None]
    else:
        raise NameError("Invalid penalty term")

    energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
    energy = T.sum(energy, axis=0, keepdims=True).T

    ener = T.tensordot(h, self.W, axes=0)
    ener = T.diagonal(ener, axis1=1, axis2=2)
    ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
    ener = T.sum(ener, axis=2) + self.c[None, :]
    ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)
    return -(energy + ener)
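# A minimal standalone sketch of the tensordot/diagonal workaround used
# above. All names here are illustrative, not from the original source;
# whether the direct broadcasted product actually swallows the NaN depends
# on the Theano version and its graph optimizations, which is exactly what
# issue #3848 is about.
import numpy as np
import theano
import theano.tensor as T

h = T.matrix('h')  # (batch, units), may contain NaN
b = T.vector('b')  # (units,)

# Direct broadcasted product: an optimized elementwise path may turn
# NaN * 0 into 0 on affected Theano versions.
direct = (h * b).T

# Workaround: outer product via tensordot, then the per-sample diagonal
# of the resulting (units, units) blocks. The NaN entries survive.
workaround = T.diagonal(T.tensordot(h, b, axes=0), axis1=1, axis2=2).T

f = theano.function([h, b], [direct, workaround])
h_val = np.array([[np.nan, 2.0]], dtype=theano.config.floatX)
b_val = np.array([0.0, 3.0], dtype=theano.config.floatX)
print(f(h_val, b_val))  # the workaround output keeps the NaN in slot 0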
def rbf_mmd2(X, Y, sigma=0, biased=True):
    gamma = 1 / (2 * sigma**2)

    XX = T.dot(X, X.T)
    XY = T.dot(X, Y.T)
    YY = T.dot(Y, Y.T)

    X_sqnorms = T.diagonal(XX)
    Y_sqnorms = T.diagonal(YY)

    K_XY = T.exp(-gamma * (
        -2 * XY + X_sqnorms[:, np.newaxis] + Y_sqnorms[np.newaxis, :]))
    K_XX = T.exp(-gamma * (
        -2 * XX + X_sqnorms[:, np.newaxis] + X_sqnorms[np.newaxis, :]))
    K_YY = T.exp(-gamma * (
        -2 * YY + Y_sqnorms[:, np.newaxis] + Y_sqnorms[np.newaxis, :]))

    if biased:
        mmd2 = K_XX.mean() + K_YY.mean() - 2 * K_XY.mean()
    else:
        m = K_XX.shape[0]
        n = K_YY.shape[0]
        mmd2 = ((K_XX.sum() - m) / (m * (m - 1))
                + (K_YY.sum() - n) / (n * (n - 1))
                - 2 * K_XY.mean())

    return mmd2, mmd2
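# Usage sketch for rbf_mmd2 (my wiring, not from the original source).
# Note that sigma must be nonzero; the default of 0 would divide by zero.
import numpy as np
import theano
import theano.tensor as T

X, Y = T.matrices('X', 'Y')
mmd2, _ = rbf_mmd2(X, Y, sigma=1.0, biased=True)
mmd2_fn = theano.function([X, Y], mmd2)

rng = np.random.RandomState(0)
cast = lambda a: a.astype(theano.config.floatX)
same = mmd2_fn(cast(rng.randn(100, 5)), cast(rng.randn(100, 5)))
shifted = mmd2_fn(cast(rng.randn(100, 5)), cast(rng.randn(100, 5) + 1.0))
print(same, shifted)  # the shifted pair yields a clearly larger MMD^2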
def _mmd2_and_variance(K_XX, K_XY, K_YY, unit_diagonal=False, biased=False):
    m = K_XX.shape[0]  # Assumes X, Y are same shape

    ### Get the various sums of kernels that we'll use
    # Kts drop the diagonal, but we don't need to compute them explicitly
    if unit_diagonal:
        diag_X = diag_Y = 1
        sum_diag_X = sum_diag_Y = m
        sum_diag2_X = sum_diag2_Y = m
    else:
        diag_X = T.diagonal(K_XX)
        diag_Y = T.diagonal(K_YY)

        sum_diag_X = diag_X.sum()
        sum_diag_Y = diag_Y.sum()

        sum_diag2_X = diag_X.dot(diag_X)
        sum_diag2_Y = diag_Y.dot(diag_Y)

    Kt_XX_sums = K_XX.sum(axis=1) - diag_X
    Kt_YY_sums = K_YY.sum(axis=1) - diag_Y
    K_XY_sums_0 = K_XY.sum(axis=0)
    K_XY_sums_1 = K_XY.sum(axis=1)

    Kt_XX_sum = Kt_XX_sums.sum()
    Kt_YY_sum = Kt_YY_sums.sum()
    K_XY_sum = K_XY_sums_0.sum()

    # TODO: turn these into dot products?
    # should figure out if that's faster or not on GPU / with theano...
    Kt_XX_2_sum = (K_XX**2).sum() - sum_diag2_X
    Kt_YY_2_sum = (K_YY**2).sum() - sum_diag2_Y
    K_XY_2_sum = (K_XY**2).sum()

    if biased:
        mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m)
                + (Kt_YY_sum + sum_diag_Y) / (m * m)
                - 2 * K_XY_sum / (m * m))
    else:
        mmd2 = (Kt_XX_sum / (m * (m - 1))
                + Kt_YY_sum / (m * (m - 1))
                - 2 * K_XY_sum / (m * m))

    var_est = (
        2 / (m**2 * (m - 1)**2) * (
            2 * Kt_XX_sums.dot(Kt_XX_sums) - Kt_XX_2_sum
            + 2 * Kt_YY_sums.dot(Kt_YY_sums) - Kt_YY_2_sum)
        - (4 * m - 6) / (m**3 * (m - 1)**3) * (Kt_XX_sum**2 + Kt_YY_sum**2)
        + 4 * (m - 2) / (m**3 * (m - 1)**2) * (
            K_XY_sums_1.dot(K_XY_sums_1) + K_XY_sums_0.dot(K_XY_sums_0))
        - 4 * (m - 3) / (m**3 * (m - 1)**2) * K_XY_2_sum
        - (8 * m - 12) / (m**5 * (m - 1)) * K_XY_sum**2
        + 8 / (m**3 * (m - 1)) * (
            1 / m * (Kt_XX_sum + Kt_YY_sum) * K_XY_sum
            - Kt_XX_sums.dot(K_XY_sums_1)
            - Kt_YY_sums.dot(K_XY_sums_0))
    )

    return mmd2, var_est
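# Sanity check for the unbiased branch of _mmd2_and_variance against a
# naive reference: MMD_u^2 = sum_{i != j} K_XX[i, j] / (m(m-1)) + the same
# for K_YY, minus 2 * mean(K_XY). A sketch with hand-built numpy kernels;
# none of this is from the original source.
import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(0)
m = 8
X, Y = rng.randn(m, 3), rng.randn(m, 3)

def rbf(A, B, sigma=1.0):
    d2 = ((A[:, None, :] - B[None, :, :])**2).sum(-1)
    return np.exp(-d2 / (2 * sigma**2))

K_XX, K_XY, K_YY = rbf(X, X), rbf(X, Y), rbf(Y, Y)

kxx, kxy, kyy = T.dmatrices('kxx', 'kxy', 'kyy')
mmd2, var_est = _mmd2_and_variance(kxx, kxy, kyy, biased=False)
val = theano.function([kxx, kxy, kyy], mmd2)(K_XX, K_XY, K_YY)

off_diag_mean = lambda K: (K.sum() - np.trace(K)) / (m * (m - 1))
ref = off_diag_mean(K_XX) + off_diag_mean(K_YY) - 2 * K_XY.mean()
assert np.allclose(val, ref)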
def get_output_for(self, input, **kwargs):
    gamma = 1 / (2 * T.exp(2 * self.log_sigma))

    XX = T.dot(input, input.T)
    XY = T.dot(input, self.locs.T)
    YY = T.dot(self.locs, self.locs.T)  # cache this somehow?

    X_sqnorms = T.diagonal(XX)
    Y_sqnorms = T.diagonal(YY)

    return T.exp(-gamma * (
        -2 * XY + X_sqnorms[:, np.newaxis] + Y_sqnorms[np.newaxis, :]))
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # Replace the cholesky decomposition with 1 if there are nans
    # or solve_upper_triangular will throw a ValueError.
    if self.on_error == 'nan':
        ok = ~tensor.any(tensor.isnan(chol_x))
        chol_x = tensor.switch(ok, chol_x, 1)
        dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

    if self.on_error == 'nan':
        return [tensor.switch(ok, grad, np.nan)]
    else:
        return [grad]
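# The reverse-mode expression can be checked against finite differences
# with theano.gradient.verify_grad on a well-conditioned SPD input. This
# sketch exercises the stock theano.tensor.slinalg.cholesky op, which
# carries a grad implementation of this form (my test harness, not from
# the original source).
import numpy as np
import theano.gradient
from theano.tensor.slinalg import cholesky

rng = np.random.RandomState(42)
A = rng.randn(4, 4)
spd = A.dot(A.T) + 4 * np.eye(4)  # safely positive definite

# Raises if the symbolic gradient disagrees with the numeric estimate.
theano.gradient.verify_grad(cholesky, [spd], rng=rng)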
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))]
def compile_adapt_f(self, signals):
    x = self.signal(signals)
    x_prev = [p.signal(signals) for p in self.prev]
    assert np.all([x.k == xp.k for xp in x_prev])
    assert self.m == [xp.n for xp in x_prev]
    assert x.n == self.n
    k = np.float32(x.k)

    # Modulate x
    if x.modulation is not None:
        x_ = x.var * T.as_tensor_variable(x.modulation)
    else:
        x_ = x.var

    updates = []
    upd = lambda en, old, new: [(old, ifelse(en, new, old))]

    E_XX_new, _, d = lerp(self.E_XX, T.dot(x_, x_.T) / k, self.min_tau)
    updates += upd(self.enabled, self.E_XX, E_XX_new)

    b = 1.
    d = T.diagonal(E_XX_new)
    stiff = T.scalar('stiffness', dtype=FLOATX)
    Q_new = theano_diag(
        b / T.where(d < stiff * self.stiffx, stiff * self.stiffx, d))
    updates += upd(self.enabled, self.Q, Q_new)

    for i, x_p in enumerate(x_prev):
        E_XU_new, _, d_ = lerp(self.E_XU[i], T.dot(x_, x_p.var.T) / k,
                               self.min_tau)
        updates += upd(self.enabled, self.E_XU[i], E_XU_new)
        d = T.maximum(d, d_)
        updates += upd(self.enabled, self.phi[i], T.dot(Q_new, E_XU_new).T)

    self.info('Compile layer update between: ' + self.name + ' and ' +
              ', '.join([p.name for p in self.prev]))
    return theano.function(inputs=[stiff], outputs=d, updates=updates)
def free_energy(self, v_sample):
    wx_b = T.dot(v_sample, self.W) + self.hbias
    vbias_term = 0.5 * T.dot((v_sample - self.vbias),
                             (v_sample - self.vbias).T)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    # return -hidden_term - vbias_term
    return -hidden_term - T.diagonal(vbias_term)
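# Only the diagonal of the (batch, batch) product above is ever used, and
# diag((v - b)(v - b)^T)_i = sum_j (v_ij - b_j)^2, so the same free energy
# can be computed without the quadratic-in-batch-size intermediate. A
# hedged equivalent, written as a free function for illustration:
def free_energy_diagfree(v_sample, W, hbias, vbias):
    wx_b = T.dot(v_sample, W) + hbias
    vbias_term = 0.5 * T.sum(T.sqr(v_sample - vbias), axis=1)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    return -hidden_term - vbias_term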
def free_energy(self, v_sample):  # rewritten
    wx_b = T.dot(v_sample, self.W) + self.hbias
    # This term was changed: it used to be a plain dot product, and is now
    # the squared difference.
    vbias_term = 0.5 * T.dot((v_sample - self.vbias),
                             (v_sample - self.vbias).T)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    return -hidden_term - T.diagonal(vbias_term)
def version1(self):
    Fhat = self.feedForward(self.dot())
    latFhat = T.dot(T.abs_(Fhat.T), self.distMat.T)
    latFhat = T.dot(latFhat, T.abs_(Fhat))
    self.latFhat = T.diagonal(latFhat)
    return self.latFhat
def logprob(x, m, S):
    delta = x - m
    L = cholesky(S)
    beta = solve_lower_triangular(L, delta.T).T
    lp = -0.5 * tt.square(beta).sum(-1)
    lp -= tt.sum(tt.log(tt.diagonal(L)))
    lp -= (0.5 * m.size * tt.log(2 * np.pi)).astype(theano.config.floatX)
    return lp
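# logprob implements the usual Cholesky form of the Gaussian log-density:
# log N(x; m, S) = -0.5 * ||L^{-1}(x - m)||^2 - sum_i log L_ii - (k/2) log 2*pi.
# A quick check against scipy (my test harness, assuming cholesky and
# solve_lower_triangular come from theano.tensor.slinalg):
import numpy as np
import theano
import theano.tensor as tt
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
k = 3
A = rng.randn(k, k)
S_val = A.dot(A.T) + k * np.eye(k)  # SPD covariance
m_val = rng.randn(k)
x_val = rng.randn(5, k)

x = tt.dmatrix('x')
lp = logprob(x, tt.constant(m_val), tt.constant(S_val))
lp_val = theano.function([x], lp)(x_val)

ref = multivariate_normal.logpdf(x_val, mean=m_val, cov=S_val)
assert np.allclose(lp_val, ref)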
def build_theano_models(self, algo, algo_params):
    epsilon = 1e-6
    kl = lambda mu, sig: sig + mu**2 - TT.log(sig)
    X, y = TT.dmatrices('X', 'y')
    params = TT.dvector('params')
    a, b, c, l_F, F, l_FC, FC = self.unpack_params(params)
    sig2_n, sig_f = TT.exp(2 * a), TT.exp(b)
    l_FF = TT.dot(X, l_F) + l_FC
    FF = TT.concatenate((l_FF, TT.dot(X, F) + FC), 1)
    Phi = TT.concatenate((TT.cos(FF), TT.sin(FF)), 1)
    Phi = sig_f * TT.sqrt(2. / self.M) * Phi
    noise = TT.log(1 + TT.exp(c))
    PhiTPhi = TT.dot(Phi.T, Phi)
    A = PhiTPhi + (sig2_n + epsilon) * TT.identity_like(PhiTPhi)
    L = Tlin.cholesky(A)
    Li = Tlin.matrix_inverse(L)
    PhiTy = Phi.T.dot(y)
    beta = TT.dot(Li, PhiTy)
    alpha = TT.dot(Li.T, beta)
    mu_f = TT.dot(Phi, alpha)
    var_f = (TT.dot(Phi, Li.T)**2).sum(1)[:, None]
    dsp = noise * (var_f + 1)
    mu_l = TT.sum(TT.mean(l_F, axis=1))
    sig_l = TT.sum(TT.std(l_F, axis=1))
    mu_w = TT.sum(TT.mean(F, axis=1))
    sig_w = TT.sum(TT.std(F, axis=1))
    hermgauss = np.polynomial.hermite.hermgauss(30)
    herm_x = Ts(hermgauss[0])[None, None, :]
    herm_w = Ts(hermgauss[1] / np.sqrt(np.pi))[None, None, :]
    herm_f = TT.sqrt(2 * var_f[:, :, None]) * herm_x + mu_f[:, :, None]
    nlk = (0.5 * herm_f**2. - y[:, :, None] * herm_f) / dsp[:, :, None] + 0.5 * (
        TT.log(2 * np.pi * dsp[:, :, None]) + y[:, :, None]**2 / dsp[:, :, None])
    enll = herm_w * nlk
    nlml = 2 * TT.log(TT.diagonal(L)).sum() + 2 * enll.sum() + 1. / sig2_n * (
        (y**2).sum() - (beta**2).sum()) + 2 * (X.shape[0] - self.M) * a
    penalty = (kl(mu_w, sig_w) * self.M + kl(mu_l, sig_l) * self.S) / (self.S + self.M)
    cost = (nlml + penalty) / X.shape[0]
    grads = TT.grad(cost, params)
    updates = getattr(OPT, algo)(self.params, grads, **algo_params)
    updates = getattr(OPT, 'apply_nesterov_momentum')(updates, momentum=0.9)
    train_inputs = [X, y]
    train_outputs = [cost, alpha, Li]
    self.train_func = Tf(train_inputs, train_outputs,
                         givens=[(params, self.params)])
    self.train_iter_func = Tf(train_inputs, train_outputs,
                              givens=[(params, self.params)],
                              updates=updates)
    Xs, Li, alpha = TT.dmatrices('Xs', 'Li', 'alpha')
    l_FFs = TT.dot(Xs, l_F) + l_FC
    FFs = TT.concatenate((l_FFs, TT.dot(Xs, F) + FC), 1)
    Phis = TT.concatenate((TT.cos(FFs), TT.sin(FFs)), 1)
    Phis = sig_f * TT.sqrt(2. / self.M) * Phis
    mu_pred = TT.dot(Phis, alpha)
    std_pred = (noise * (1 + (TT.dot(Phis, Li.T)**2).sum(1)))**0.5
    pred_inputs = [Xs, alpha, Li]
    pred_outputs = [mu_pred, std_pred]
    self.pred_func = Tf(pred_inputs, pred_outputs,
                        givens=[(params, self.params)])
def get_mu_sigma_costs(hid):
    shp = hid.shape
    mu = hid.mean(0)
    sigma = T.dot(hid.T, hid) / shp[0]
    C_mu = T.sum(mu**2)
    C_sigma = T.diagonal(sigma - T.log(T.clip(sigma, 1e-15, 1)))
    C_sigma -= -T.ones_like(C_sigma)
    return C_mu, C_sigma.sum()  # trace(C_sigma)
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)
    ok = tt.all(tt.nlinalg.diag(chol_x) > 0)
    chol_x = tt.switch(ok, chol_x, tt.fill_diagonal(chol_x, 1))
    dz = tt.switch(ok, dz, floatX(1))

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tt.tril(mtx) - tt.diag(tt.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        solve = tt.slinalg.Solve(A_structure="upper_triangular")
        return solve(outer.T, solve(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tt.tril(s + s.T) - tt.diag(tt.diagonal(s))
    else:
        grad = tt.triu(s + s.T) - tt.diag(tt.diagonal(s))

    return [tt.switch(ok, grad, floatX(np.nan))]
def free_energy(self, v_sample):
    """
    Function to compute the free energy; it overrides the base free energy
    function (only the v_bias term differs here).

    :param v_sample: Sampling values of visible units
    """
    wx_b = T.dot(v_sample, self.W) + self.h_bias
    v_bias_term = 0.5 * T.dot((v_sample - self.v_bias),
                              (v_sample - self.v_bias).T)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    return -hidden_term - T.diagonal(v_bias_term)
def __call__(self, A, b, inference=False):
    # Jacobi iteration: split A = D + R, then x_{k+1} = D^{-1} (b - R x_k)
    dA = T.diagonal(A)
    D = T.diag(dA)
    R = A - D
    iD = T.diag(1.0 / dA)
    x = T.zeros_like(b)
    for i in range(self.iterations):
        x = iD.dot(b - R.dot(x))
    return x
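# Usage sketch for the Jacobi iteration above, restated as a free function
# so it runs standalone (jacobi_solve is an illustrative name, not from the
# original source). Convergence requires A to be diagonally dominant.
import numpy as np
import theano
import theano.tensor as T

def jacobi_solve(A, b, iterations=50):
    # x_{k+1} = D^{-1} (b - R x_k), with A = D + R
    dA = T.diagonal(A)
    R = A - T.diag(dA)
    iD = T.diag(1.0 / dA)
    x = T.zeros_like(b)
    for _ in range(iterations):
        x = iD.dot(b - R.dot(x))
    return x

A = T.dmatrix('A')
b = T.dvector('b')
solve = theano.function([A, b], jacobi_solve(A, b))

A_val = np.array([[4.0, 1.0], [2.0, 5.0]])
b_val = np.array([1.0, 2.0])
print(solve(A_val, b_val))            # close to:
print(np.linalg.solve(A_val, b_val))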
def rbf_mmd2_and_ratio(X, Y, sigma=0, biased=True):
    gamma = 1 / (2 * sigma**2)

    XX = T.dot(X, X.T)
    XY = T.dot(X, Y.T)
    YY = T.dot(Y, Y.T)

    X_sqnorms = T.diagonal(XX)
    Y_sqnorms = T.diagonal(YY)

    K_XY = T.exp(-gamma * (
        -2 * XY + X_sqnorms[:, np.newaxis] + Y_sqnorms[np.newaxis, :]))
    K_XX = T.exp(-gamma * (
        -2 * XX + X_sqnorms[:, np.newaxis] + X_sqnorms[np.newaxis, :]))
    K_YY = T.exp(-gamma * (
        -2 * YY + Y_sqnorms[:, np.newaxis] + Y_sqnorms[np.newaxis, :]))

    return _mmd2_and_ratio(K_XX, K_XY, K_YY, unit_diagonal=True,
                           biased=biased)
def fill_diagonal(x, val):
    """Fills in the diagonal of a tensor."""
    if val.size.eval() == 1:
        val = T.extra_ops.repeat(val, x.eval().shape[0])
    # adapted from the following theano-users topic:
    # https://groups.google.com/forum/#!topic/theano-users/zYD-gsddIYs
    orig_diag = T.diag(T.diagonal(x))
    new_diag = T.diag(val)
    y = x - orig_diag + new_diag
    return y
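# Usage sketch for the graph-level fill_diagonal above (my wiring). Because
# the helper calls .eval() on val.size (and on x in the scalar-val branch),
# it only works when those can be evaluated at graph-construction time, so
# val is a constant here. For the scalar case, Theano also ships a compiled
# op, theano.tensor.extra_ops.fill_diagonal (worth verifying on your
# Theano version).
import numpy as np
import theano
import theano.tensor as T

x = T.dmatrix('x')
val = T.constant(np.array([1.0, 2.0, 3.0]))
f = theano.function([x], fill_diagonal(x, val))
print(f(np.zeros((3, 3))))  # matrix with diagonal (1, 2, 3)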
def compile_theano_funcs(self, opt_algo, opt_params, dropout):
    self.compiled_funcs = {}

    # Compile Train & Optimization Function
    eps = 1e-5
    params = Tt.vector('params')
    X, Y = Tt.matrix('X'), Tt.matrix('Y')
    sig2, F, M, V = self.feature_maps(X, params)
    EPhi = F[-1]
    EPhiPhiT = Tt.dot(EPhi, Tt.transpose(EPhi))
    A = EPhiPhiT + (sig2 + eps) * Tt.identity_like(EPhiPhiT)
    L = Tlin.cholesky(A)
    Linv = Tlin.matrix_inverse(L)
    YPhiT = Tt.dot(Y, Tt.transpose(EPhi))
    beta = Tt.dot(YPhiT, Tt.transpose(Linv))
    alpha = Tt.dot(beta, Linv)
    mu_F = Tt.dot(alpha, EPhi)
    GOF = .5 / sig2 * Tt.sum(Tt.sum(Tt.dot(Y, (Y - mu_F).T)))
    REG = (Tt.sum(Tt.log(Tt.diagonal(L)))
           + (self.N - self.D[-2]) / 2 * Tt.log(sig2))
    REG *= self.D[-1]
    KL = 0
    for h in range(self.H):
        KL += Tt.sum(Tt.sum(M[h]**2) + Tt.sum(V[h] - Tt.log(V[h] + eps)))
        KL -= self.D[h + 1] * self.D[h + 2] // 2
    obj = debug('obj', GOF + REG + KL)
    self.compiled_funcs['debug'] = Tf([X, Y], [obj],
                                      givens=[(params, self.params)])
    grads = Tt.grad(obj, params)
    updates = {self.params: grads}
    updates = getattr(Optimizer, opt_algo)(updates, **opt_params)
    updates = getattr(Optimizer, 'nesterov')(updates, momentum=0.9)
    train_inputs = [X, Y]
    train_outputs = [obj, alpha, Linv, mu_F]
    self.compiled_funcs['opt'] = Tf(train_inputs, train_outputs,
                                    givens=[(params, self.params)],
                                    updates=updates)
    self.compiled_funcs['train'] = Tf(train_inputs, train_outputs,
                                      givens=[(params, self.params)])

    # Compile Predict Function
    Linv, alpha = Tt.matrix('Linv'), Tt.matrix('alpha')
    Xs = Tt.matrix('Xs')
    sig2, Fs, _, _ = self.feature_maps(Xs, params)
    EPhis = Fs[-1]
    mu_Fs = Tt.dot(alpha, EPhis)
    std_Fs = ((sig2 * (1 + (Tt.dot(Linv, EPhis)**2).sum(0)))**0.5)[:, None]
    pred_inputs = [Xs, alpha, Linv]
    pred_outputs = [mu_Fs, std_Fs]
    self.compiled_funcs['pred'] = Tf(pred_inputs, pred_outputs,
                                     givens=[(params, self.params)])
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))]
    else:
        return [tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))]
def prepare_model(x_train, y_train, batchsize, params=None):
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')
    same_cluster_indices_matrix = T.matrix('same_clusters')
    diff_cluster_indices_matrix = T.matrix('diff_clusters')

    # prepare network
    print('\nPreparing the model with primary hidden layer size %d...'
          % HOURGLASS_LAYER_SIZE)
    print('X-shape = %d, Num_classes = %d, num_samples = %d'
          % (x_train[0].shape[0], max(y_train), len(x_train)))
    representation_layer, network = build_args_nn(x_train, y_train,
                                                  batchsize, input_var)

    # loss stuff
    prediction = lasagne.layers.get_output(network)
    get_representations = lasagne.layers.get_output(
        representation_layer, inputs=input_var, deterministic=True)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    if LAMBDA1 == LAMBDA2 == 0.0:
        loss = loss.mean()
    else:
        representations = get_representations
        dot_prods = T.dot(representations, representations.T)  # X times X.T
        diag = T.sqrt(T.diagonal(dot_prods))  # sqrt(||ri||^2) = ||ri||
        norms = T.outer(diag, diag.T)
        # d(a,b) = 1/2 * (1 - dot(a,b) / (||a|| * ||b||))
        distances = 0.5 * (1 - (dot_prods * (1. / norms)))
        # We want the first sum to be as close to zero as possible, so we
        # add it to the loss. We want the second sum to be as close to 1 as
        # possible, so we want LAMBDA2 * (1 - sum2) to be as close to zero
        # as possible, and add that difference to the overall loss.
        loss = loss.mean() \
            + (LAMBDA1 * T.sum(same_cluster_indices_matrix * distances)) \
            + (LAMBDA2 * (1.0 - T.sum(diff_cluster_indices_matrix * distances)))

    # for loading/building the parameters
    if not params:
        params = lasagne.layers.get_all_params(network, trainable=True)
    else:
        lasagne.layers.set_all_param_values(network, params)
        params = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adam(loss, params, learning_rate=LEARNING_RATE)

    # the final keys
    train_function = theano.function(
        [input_var, target_var, same_cluster_indices_matrix,
         diff_cluster_indices_matrix],
        loss, updates=updates, allow_input_downcast=True,
        on_unused_input='ignore')
    convert_to_numpy_function = theano.function(
        [input_var], get_representations, allow_input_downcast=True)
    # theano.printing.debugprint(train_function.maker.fgraph.outputs[0])
    return network, train_function, convert_to_numpy_function
def plot_qXphi(signal, n=int(1e5), axis=None):
    axis, show_it, lim = axis_and_show(axis)
    if axis is None:
        return
    en = np.mean(np.square(signal.val()), axis=1)
    nphi = np.linalg.norm(signal.layer.phi[0].get_value(), axis=0)
    Q = T.diagonal(signal.layer.Q).eval()
    pen, = axis.plot(en[:n], 's-')
    pphi, = axis.plot(nphi[:n], '*-')
    pq, = axis.plot(Q[:n], 'x-')
    axis.legend([pen, pphi, pq], ['E{X^2}', '|phi|', 'q_i'])
    lim([0.0, 5])
    if show_it:
        axis.show()
def cca_loss(y1, y2, lamda=0.1):
    '''
    Approximated CCA loss of two views
    '''
    y1_mean = T.mean(y1, axis=0)
    y1_centered = y1 - y1_mean
    y2_mean = T.mean(y2, axis=0)
    y2_centered = y2 - y2_mean

    corr_nr = T.sum(y1_centered * y2_centered, axis=0)
    corr_dr1 = T.sqrt(T.sum(y1_centered * y1_centered, axis=0) + 1e-8)
    corr_dr2 = T.sqrt(T.sum(y2_centered * y2_centered, axis=0) + 1e-8)
    corr_dr = corr_dr1 * corr_dr2
    corr = corr_nr / corr_dr

    # C12 = T.dot(y1_centered.T, y2_centered)  # / y1_centered.shape[0]
    # l12 = 0.5 * ((C12 ** 2).sum() - (T.diagonal(C12) ** 2).sum())
    C11 = T.dot(y1_centered.T, y1_centered) / y1_centered.shape[0]
    l11 = 0.5 * ((C11**2).sum() - (T.diagonal(C11)**2).sum())
    C22 = T.dot(y2_centered.T, y2_centered) / y1_centered.shape[0]
    l22 = 0.5 * ((C22**2).sum() - (T.diagonal(C22)**2).sum())

    # return -T.sum(corr) + lamda * (l11 + l22 + l12)
    return -T.sum(corr) + lamda * (l11 + l22)
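# Usage sketch for cca_loss (my wiring; shapes are illustrative). The loss
# rewards per-dimension correlation between the two views and penalizes
# off-diagonal covariance within each view.
import numpy as np
import theano
import theano.tensor as T

y1, y2 = T.dmatrices('y1', 'y2')
loss_fn = theano.function([y1, y2], cca_loss(y1, y2, lamda=0.1))

rng = np.random.RandomState(0)
a = rng.randn(256, 8)
b = a + 0.1 * rng.randn(256, 8)      # strongly correlated second view
c = rng.randn(256, 8)                # unrelated second view
print(loss_fn(a, b), loss_fn(a, c))  # the correlated pair scores lower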
def lerp(old, new, min_tau=0.0, en=None):
    """
    Return new interpolated value and a relative difference
    """
    diff = T.mean(T.sqr(new) - T.sqr(old), axis=1, keepdims=True)
    rel_diff = diff / (T.mean(T.sqr(old), axis=1, keepdims=True) + 1e-5)
    t = rel_diff * 20.
    t = T.where(t < 5, 5, t)
    t = T.where(t > 100, 100, t)
    t = t + min_tau

    if en is not None:
        lmbd = T.diagonal(en).dimshuffle(0, 'x') * (1. / t)
    else:
        lmbd = 1. / t
    return ((1 - lmbd) * old + lmbd * new, t, rel_diff)
def __call__(self, model, X):
    # Corrupt X
    corrupted_inputs = model.corruptor(X)
    hidden = model.encode(corrupted_inputs)
    ex_corrupted_hidden = (1 - self.q) * hidden
    ex_recon = model.decode(ex_corrupted_hidden)

    # Trace term depends on variance
    # var_cost = (tensor.diagonal(theano.dot(model.w_prime, model.w_prime.T))
    #             * (self.q * (1 - self.q)**2 * hidden**2).mean(axis=0)).sum()
    var_cost = (
        tensor.diagonal(theano.dot(model.w_prime, model.w_prime.T))
        * (self.q * (1 - self.q) * hidden**2).mean(axis=0)
    ).sum()
    recon_cost = ((ex_recon - X)**2).sum(axis=1).mean()
    cost = var_cost + recon_cost
    return cost
def __init__(self, X_u, X_v, y, Uinit, Vinit, loss=loss_squared,
             regularization=(1, 1),
             reg_type=(lambda x: T.mean(x**2), lambda x: T.mean(x**2))):
    self.U = theano.shared(Uinit)
    self.V = theano.shared(Vinit)
    self.X_u = X_u
    self.X_v = X_v
    self.y = y
    self.prediction_matrix = X_u.dot(self.U).dot((X_v.dot(self.V)).T)
    self.prediction_pairs = T.diagonal(self.prediction_matrix)
    self.cost = loss(y, self.prediction_pairs)
    self.cost += regularization[0] * reg_type[0](self.U)
    self.cost += regularization[1] * reg_type[1](self.V)
def step(input_n, hid_prevprev, hid_previous, *args):
    # Compute the hidden-to-hidden activation
    hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous, **kwargs)

    # If the dot product is precomputed then add it, otherwise
    # calculate the input_to_hidden values and add them
    if self.precompute_input:
        hid_pre += input_n
    else:
        hid_pre += helper.get_output(self.input_to_hidden, input_n, **kwargs)

    # Clip gradients
    if self.grad_clipping:
        hid_pre = theano.gradient.grad_clip(
            hid_pre, -self.grad_clipping, self.grad_clipping)

    hid_pre += self.gamma * hid_prevprev * T.clip(
        T.tile(
            T.reshape(T.diagonal(T.dot(hid_prevprev, hid_previous.T)),
                      (1, hid_previous.shape[0])),
            (hid_previous.shape[1], 1)).T,
        0.0, 100.0)

    return self.nonlinearity(hid_pre)
def marginalize_over_v_z(self, h):
    # energy = \sum_{i=1}^{|h|} h_i*b_i - \beta * ln(1 + e^{b_i})
    if self.penalty == "softplus_bi":
        energy = (h * self.b).T - self.beta * T.log(1 + T.exp(self.b))[:, None]
    elif self.penalty == "softplus0":
        energy = (h * self.b).T - self.beta * T.log(1 + T.exp(0))[:, None]
    else:
        raise NameError("Invalid penalty term")

    energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
    energy = T.sum(energy, axis=0, keepdims=True).T

    ener = T.tensordot(h, self.W, axes=0)
    ener = T.diagonal(ener, axis1=1, axis2=2)
    ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
    ener = T.sum(ener, axis=2) + self.c[None, :]
    ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)
    return -(energy + ener)
def __mapper(self, train_example):
    pos_triple, neg_triple = train_example[0:3], train_example[3:]
    unconstrained_objective = self.margin \
        - self.__objective_triple(neg_triple) \
        + self.__objective_triple(pos_triple)
    entity_normalize = T.sum(T.square(self.Entity.norm(2, axis=0)) - 1)
    relation_normalize = T.square(self.Relation.norm(2, axis=0))
    surface_normalize = T.square(
        T.diagonal(T.dot(self.RelationNormal.T, self.Relation))) / relation_normalize
    surface_normalize = T.sum(surface_normalize - self.epsilon ** 2)
    unconstrained_objective_positive = ifelse(
        T.gt(unconstrained_objective, theano.shared(0.0)),
        unconstrained_objective, theano.shared(0.0))
    entity_normalize_positive = ifelse(
        T.gt(entity_normalize, theano.shared(0.0)),
        entity_normalize, theano.shared(0.0))
    surface_normalize_positive = ifelse(
        T.gt(surface_normalize, theano.shared(0.0)),
        surface_normalize, theano.shared(0.0))
    return unconstrained_objective_positive + self.regularize_factor \
        * (surface_normalize_positive + entity_normalize_positive)
def get_mapping(self, pr):
    X = T.transpose(self.input)
    # X = [X; ones(1, size(X, 2))];
    X = T.concatenate([X, T.ones((1, X.shape[1]))], axis=0)
    # d = size(X, 1);
    d = X.shape[0]
    # q = [ones(d-1, 1) .* (1-p); 1];
    q = T.concatenate([T.ones((d - 1, 1)) * (1 - pr), T.ones((1, 1))], axis=0)
    # S = X * X';
    S = T.dot(X, X.T)
    # Q = S .* (q * q');
    Q = S * T.dot(q, q.T)
    # Q(1:d+1:end) = q .* diag(S);
    Q -= (T.eye(Q.shape[0]) * Q.diagonal())
    Q += T.eye(Q.shape[0]) * T.diagonal(q * S.diagonal())
    # P = S .* repmat(q', d, 1);
    P = S * T.extra_ops.repeat(q.T, d, 0)
    # W = P(1:end-1, :) / (Q + 1e-5 * eye(d));
    A = Q + 10**-5 * T.eye(d)
    B = P
    self.W = T.slinalg.solve(A.T, B.T)[:-1, :]
    self.Xh = T.tanh(T.dot(self.W, X)).T
    return self.W, self.Xh
def get_gaussian_likelihood(comps, X_, mu_, S_, w_, feat_dim):
    _2PI = 2. * np.pi
    comps = T.cast(comps, 'int32')
    mu = mu_[comps, :]
    w = w_[comps]
    S = S_[comps, :, :]

    mu = T.cast(mu, "float32")
    w = T.cast(w, "float32")
    S = T.cast(S, "float32")
    X = T.cast(X_, "float32")
    feat_dim = T.cast(feat_dim, "float32")

    residuals_t = X - mu
    maha_t = T.diagonal(
        residuals_t.dot(T.nlinalg.matrix_inverse(S)).dot(residuals_t.T))
    likelihood_t = (
        T.nlinalg.det(_2PI * S) ** -0.5 * (T.exp(-0.5 * maha_t))
    )
    likelihood_t += feat_dim * np.float32(0.)
    return likelihood_t * w
def dot(x, y):
    return T.sum(T.diagonal(T.dot(x, T.transpose(y))))
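# Since sum_i (X Y^T)_ii = sum_ij X_ij * Y_ij, the trace form above is just
# the Frobenius inner product; the elementwise form computes the same value
# without materializing the full matrix product (a sketch, not from the
# original source):
import numpy as np
import theano
import theano.tensor as T

x, y = T.dmatrices('x', 'y')
trace_form = T.sum(T.diagonal(T.dot(x, T.transpose(y))))  # as in dot() above
frob_form = T.sum(x * y)                                  # no n-by-n product

f = theano.function([x, y], [trace_form, frob_form])
a = np.random.randn(4, 6)
b = np.random.randn(4, 6)
t, fr = f(a, b)
assert np.allclose(t, fr)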
def tril_and_halve_diagonal(mtx):
    """Extracts lower triangle of square matrix and halves diagonal."""
    return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.0)
def OptimalGaussian(x_train, y_train, Regression=True, Classification=False,
                    bias=False, n_iter=5, alpha=0.01, minibatch=False):
    '''
    inputs
        x_train: training features
        y_train: response variable
        n_iter: # of iterations for SGD
        alpha: strength of L2 penalty (default penalty for now)

    outputs
        Gaussian Node: dictionary with Node parameters and a predict method
    '''
    rng = numpy.random
    feats = len(x_train[0, :])
    N = len(x_train[:, 0])
    D = [x_train, y_train]
    training_steps = n_iter
    # print("training steps: ", training_steps)
    # print("penalty strength: ", alpha)
    # print("Uses bias: ", bias)

    # Declare Theano symbolic variables
    x = T.matrix("x")
    y = T.vector("y")
    w = theano.shared(rng.uniform(low=-0.25, high=0.25, size=feats), name="w")
    b = theano.shared(abs(rng.randn(1)[0]), name="b")
    a = theano.shared(abs(rng.randn(1)[0]), name="a")
    rep = theano.shared(numpy.asarray([1] * N), name="rep")
    # print("Initialize node as:")
    # print(w.get_value(), b.get_value(), a.get_value())

    # Construct Theano expression graph
    W = T.outer(rep, w)
    if bias:
        p_1 = a * T.exp(-0.5 / (b**2) * T.dot((x - w).T, (x - w)))
    else:
        p_1 = a * T.exp(-0.5 / (1**2) * T.diagonal(T.dot((x - W), (x - W).T)))
    prediction = p_1 > 0.5

    if Regression:
        xent = 0.5 * (y - p_1)**2
        if alpha == 0:
            cost = xent.mean()  # The cost to minimize
        else:
            cost = xent.mean() + alpha * ((w**2).sum())
    if bias:
        gw, gb, ga = T.grad(cost, [w, b, a])
    else:
        gw, ga = T.grad(cost, [w, a])  # Compute the gradient of the cost

    # Compile
    Node = {}
    Node['Path'] = {}
    NodePath = Node['Path']
    if bias:
        train = theano.function(
            inputs=[x, y], outputs=[prediction, xent],
            updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb), (a, a - 0.1 * ga)))
    else:
        train = theano.function(
            inputs=[x, y], outputs=[prediction, xent],
            updates=((w, w - 0.1 * gw), (a, a - 0.1 * ga)))
    predict = theano.function(inputs=[x], outputs=p_1)

    # Train
    for i in range(training_steps):
        if minibatch:
            batch_split = train_test_split(x_train, y_train, test_size=0.2)
            _, D[0], _, D[1] = batch_split
            # IPython.embed()
            pred, err = train(D[0], D[1])
        elif not minibatch:
            pred, err = train(D[0], D[1])
        NodePath[str(i)] = {}
        NodePath[str(i)]['w'] = w.get_value()
        NodePath[str(i)]['b'] = b.get_value()
        NodePath[str(i)]['a'] = a.get_value()

    Node['w'] = w.get_value()
    Node['b'] = b.get_value()
    Node['a'] = a.get_value()
    Node['predict'] = predict
    return Node
def no_linear_dependencies_constraint(W, k):
    # NOTE: the `k` argument is unused in this implementation.
    M = T.dot(W, W.T)
    M = M - T.diag(T.diagonal(M))
    cost = -T.sum(T.log(1.0 - M ** 2)) / 2
    return cost
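# The penalty diverges as any two rows of W become (anti-)parallel: for
# unit-norm rows, the off-diagonal entries of W W^T are cosine similarities,
# so 1 - M**2 -> 0 and the log blows up. A sketch on row-normalized weights
# (the normalization is my assumption; the log argument must stay positive):
import numpy as np
import theano
import theano.tensor as T

W = T.dmatrix('W')
cost_fn = theano.function([W], no_linear_dependencies_constraint(W, k=None))

rng = np.random.RandomState(0)
A = rng.randn(5, 20)
A /= np.linalg.norm(A, axis=1, keepdims=True)  # unit rows, |cos| < 1

B = A.copy()
B[1] = 0.99 * B[0] + 0.01 * B[1]               # two nearly parallel rows
B /= np.linalg.norm(B, axis=1, keepdims=True)

print(cost_fn(A), cost_fn(B))                  # B pays a much larger penalty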
def test_jax_basic():
    x = tt.matrix("x")
    y = tt.matrix("y")
    b = tt.vector("b")

    # `ScalarOp`
    z = tt.cosh(x**2 + y / 3.0)

    # `[Inc]Subtensor`
    out = tt.set_subtensor(z[0], -10.0)
    out = tt.inc_subtensor(out[0, 1], 2.0)
    out = out[:5, :3]

    out_fg = theano.gof.FunctionGraph([x, y], [out])

    test_input_vals = [
        np.tile(np.arange(10), (10, 1)).astype(theano.config.floatX),
        np.tile(np.arange(10, 20), (10, 1)).astype(theano.config.floatX),
    ]
    (jax_res,) = compare_jax_and_py(out_fg, test_input_vals)

    # Confirm that the `Subtensor` slice operations are correct
    assert jax_res.shape == (5, 3)

    # Confirm that the `IncSubtensor` operations are correct
    assert jax_res[0, 0] == -10.0
    assert jax_res[0, 1] == -8.0

    out = tt.clip(x, y, 5)
    out_fg = theano.gof.FunctionGraph([x, y], [out])
    compare_jax_and_py(out_fg, test_input_vals)

    out = tt.diagonal(x, 0)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [np.arange(10 * 10).reshape((10, 10)).astype(theano.config.floatX)])

    out = tt.slinalg.cholesky(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [(np.eye(10) + np.random.randn(10, 10) * 0.01).astype(
            theano.config.floatX)],
    )

    # not sure why this isn't working yet with lower=False
    out = tt.slinalg.Cholesky(lower=False)(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [(np.eye(10) + np.random.randn(10, 10) * 0.01).astype(
            theano.config.floatX)],
    )

    out = tt.slinalg.solve(x, b)
    out_fg = theano.gof.FunctionGraph([x, b], [out])
    compare_jax_and_py(
        out_fg,
        [
            np.eye(10).astype(theano.config.floatX),
            np.arange(10).astype(theano.config.floatX),
        ],
    )

    out = tt.nlinalg.alloc_diag(b)
    out_fg = theano.gof.FunctionGraph([b], [out])
    compare_jax_and_py(out_fg, [np.arange(10).astype(theano.config.floatX)])

    out = tt.nlinalg.det(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [np.arange(10 * 10).reshape((10, 10)).astype(theano.config.floatX)])

    out = tt.nlinalg.matrix_inverse(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [(np.eye(10) + np.random.randn(10, 10) * 0.01).astype(
            theano.config.floatX)],
    )
def free_energy_grbm(self, v_sample):
    ''' Function to compute the free energy '''
    wx_b = T.dot(v_sample, self.W) + self.hbias
    vbias_term = 0.5 * T.dot((v_sample - self.vbias),
                             (v_sample - self.vbias).T)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    return -hidden_term - T.diagonal(vbias_term)