def loss(lv1, lv2):
    """ Contrastive cosine distance optimization target """
    n = lv1.shape[0]

    # direction 1
    D = lv1.dot(lv2.T)
    d = D.diagonal().reshape((-1, 1))
    M = T.identity_like(D)
    O = D[(M <= 0).nonzero()].reshape((n, n - 1))
    L = gamma - d
    L = T.repeat(L, n - 1, 1)
    L += O
    L = T.clip(L, 0, 1000)
    loss = L.mean()

    # direction 2
    if symmetric:
        D = lv2.dot(lv1.T)
        d = D.diagonal().reshape((-1, 1))
        M = T.identity_like(D)
        O = D[(M <= 0).nonzero()].reshape((n, n - 1))
        L = gamma - d
        L = T.repeat(L, n - 1, 1)
        L += O
        L = T.clip(L, 0, 1000)
        loss += L.mean()

    return weight * loss
def _get_orthogonal_matrix_inv(self):
    # A: skew-symmetric matrix
    # O: orthogonal matrix, built via the Cayley transform O = (I + A)(I - A)^-1
    # 1. create upper triangular matrix using self.decorr
    # 2. create lower triangular matrix using -self.decorr
    # 3. add them up to get A and apply the Cayley transform
    n = self.ortho_n
    num_triu_entries = (n - 1) * n // 2
    triu_index_matrix = np.zeros([n, n], dtype=np.int32)
    triu_index_matrix[np.triu_indices(n, 1)] = np.arange(num_triu_entries)
    triu_index_matrix[np.triu_indices(n, 1)[::-1]] = np.arange(num_triu_entries)
    # symmetric matrix whose diagonal values are the first element of self.decorr
    triu_mat = self.decorr[triu_index_matrix]
    # set diagonal values to zero
    triu_mat = tt.extra_ops.fill_diagonal(triu_mat, 0)
    # negate the lower triangle so the result is skew-symmetric
    triu_mat = tt.set_subtensor(triu_mat[np.triu_indices(n, 1)[::-1]],
                                triu_mat[np.triu_indices(n, 1)[::-1]] * -1)
    part1 = tt.identity_like(triu_mat) + triu_mat
    part2 = tt.nlinalg.MatrixInverse()(tt.identity_like(triu_mat) - triu_mat)
    orth_mat = K.dot(part1, part2)
    return orth_mat
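# A minimal numpy sketch (not part of the layer above; sizes and seed are arbitrary
# assumptions) checking that the Cayley transform used here, O = (I + A)(I - A)^-1,
# really is orthogonal whenever A is skew-symmetric.
import numpy as np

n = 5
rng = np.random.RandomState(0)
upper = np.triu(rng.randn(n, n), 1)
A = upper - upper.T                          # skew-symmetric: A.T == -A
O = (np.eye(n) + A).dot(np.linalg.inv(np.eye(n) - A))
print(np.allclose(O.dot(O.T), np.eye(n)))    # True: O is orthogonal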
def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi, K_MM):
    cholSigInv = sT.cholesky(EPhiTPhi + sn_trf * T.identity_like(K_MM))
    cholK_MM = sT.cholesky(K_MM + 1e-6 * T.identity_like(K_MM))
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    invCholK_MM = sT.matrix_inverse(cholK_MM)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    InvK_MM = invCholK_MM.T.dot(invCholK_MM)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv, cholK_MM, InvK_MM
def exact_proj_sqrtm(x, x_test, gp_params, indep_noise, batch_size):
    Ktt = cov_mat(x_test, x_test, gp_params)
    Kxt = cov_mat(x, x_test, gp_params)
    Kxx = cov_mat(x, x, gp_params)
    Kxx = Kxx + indep_noise * T.identity_like(Kxx)
    KxtT_Kxxinv = Kxt.T.dot(T.nlinalg.matrix_inverse(Kxx))
    K = Ktt - KxtT_Kxxinv.dot(Kxt)
    K = K + 1e-10 * T.identity_like(K)
    R = theano_sqrtm(K)
    eps = rng.normal(size=(batch_size, x_test.shape[0]))
    return R.dot(eps.T).T
def add_withening_regularization(hidden_x, hidden_y_reversed):
    hooks_temp = {}
    loss_withen = T.constant(0)
    for x, y in zip(hidden_x, hidden_y_reversed):
        x_value = lasagne.layers.get_output(x, moving_avg_hooks=hooks_temp)
        y_value = lasagne.layers.get_output(y, moving_avg_hooks=hooks_temp)
        cov_x = T.dot(x_value.T, x_value) / x_value.shape[0]
        cov_y = T.dot(y_value.T, y_value) / y_value.shape[0]
        loss_withen += Params.WITHEN_REG_X * T.mean(T.sum(abs(cov_x - T.identity_like(cov_x)), axis=0))
        loss_withen += Params.WITHEN_REG_Y * T.mean(T.sum(abs(cov_y - T.identity_like(cov_y)), axis=0))
    return loss_withen
def log_joint_scan_fn(n, llik, y, cov, mask):
    partial_cov = T.outer(mask[:, n], mask[:, n]) * cov + (1 - mask[:, n]) * T.identity_like(cov)
    llik += (-1 / 2.0) * T.log(T.nlinalg.Det()(partial_cov)) \
        - (1 / 2.0) * T.dot(y[:, n].T, T.dot(T.nlinalg.MatrixInverse()(partial_cov), y[:, n]))
    return llik
def chi2_test_statistic(M, Obs, K, num_M, num_Obs):
    # Getting frequencies from observations
    Ns = T.dot(Obs, T.ones((K, 1)))
    p = Obs / Ns

    # Find the zeros so we can deal with them later
    pZEROs = T.eq(p, 0)
    mZEROs = T.eq(M, 0)

    # log probabilities, with -INF as log(0)
    lnM = T.log(M + mZEROs) - INF * mZEROs
    lnp = T.log(p + pZEROs) - INF * pZEROs

    # Using Kronecker products so every row of M hits every row of p in the difference klnP - klnM
    O_ones = T.ones((num_Obs, 1))
    M_ones = T.ones((num_M, 1))
    klnM = kron(lnM, O_ones)
    klnP = kron(M_ones, lnp)
    klnP_M = klnP - klnM
    kObs = kron(M_ones, Obs)

    G = 2.0 * T.dot(klnP_M, kObs.T)
    G = G * T.identity_like(G)
    G = T.dot(G, T.ones((num_M * num_Obs, 1)))
    G = T.reshape(G, (num_M, num_Obs))

    # The following quotient improves the convergence to chi^2 by an order of magnitude
    # source: http://en.wikipedia.org/wiki/Multinomial_test
    # numerator = T.dot(- 1.0/(M + 0.01), T.ones((K,1))) - T.ones((num_M,1))
    # q1 = T.ones((num_M, num_Obs)) + T.dot(numerator, 1.0/Ns.T/6.0)/(K-1.0)

    return G  # /q1
def exact_post_mean(x, x_test, gp_params, indep_noise, y):
    Kxt = cov_mat(x, x_test, gp_params)
    Kxx = cov_mat(x, x, gp_params)
    Kxx = Kxx + indep_noise * T.identity_like(Kxx)
    KxtT_Kxxinv = Kxt.T.dot(T.nlinalg.matrix_inverse(Kxx))
    mu = KxtT_Kxxinv.dot(y)
    return mu
def __init__(self, n_comp=10, verbose=False):
    # Theano initialization
    self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
    self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

    T_p_x_white = T.fmatrix()
    T_lrate = T.fscalar()
    T_block = T.fscalar()

    T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(self.T_bias, 1)
    T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))
    T_out = self.T_weights + T_lrate * T.dot(
        T_block * T.identity_like(self.T_weights) + T.dot(T_logit, T.transpose(T_unmixed)),
        self.T_weights)
    T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1, 1))
    T_max_w = T.max(self.T_weights)
    T_isnan = T.any(T.isnan(self.T_weights))

    self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                    [T_max_w, T_isnan],
                                    updates=[(self.T_weights, T_out),
                                             (self.T_bias, T_bias_out)],
                                    allow_input_downcast=True)

    T_matrix = T.fmatrix()
    T_cov = T.dot(T_matrix, T.transpose(T_matrix)) / T_block
    self.cov_fun = theano.function([T_matrix, T_block], T_cov,
                                   allow_input_downcast=True)

    self.loading = None
    self.sources = None
    self.weights = None
    self.n_comp = n_comp
    self.verbose = verbose
def get_output_for(self, inputs, **kwargs):
    """Compute diffusion convolution of inputs."""
    A = inputs[0]
    X = inputs[1]

    # Normalize by degree.
    A = A / (T.sum(A, 0) + 1.0)

    Apow_list = [T.identity_like(A)]
    for i in range(1, self.parameters.num_hops + 1):
        Apow_list.append(A.dot(Apow_list[-1]))
    Apow = T.stack(Apow_list)

    Apow_dot_X = T.dot(Apow, X)
    Apow_dot_X_times_W = Apow_dot_X * self.W

    out = self.nonlinearity(
        T.mean(
            T.reshape(T.mean(Apow_dot_X_times_W, 1),
                      (1, (self.parameters.num_hops + 1), self.num_features)),
            2))
    return out
def build(self, loss1="crossentropy", loss2="mse", optimizer="rmsprop",
          lr=0.01, rho=0.9, epsilon=1e-6):
    self.loss1 = losses.get(loss1)
    self.loss2 = losses.get(loss2)
    optim = optimizers.get(optimizer, inst=False)
    if optim.__name__ == "RMSprop":
        self.optimizer = optim(lr=lr, rho=rho, epsilon=epsilon)
    elif optim.__name__ == "Adagrad":
        self.optimizer = optim(lr=lr, epsilon=epsilon)
    else:
        self.optimizer = optim(lr=lr)

    # get input to model
    self.X_c = TT.fmatrix(name="X_c")  # n_batches*input_dim
    # output label
    self.Y = TT.matrix(dtype=theano.config.floatX, name="Y")  # n_batches*nc

    self.X_recon, self.Y_pred, self.Y_class, self.ave_emb, self.group_ids = \
        self.get_output()  # Y_pred: n_batches*nc

    # prediction_loss + reconstruction_loss + reg_loss
    train_loss_pred = self.get_loss1(self.Y, self.Y_pred) + \
        self.get_loss2(self.X_c, self.X_recon)
    reg1_loss = TT.sqr(self.W_g).mean() + TT.sqr(self.W_enc).mean() + TT.sqr(self.W_dec).mean()
    Wg_norm, _ = theano.scan(lambda row: row / LA.norm(row), sequences=[self.W_g])
    inter_Wg = TT.dot(Wg_norm, TT.transpose(Wg_norm))
    reg2_loss = self.get_loss2(inter_Wg, TT.identity_like(inter_Wg))
    train_loss = train_loss_pred + 0.1 * reg1_loss + 0.0001 * reg2_loss

    updates = self.optimizer.get_updates(self.params, cost=train_loss)

    self.grad_h = theano.function(inputs=[self.X_c, self.Y],
                                  on_unused_input='warn',
                                  outputs=optimizers.get_gradients(train_loss, self.X_c),
                                  allow_input_downcast=True)
    self.train = theano.function(inputs=[self.X_c, self.Y],
                                 on_unused_input='warn',
                                 outputs=train_loss_pred,
                                 updates=updates,
                                 allow_input_downcast=True,
                                 mode=None)
    self.predict = theano.function(inputs=[self.X_c],
                                   on_unused_input='warn',
                                   outputs=[self.Y_pred, self.Y_class],
                                   allow_input_downcast=True)
    self.get_emb = theano.function(inputs=[self.X_c],
                                   on_unused_input='warn',
                                   outputs=[self.ave_emb, self.group_ids],
                                   allow_input_downcast=True,
                                   mode=None)
def MVNormalScan(n, is_observed_matrix, covariance_matrix_mvn_scan, w_mvn_scan,
                 zY, zK, true_full_matrix):
    # construct is_unobserved_matrix, a vector of 1s and 0s where the ith coord is a 1
    # if we haven't seen the ith coord of y_n
    is_unobserved_matrix = -(is_observed_matrix[:, n] - T.ones(D))

    # construct covariance of the observed entries where the rows/columns with nothing
    # have a 1 on the diag (so invertible)
    sigma_observed = T.outer(is_observed_matrix[:, n], is_observed_matrix[:, n]) * covariance_matrix_mvn_scan \
        + (is_unobserved_matrix * T.identity_like(covariance_matrix_mvn_scan))
    sigma_unobs_obs = (T.outer(is_unobserved_matrix, is_observed_matrix[:, n])) * covariance_matrix_mvn_scan
    sigma_observed_inv = T.nlinalg.MatrixInverse()(sigma_observed)

    dummy_y = T.zeros(D)
    # draw the mean vector dummy_y from N(0, wwT + sigma^2*I) using a computationally fast trick
    dummy_results, dummy_updates = theano.scan(
        lambda prior_result, sigma, zY, w_mvn_scan, zK, n:
            T.sqrt(sigma) * zY[:, n] + T.dot(w_mvn_scan, zK[:, n]) + prior_result,
        sequences=None,
        outputs_info=T.zeros(D),
        non_sequences=[sigma, zY, w_mvn_scan, zK, n],
        n_steps=R)
    dummy_y = dummy_results[-1]
    dummy_y /= R

    dummy_y_obs = is_observed_matrix[:, n] * dummy_y
    dummy_y_unobs = is_unobserved_matrix * dummy_y

    y_unobserved = dummy_y_unobs + T.dot(T.dot(sigma_unobs_obs, sigma_observed_inv),
                                         (theano_observed_matrix[:, n] - dummy_y_obs))
    # Add true_full_matrix for BETA discount method instead of subtracting infinity from observed entries
    y_unobserved = (y_unobserved * is_unobserved_matrix) + \
        (true_full_matrix[:, n] + ERROR[0]) * is_observed_matrix[:, n]

    return [y_unobserved, sigma_unobs_obs, sigma_observed_inv]
def _svi(N, D, K, mnp, masknp, w_init, r_init):
    Shared = lambda shape, name: theano.shared(
        value=np.ones(shape, dtype=theano.config.floatX), name=name, borrow=True)
    srng = T.shared_randomstreams.RandomStreams(seed=120)

    mask = Shared((D, N), 'mask')
    mask.set_value(masknp)
    m = T.as_tensor_variable(mnp)
    y = mask * m
    zero_y = T.as_tensor_variable(np.zeros((D, N)))
    zero2 = T.as_tensor_variable(np.zeros((D, D)))
    zero = T.as_tensor_variable(np.zeros(D))
    st = T.sum(T.neq(y, zero_y), axis=0)
    s = st.eval()

    # Define variational parameters
    m_w = Shared((D, K), 'm_w')
    m_w.set_value(w_init)
    s_w = Shared((D, K), "s_w")
    m_r = Shared((K), 'm_r')
    m_r.set_value(r_init)
    s_r = Shared((K), 's_r')
    m_gamma = Shared((1), 'm_gamma')
    s_gamma = Shared((1), 's_gamma')
    m_gamma0 = Shared((1), 'm_gamma0')
    s_gamma0 = Shared((1), 's_gamma0')
    m_c0 = Shared((1), 'm_c0')
    s_c0 = Shared((1), 's_c0')
    m_sigma = Shared((1), 'm_sigma')
    s_sigma = Shared((1), 's_sigma')

    # Define noise for model parameters
    z_w = srng.normal((D, K))
    z_r = srng.normal([K])
    z_gamma = srng.normal([1])
    z_gamma0 = srng.normal([1])
    z_c0 = srng.normal([1])
    z_sigma = srng.normal([1])

    # All model parameters have a log-normal variational posterior
    w = T.exp(m_w + z_w * s_w)
    r = T.exp(m_r + z_r * s_r)
    gamma = T.exp(m_gamma + z_gamma * s_gamma)
    gamma0 = T.exp(m_gamma0 + z_gamma0 * s_gamma0)
    c0 = T.exp(m_c0 + z_c0 * s_c0)
    sigma = T.exp(m_sigma + z_sigma * s_sigma)

    # Define random variables for the MVNormalScan component
    z_y = srng.normal([D])
    z_k = srng.normal([K])
    z_eps = srng.normal()

    # For data given sequentially we need a different covariance matrix for each yn
    wwT = T.dot(w, w.T)
    cov = Shared((D, D), 'cov')
    cov = wwT + sigma[0] * T.identity_like(wwT)

    return mask, m, y, zero_y, zero2, zero, st, m_w, s_w, w, m_r, s_r, r, m_gamma, s_gamma, gamma, m_gamma0, \
        s_gamma0, gamma0, m_c0, s_c0, c0, m_sigma, s_sigma, sigma, z_y, z_k, z_eps, wwT, cov
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                       cov_total, mean_total):
    sigma2 = tt.square(sigma)
    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = cov_total(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
    else:  # VFE or DTC
        Lamd = tt.ones_like(Qffd) * sigma2
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - mean_total(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(Luu, Kus)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c))
    C = solve_lower(L_B, As)
    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
        if pred_noise:
            var += sigma2
        return mu, var
    else:
        cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As)
               + tt.dot(tt.transpose(C), C))
        if pred_noise:
            cov += sigma2 * tt.identity_like(cov)
        return mu, stabilize(cov)
def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi):
    SigInv = EPhiTPhi + (sn_trf + 1e-6) * T.identity_like(EPhiTPhi)
    cholSigInv = sT.cholesky(SigInv)
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv
def calcKFAC(grad_vec, damp):
    self.grads = []
    # self.acts = [TT.concatenate([self.model.x, TT.ones((self.model.x.shape[0], 1))], axis=1)]
    self.acts = [self.model.x]
    for l in self.model.layers:
        S = TT.grad(self.loss, l.s)
        self.grads.append(S)
        self.acts.append(l.a)

    self.G = []
    self.A = []
    self.F_block = []
    self.F = []
    cnt = TT.cast(self.grads[0].shape[0], theano.config.floatX)
    for i in range(len(self.grads)):
        self.G += [[]]
        self.A += [[]]
        for j in range(len(self.grads)):
            # self.G[-1] += [TT.mean(TT.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]
            # self.A[-1] += [TT.mean(TT.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]
            # self.G[-1] += [TT.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1))]
            # self.A[-1] += [TT.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1))]
            # batch-averaged second moments of gradients and activations
            # (the original ".TT.dot" appears to be a typo for ".T.dot")
            self.G[-1] += [self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1) / cnt]
            self.A[-1] += [self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1) / cnt]
            if self.diag:
                self.G[-1][-1] *= float(i == j)
                self.A[-1][-1] *= float(i == j)

    for i in range(len(self.grads)):
        self.F_block += [[]]
        for j in range(len(self.grads)):
            # depends on whether you want to compute the real Fisher with this or the Kronecker approximation;
            # numpy-based fast_kron somehow computes 3d tensors faster than theano
            # cblock = fast_kron(self.A[i][j], self.G[i][j])
            cblock = native_kron(self.A[i][j], self.G[i][j])
            cblock = cblock.reshape(cblock.shape[1:], ndim=2)
            self.F_block[i] += [cblock]
        self.F.append(TT.concatenate(self.F_block[-1], axis=1))

    self.F = TT.concatenate(self.F, axis=0)
    self.F = (self.F + self.F.T) / 2
    self.Fdamp = self.F + TT.identity_like(self.F) * damp

    # new_grad_vec = theano.tensor.slinalg.solve(self.Fdamp, grad_vec.dimshuffle(0, 'x'))
    new_grad_vec = solve_sym_pos(self.Fdamp, grad_vec)
    # new_grad_vec = gpu_solve(self.Fdamp, grad_vec.dimshuffle(0, 'x'))
    return new_grad_vec
def build_L(self, sentences, context, activation=T.nnet.sigmoid, biased_diagonal=False):
    """Constructs the matrix L (L-Ensemble) from the paper 'Determinantal point
    processes for machine learning' (Kulesza, Taskar, 2013).

    L_mn = p_m'*p_n*(2*f(p_m'*context)-1)*(2*f(p_n'*context)-1),
    where p_m and p_n are the feature vectors of items m and n, f is the
    activation function and context the context vector.

    :type sentences: T.matrix, shape = (num_items_in_set, dim_per_item)
    :param sentences: The input matrix for the DPP. Each row encodes the
        feature vector for one item of a ground set S.

    :type context: T.vector, shape = (dim_per_item)
    :param context: The feature vector encoding the context.

    :type activation: Theano tensor function
    :param activation: An activation function (a sigmoid is recommended to get
        reasonable values).

    :type return value: T.matrix, shape = (num_items_in_set, num_items_in_set)
    :return return value: The L-Ensemble matrix L.
    """
    f_sentT_cont = activation(T.dot(sentences, context))
    f2_1 = f_sentT_cont
    B = sentences * f2_1.dimshuffle((0, 'x'))
    B_BT = T.dot(B, B.T)
    if biased_diagonal:
        return B_BT + T.identity_like(B_BT) * 0.1
    else:
        return B_BT
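# A small numpy sketch (an illustrative assumption, not taken from the snippet above)
# of how an L-ensemble like the one returned by build_L is typically used: the
# probability of a DPP selecting a subset S is proportional to det(L_S), the
# determinant of the submatrix of L indexed by S.
import numpy as np

rng = np.random.RandomState(1)
B = rng.randn(6, 4)                        # 6 items, 4-dimensional features
L = B.dot(B.T) + 0.1 * np.eye(6)
S = [0, 2, 5]                              # a candidate subset of items
unnormalized_prob = np.linalg.det(L[np.ix_(S, S)])
normalizer = np.linalg.det(L + np.eye(6))  # equals the sum of det(L_S) over all subsets
print(unnormalized_prob / normalizer)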
def get_monitoring_channels(self, model, X, Y=None, **kwargs):
    if not self.supervised:
        Y = None

    WBW = T.dot(model.W.T * model.beta, model.W)
    target = T.identity_like(WBW)
    err = WBW - target
    penalty = T.sqr(err).sum()

    log_likelihood = model.log_likelihood(X).mean()

    diag = (T.sqr(model.W) * model.beta.dimshuffle(0, 'x')).sum(axis=0)
    diag_penalty = T.sqr(diag - 1.).sum()

    rval = {
        'constraint_sum_sq_err': penalty,
        'diagonal_constraint_sum_sq_err': diag_penalty,
        'log_likelihood': log_likelihood
    }

    if self.use_admm:
        dual = model.dual
        rval['dual_min'] = dual.min()
        rval['dual_max'] = dual.max()
        rval['dual_mean'] = dual.mean()
        abs_dual = abs(dual)
        rval['abs_dual_min'] = abs_dual.min()
        rval['abs_dual_mean'] = abs_dual.mean()
        rval['abs_dual_max'] = abs_dual.max()

    return rval
def __init__(self, x, n_in, n_hidden, n_out, steps, rng=rng):
    """
    Initialize a basic single-layer RNN

    x: symbolic input tensor
    n_in: input dimensionality
    n_hidden: # of hidden units
    hidden_activation: non-linearity at hidden units (e.g. relu)
    n_out: # of output units
    steps: # of time steps to truncate BPTT at
    """
    self.Wx = _uniform_weight(n_in, n_hidden, rng)
    self.Wh = _ortho_weight(n_hidden, rng)
    self.Wy = _uniform_weight(n_hidden, n_out, rng)
    self.bh = _zero_bias(n_hidden)
    self.by = _zero_bias(n_out)
    self.params = [self.Wx, self.Wh, self.Wy, self.bh, self.by]

    def step(x_t, h_tm1, Wx, Wh, Wy, bh, by):
        h_t = relu(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh)
        y_t = relu(T.dot(h_t, Wy) + by)
        return [h_t, y_t]

    h0 = T.zeros((n_hidden, ), dtype=theano.config.floatX)
    ([h, self.output], _) = theano.scan(
        fn=step,
        sequences=x.dimshuffle([1, 0, 2]),
        outputs_info=[T.alloc(h0, x.shape[0], n_hidden), None],
        non_sequences=[self.Wx, self.Wh, self.Wy, self.bh, self.by],
        strict=True,
        truncate_gradient=steps)

    self.orthogonality = T.sum(T.sqr(T.dot(self.Wh, self.Wh.T) - T.identity_like(self.Wh)))
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                       cov_total, mean_total):
    sigma2 = tt.square(sigma)
    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = cov_total(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
    else:  # VFE or DTC
        Lamd = tt.ones_like(Qffd) * sigma2
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - mean_total(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(Luu, Kus)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c))
    C = solve_lower(L_B, As)
    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
        if pred_noise:
            var += sigma2
        return mu, var
    else:
        cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As)
               + tt.dot(tt.transpose(C), C))
        if pred_noise:
            cov += sigma2 * tt.identity_like(cov)
        return mu, cov if pred_noise else stabilize(cov)
def cal_encoder_step(self, encoder_val):
    '''
    Calculate the weight ratios in the encoder.

    :type encoder_val: class
    :param encoder_val: the class which stores the intermediate variables in the encoder

    :returns: R_h_x, R_h_h are theano variables, weight ratios in the encoder
    '''
    encoder_val.x = encoder_val.x.dimshuffle(0, 1, 'x')
    R_state_in_x = (encoder_val.x * self.input_emb + self.input_emb_offset) / \
        (self.ep * TT.sgn(encoder_val.state_in) + encoder_val.state_in).dimshuffle(0, 'x', 1)
    R_state_in_x = R_state_in_x.dimshuffle(0, 2, 1)
    R_reset_in_x = encoder_val.x * self.reset_emb / \
        (encoder_val.reset_in + self.ep * TT.sgn(encoder_val.reset_in)).dimshuffle(0, 'x', 1)
    R_reset_in_x = R_reset_in_x.dimshuffle(0, 2, 1)
    R_gate_in_x = encoder_val.x * self.gate_emb / \
        (encoder_val.gate_in + self.ep * TT.sgn(encoder_val.gate_in)).dimshuffle(0, 'x', 1)
    R_gate_in_x = R_gate_in_x.dimshuffle(0, 2, 1)

    h_before = encoder_val.h_before.dimshuffle(0, 1, 'x')
    R_gate_h = h_before * self.gate_hidden / \
        (encoder_val.gate + self.ep * TT.sgn(encoder_val.gate)).dimshuffle(0, 'x', 1)
    R_gate_x = R_gate_in_x * (encoder_val.gate_in /
                              (encoder_val.gate + self.ep * TT.sgn(encoder_val.gate))).dimshuffle(0, 1, 'x')
    R_reset_h = h_before * self.reset_hidden / \
        (encoder_val.reset + self.ep * TT.sgn(encoder_val.reset)).dimshuffle(0, 'x', 1)
    R_reset_x = R_reset_in_x * (encoder_val.reset_in /
                                (encoder_val.reset + self.ep * TT.sgn(encoder_val.reset))).dimshuffle(0, 1, 'x')
    R_reseted_h = R_reset_h * self.weight + TT.eye(self.dim, self.dim) * self.weight
    R_reseted_x = R_reset_x * self.weight

    encoder_val.reseted = encoder_val.reseted.dimshuffle(0, 1, 'x')
    R_state_reseted = encoder_val.reseted * self.input_hidden / \
        (encoder_val.state + self.ep * TT.sgn(encoder_val.state)).dimshuffle(0, 'x', 1)
    R_state_reseted = R_state_reseted.dimshuffle(0, 2, 1)
    R_state_h = TT.batched_dot(R_state_reseted, R_reseted_h)
    R_state_x = TT.batched_dot(R_state_reseted, R_reseted_x)
    R_state_x += R_state_in_x * ((encoder_val.state_in /
                                  (encoder_val.state + self.ep * TT.sgn(encoder_val.state))).dimshuffle(0, 1, 'x'))

    R_h = (encoder_val.gate * encoder_val.state /
           (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(0, 1, 'x') * self.weight
    R_h_h = R_state_h * R_h + R_gate_h * R_h
    R_h2 = ((1 - encoder_val.gate) * encoder_val.h_before /
            (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(0, 1, 'x')
    R_h_h = TT.identity_like(R_h_h[0]) * R_h2
    R_h_x = R_gate_x * R_h + R_state_x * R_h
    return R_h_x, R_h_h
def loss(lv1, lv2):
    """ Contrastive cosine distance optimization target """
    # number of samples in batch
    n = lv1.shape[0]

    # compute cosine similarity
    D = lv1.dot(lv2.T)

    # apply arc cosine -> converts similarity into distance
    D = T.arccos(D)

    # distance between matching pairs
    d = D.diagonal().reshape((-1, 1))

    # distance between non-matching pairs
    M = T.identity_like(D)
    O = D[(M <= 0).nonzero()].reshape((n, n - 1))

    # max margin hinge loss
    L = gamma + d
    L = T.repeat(L, n - 1, 1)
    L -= O
    L = T.clip(L, 0, 1000)

    # compute batch mean
    loss = L.mean()

    return weight * loss
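# A minimal usage sketch for the contrastive loss above; it is an assumption, not part
# of the original snippet: `gamma` (margin) and `weight` are free variables of loss(),
# so we bind them here, and the two views are L2-normalized so that lv1.dot(lv2.T)
# stays within the domain of arccos.
import numpy as np
import theano
import theano.tensor as T

gamma, weight = 0.5, 1.0

lv1, lv2 = T.fmatrix('lv1'), T.fmatrix('lv2')
pair_loss = theano.function([lv1, lv2], loss(lv1, lv2), allow_input_downcast=True)

rng = np.random.RandomState(0)
a = rng.randn(8, 32)
b = a + 0.1 * rng.randn(8, 32)                  # roughly matching second view
a /= np.linalg.norm(a, axis=1, keepdims=True)
b /= np.linalg.norm(b, axis=1, keepdims=True)
print(pair_loss(a, b))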
def batch_jacobian(f, wrt, size=None, *args, **kwargs):
    """Computes the jacobian of f(x) w.r.t. x in parallel.

    Args:
        f: Symbolic function.
        wrt: Variables to differentiate with respect to.
        size: Expected vector size of f(x).
        *args: Additional positional arguments to pass to `f()`.
        **kwargs: Additional key-word arguments to pass to `f()`.

    Returns:
        Theano tensor.
    """
    if isinstance(wrt, T.TensorVariable):
        if size is None:
            size = f(wrt, *args, **kwargs).shape[-1]
        x_rep = T.tile(wrt, (size, 1))
        y_rep = f(x_rep, *args, **kwargs)
    else:
        if size is None:
            size = f(*wrt, *args, **kwargs).shape[-1]
        x_rep = [T.tile(x, (size, 1)) for x in wrt]
        y_rep = f(*x_rep, *args, **kwargs)

    J = T.grad(
        cost=None,
        wrt=x_rep,
        known_grads={y_rep: T.identity_like(y_rep)},
        disconnected_inputs="ignore",
    )
    return J
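# A hedged usage sketch for batch_jacobian (the toy function and shapes below are
# illustrative assumptions, not from the original code). Each row of the returned
# matrix is the gradient of one output of f with respect to the (tiled) input row.
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.arange(6.0).reshape(3, 2))   # maps R^3 -> R^2
f = lambda v: T.tanh(v.dot(W))

x = T.dmatrix('x')                   # a single input row, shape (1, 3)
J = batch_jacobian(f, x, size=2)     # shape (2, 3): row i = d f_i / d x
jac_fn = theano.function([x], J)
print(jac_fn(np.ones((1, 3))))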
def __init__(self, steps=1, num_layers=2, num_units=32, eps=1e-2):
    self.X, self.Z = T.fvectors('X', 'Z')
    self.P, self.Q, self.R = T.fmatrices('P', 'Q', 'R')
    self.dt = T.scalar('dt')
    self.matrix_inv = T.nlinalg.MatrixInverse()

    self.ar = AutoRegressiveModel(steps=steps, num_layers=num_layers,
                                  num_units=num_units, eps=eps)

    l = InputLayer(input_var=self.X, shape=(steps,))
    l = ReshapeLayer(l, shape=(1, steps,))
    l = self.ar.network(l)
    l = ReshapeLayer(l, shape=(1,))
    self.l_ = l
    self.f_ = get_output(self.l_)

    self.X_ = T.concatenate([self.f_, T.dot(T.eye(steps)[:-1], self.X)], axis=0)
    self.fX_ = G.jacobian(self.X_.flatten(), self.X)
    self.P_ = T.dot(T.dot(self.fX_, self.P), T.transpose(self.fX_)) + \
        T.dot(T.dot(T.eye(steps)[:, 0:1], self.dt * self.Q), T.eye(steps)[0:1, :])

    self.h = T.dot(T.eye(steps)[0:1], self.X_)
    self.y = self.Z - self.h
    self.hX_ = G.jacobian(self.h, self.X_)
    self.S = T.dot(T.dot(self.hX_, self.P_), T.transpose(self.hX_)) + self.R
    self.K = T.dot(T.dot(self.P_, T.transpose(self.hX_)), self.matrix_inv(self.S))
    self.X__ = self.X_ + T.dot(self.K, self.y)
    self.P__ = T.dot(T.identity_like(self.P) - T.dot(self.K, self.hX_), self.P_)

    self.prediction = theano.function(inputs=[self.X, self.P, self.Q, self.dt],
                                      outputs=[self.X_, self.P_],
                                      allow_input_downcast=True)
    self.update = theano.function(inputs=[self.X, self.Z, self.P, self.Q, self.R, self.dt],
                                  outputs=[self.X__, self.P__],
                                  allow_input_downcast=True)
def ncac(target, embedding):
    """Return the sample wise NCA for classification method.

    This corresponds to the probability that a point is correctly classified
    with a soft knn classifier using leave-one-out. Each neighbour is weighted
    according to an exponential of its negative Euclidean distance. Afterwards,
    a probability is calculated for each class depending on the weights of the
    neighbours.

    For details, we refer you to 'Neighbourhood Component Analysis' by
    J Goldberger, S Roweis, G Hinton, R Salakhutdinov (2004).

    :param target: An array of shape `(n,)` where `n` is the number of samples.
        Each entry of the array should be an integer between `0` and `k-1`,
        where `k` is the number of classes.
    :param embedding: An array of shape `(n, d)` where each row represents a
        point in d dimensional space.
    :returns: Array of shape `(n, 1)`.
    """
    # Matrix of the distances of points.
    dist = distance_matrix(embedding)
    thisid = T.identity_like(dist)

    # Probability that a point is neighbour of another point based on
    # the distances.
    top = T.exp(-dist) + 1e-8  # Add a small constant for stability.
    bottom = (top - thisid * top).sum(axis=0)
    p = top / bottom

    # Create a matrix that matches same classes.
    sameclass = T.eq(distance_matrix(target), 0) - thisid
    loss_vector = -(p * sameclass).sum(axis=1)
    # To be compatible with the API, we make this a (n, 1) matrix.
    return T.shape_padright(loss_vector)
def get_opt_A(self, tau, EPhiTPhi, YT_EPhi):
    SigInv = EPhiTPhi + (tau**-1 + 1e-4) * T.identity_like(EPhiTPhi)
    cholTauSigInv = tau**0.5 * sT.cholesky(SigInv)
    invCholTauSigInv = sT.matrix_inverse(cholTauSigInv)
    tauInvSig = invCholTauSigInv.T.dot(invCholTauSigInv)
    Sig_EPhiT_Y = tau * tauInvSig.dot(YT_EPhi.T)
    return Sig_EPhiT_Y, tauInvSig, cholTauSigInv
def ncar(target, embedding):
    """Return the NCA for regression loss.

    This is similar to NCA for classification, except that not soft KNN
    classification but regression performance is maximized. (Actually, the
    negative performance is minimized.)

    For details, we refer you to 'Pose-sensitive embedding by nonlinear nca
    regression' by Taylor, G. and Fergus, R. and Williams, G. and Spiro, I.
    and Bregler, C. (2010)

    Parameters
    ----------

    target : Theano variable
        An array of shape ``(n, d)`` where ``n`` is the number of samples and
        ``d`` the dimensionality of the target space.

    embedding : Theano variable
        An array of shape ``(n, d)`` where each row represents a point in
        ``d``-dimensional space.

    Returns
    -------

    res : Theano variable
        Array of shape ``(n, 1)``.
    """
    # Matrix of the distances of points.
    dist = distance_matrix(embedding) ** 2
    thisid = T.identity_like(dist)

    # Probability that a point is neighbour of another point based on
    # the distances.
    top = T.exp(-dist) + 1E-8  # Add a small constant for stability.
    bottom = (top - thisid * top).sum(axis=0)
    p = top / bottom

    # Create matrix of distances.
    target_distance = distance_matrix(target, target, 'soft_l1')
    # Set diagonal to 0.
    target_distance -= target_distance * T.identity_like(target_distance)

    loss_vector = (p * target_distance ** 2).sum(axis=1)
    # To be compatible with the API, we make this a (n, 1) matrix.
    return T.shape_padright(loss_vector)
def get_model(self, X, Y, x_test):
    '''
    Gaussian Process Regression model.

    Reference: C.E. Rasmussen, "Gaussian Process for Machine Learning", MIT Press 2006

    Args:
        X: tensor matrix, training data
        Y: tensor matrix, training target
        x_test: tensor matrix, testing data

    Returns:
        K: prior cov matrix
        Ks: prior joint cov matrix
        Kss: prior cov matrix for testing data

        Posterior Distribution:
            alpha: alpha = inv(K)*(mu-m)
            sW: vector containing diagonal of sqrt(W)
            L: L = chol(sW*K*sW+eye(n))

        y_test_mu: predictive mean
        y_test_var: predictive variance
        fs2: predictive latent variance

    Note: the cov matrix inverse is computed through Cholesky factorization
    https://makarandtapaswi.wordpress.com/2011/07/08/cholesky-decomposition-for-matrix-inversion/
    '''
    # Compute GP prior distribution: mean and covariance matrices (eq 2.13, 2.14)
    K = self.covFunc(X, X, 'K')  # prior cov
    # m = T.mean(Y)*T.ones_like(Y)  # prior mean
    m = self.mean * T.ones_like(Y)  # prior mean

    # Compute GP joint prior distribution between training and test (eq 2.18)
    Ks = self.covFunc(X, x_test, 'Ks')
    # Pay attention!! here is the self test cov matrix.
    Kss = self.covFunc(x_test, x_test, 'Kss', mode='self_test')

    # Compute posterior distribution with noise: L, alpha, sW, and log_likelihood.
    sn2 = T.exp(2 * self.sigma_n)  # noise variance of likGauss
    L = sT.cholesky(K / sn2 + T.identity_like(K))
    sl = sn2
    alpha = T.dot(sT.matrix_inverse(L.T), T.dot(sT.matrix_inverse(L), (Y - m))) / sl
    sW = T.ones_like(T.sum(K, axis=1)).reshape((K.shape[0], 1)) / T.sqrt(sl)
    log_likelihood = T.sum(-0.5 * (T.dot((Y - m).T, alpha)) - T.sum(T.log(T.diag(L)))
                           - X.shape[0] / 2 * T.log(2. * np.pi * sl))

    # Compute predictive distribution using the computed posterior distribution.
    fmu = m + T.dot(Ks.T, alpha)  # Prediction mean fs|f, eq 2.25
    V = T.dot(sT.matrix_inverse(L), T.extra_ops.repeat(sW, x_test.shape[0], axis=1) * Ks)
    fs2 = Kss - (T.sum(V * V, axis=0)).reshape((1, V.shape[1])).T  # Prediction sigma, eq 2.26
    fs2 = T.maximum(fs2, 0)  # remove negative variance noise
    # fs2 = T.sum(fs2, axis=1)  # in case x has multiple dimensions

    y_test_mu = fmu
    y_test_var = fs2 + sn2

    return K, Ks, Kss, y_test_mu, y_test_var, log_likelihood, L, alpha, V, fs2, sW
def onestep_attend_copy():
    i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi)
    i_t_shape = T.shape(i_t)

    bi_reshape = T.repeat(bi, i_t_shape[0], 0)
    bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1)

    bf_reshape = T.repeat(bf, i_t_shape[0], 0)
    bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1)

    bc_reshape = T.repeat(bc, i_t_shape[0], 0)
    bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1)

    bo_reshape = T.repeat(bo, i_t_shape[0], 0)
    bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1)

    i_t_new = sigmoid(i_t + bi_reshape_2x)
    f_t = sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x)
    o_t = sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x)
    c_th = tanh(T.dot(x_t, Wc) + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x)

    c_t = f_t * pre_c + i_t_new * c_th
    h_t = o_t * T.tanh(c_t)  # shape (1, N, h_dim)

    h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis=0)  # new shape (No_region, N, h_dim)
    image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2))

    # compute non-linear correlation between h_t (current text) and image_feature_region
    # (64 for 128*128 and 196 for 224*224)
    # pdb.set_trace()
    m_t = T.tanh(T.dot(h_t_context, Hcontext) + T.dot(image_feature_reshape, Zcontext))  # shape (No_region, N, context_dim)
    e = T.dot(m_t, Va)  # No_region, N, 1

    e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:])))
    e_softmax = softmax_along_axis(e_reshape, axis=0)  # shape No_region, N

    e_t = T.transpose(e_softmax, (1, 0))  # shape N, No_region
    e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]])  # 3D tensor 1, N, No_region
    e_t_r_t = T.transpose(e_t_r, (1, 0, 2))  # shape N, 1, No_region
    e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis=1)  # shape N, No_region, No_region (No_region = image_feature_region.shape[1])
    e_3D_t = T.transpose(e_3D, (1, 2, 0))  # No_region, No_region, N

    identity_2D = T.identity_like(e_3D_t)  # shape No_region, No_region
    identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]])  # shape 1, No_region, No_region
    identity_3D_t = T.repeat(identity_3D, image_feature_region.shape[0], axis=0)
    e_3D_diagonal = e_3D * identity_3D_t  # diagonal tensor 3D (N, No_region, No_region)

    out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply,
                                        outputs_info=[weight_y],
                                        sequences=[e_3D_diagonal, image_feature_region],
                                        non_sequences=[])

    z_t = T.sum(out_weight_y, axis=1)  # shape (N, feature_dim)
    z_t_r = z_t.reshape((-1, z_t.shape[0], z_t.shape[1]))

    return [h_t, c_t, z_t_r]
def ldet(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
         STA=Th.dvector('STA'), STC=Th.dmatrix('STC'), **other):
    '''
    Return log-det of I-sym(M), for display/debugging purposes.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    w, v = eig(ImM)
    return Th.sum(Th.log(w))
def correlation(self, H1, H2, m):
    H1bar = H1
    H2bar = H2

    SigmaHat12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)
    SigmaHat11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
    SigmaHat11 = SigmaHat11 + self.r1 * T.identity_like(SigmaHat11)
    SigmaHat22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
    SigmaHat22 = SigmaHat22 + self.r2 * T.identity_like(SigmaHat22)

    Tval = T.dot(SigmaHat11 ** (-0.5), T.dot(SigmaHat12, SigmaHat22 ** (-0.5)))

    corr = T.nlinalg.trace(T.dot(Tval.T, Tval)) ** (0.5)

    self.SigmaHat11 = SigmaHat11
    self.SigmaHat12 = SigmaHat12
    self.SigmaHat22 = SigmaHat22
    self.H1bar = H1bar
    self.H2bar = H2bar
    self.Tval = Tval

    return -1 * corr
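# Side note with a numpy sketch (toy data and regularizers are assumptions): in the
# CCA quantity T = Sigma11^{-1/2} Sigma12 Sigma22^{-1/2} that this loss is based on,
# the -1/2 power denotes a matrix inverse square root, which can be formed from an
# eigendecomposition as below.
import numpy as np

def inv_sqrtm(S, eps=1e-12):
    """Inverse matrix square root of a symmetric positive-definite matrix."""
    w, V = np.linalg.eigh(S)
    return V.dot(np.diag(1.0 / np.sqrt(np.maximum(w, eps)))).dot(V.T)

rng = np.random.RandomState(0)
H1, H2 = rng.randn(5, 100), rng.randn(5, 100)
S11 = H1.dot(H1.T) / 99 + 1e-4 * np.eye(5)
S22 = H2.dot(H2.T) / 99 + 1e-4 * np.eye(5)
S12 = H1.dot(H2.T) / 99
Tval = inv_sqrtm(S11).dot(S12).dot(inv_sqrtm(S22))
corr = np.sqrt(np.trace(Tval.T.dot(Tval)))   # root of the sum of squared canonical correlations
print(corr)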
def orthogonality(x):
    '''
    Penalty for deviation from orthogonality:

    ||dot(x.T, x) - I||**2
    '''
    xTx = T.dot(x.T, x)
    return T.sum(T.square(xTx - T.identity_like(xTx)))
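# A minimal usage sketch (the weight shape below is an assumption): the penalty is
# close to 0 for a matrix with orthonormal columns and grows as the columns correlate.
import numpy as np
import theano

q, _ = np.linalg.qr(np.random.RandomState(0).randn(6, 3))   # orthonormal columns
W = theano.shared(q)
penalty_fn = theano.function([], orthogonality(W))
print(penalty_fn())                                          # close to 0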
def eigs(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
         STA=Th.dvector('STA'), STC=Th.dmatrix('STC'), **other):
    '''
    Return eigenvalues of I-sym(M), for display/debugging purposes.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    w, v = eig(ImM)
    return w
def inner_lda_objective(y_true, y_pred):
    """
    It is the loss function of LDA as introduced in the original paper.
    It is adopted from the original implementation in the following link:
    https://github.com/CPJKU/deep_lda
    Note: it is implemented by Theano tensor operations, and does not work on Tensorflow backend
    """
    r = 1e-4

    # init groups
    yt = T.cast(y_true.flatten(), "int32")
    groups = numpy_unique(yt)

    def compute_cov(group, Xt, yt):
        Xgt = Xt[T.eq(yt, group).nonzero()[0], :]
        Xgt_bar = Xgt - T.mean(Xgt, axis=0)
        m = T.cast(Xgt_bar.shape[0], 'float32')
        return (1.0 / (m - 1)) * T.dot(Xgt_bar.T, Xgt_bar)

    # scan over groups
    covs_t, _ = theano.scan(
        fn=compute_cov,
        outputs_info=None,
        sequences=[groups],
        non_sequences=[y_pred, yt],
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
        mode='DebugMode')

    # compute average covariance matrix (within scatter)
    Sw_t = T.mean(covs_t, axis=0)

    # compute total scatter
    Xt_bar = y_pred - T.mean(y_pred, axis=0)
    m = T.cast(Xt_bar.shape[0], 'float32')
    St_t = (1.0 / (m - 1)) * T.dot(Xt_bar.T, Xt_bar)

    # compute between scatter
    Sb_t = St_t - Sw_t

    # cope for numerical instability (regularize)
    Sw_t += T.identity_like(Sw_t) * r

    # return T.cast(T.neq(yt[0], -1), 'float32')*T.nlinalg.trace(T.dot(T.nlinalg.matrix_inverse(St_t), Sb_t))

    # compute eigenvalues
    evals_t = T.slinalg.eigvalsh(Sb_t, Sw_t)

    # get eigenvalues
    top_k_evals = evals_t[-n_components:]

    # maximize variance between classes
    # (k smallest eigenvalues below threshold)
    thresh = T.min(top_k_evals) + margin
    top_k_evals = top_k_evals[(top_k_evals <= thresh).nonzero()]
    costs = T.mean(top_k_evals)

    return -costs
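# A small numpy sketch (toy data, an assumption for illustration) of the scatter
# matrices the objective above builds: within-class scatter Sw, total scatter St,
# between-class scatter Sb = St - Sw, and the generalized eigenvalues of (Sb, Sw)
# that the loss maximizes.
import numpy as np
from scipy.linalg import eigh

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 4) + c for c in range(3)])   # 3 classes in 4-d
y = np.repeat(np.arange(3), 20)

Sw = np.mean([np.cov(X[y == c].T) for c in range(3)], axis=0)
St = np.cov(X.T)
Sb = St - Sw
evals = eigh(Sb, Sw + 1e-4 * np.eye(4), eigvals_only=True)
print(evals[-2:])   # the top eigenvalues LDA tries to make large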
def _calc_caylay_delta(step_size, param, gradient):
    A = Tensor.dot(((step_size / 2) * gradient).T, param) - \
        Tensor.dot(param.T, ((step_size / 2) * gradient))
    I = Tensor.identity_like(A)
    temp = I + A
    # Q = Tensor.dot(batched_inv(temp.dimshuffle('x', 0, 1))[0], (I - A))
    Q = Tensor.dot(matrix_inverse(temp), I - A)
    update = Tensor.dot(param, Q)
    delta = (step_size / 2) * Tensor.dot((param + update), A)
    return update, delta
def get_output_singlesample(self, M):
    """
    Given a molecule tensor M, calculate its fingerprint.
    """
    # if incoming tensor M has padding
    # remove padding first
    # this is the part getting slow-down
    if self.padding:
        rowsum = M.sum(axis=0)
        trim = rowsum[:, -1]
        trim_to = T.eq(trim, 0).nonzero()[0][0]  # first index with no bonds
        M = M[:trim_to, :trim_to, :]  # reduced graph

    # dimshuffle to get diagonal items to
    # form atom matrix A
    (A_tmp, updates) = theano.scan(lambda x: x.diagonal(),
                                   sequences=M[:, :, :-1].dimshuffle((2, 0, 1)))
    # Now the attributes is (N_features x N_atom), so we need to transpose
    A = A_tmp.T

    # get connectivity matrix: N_atom * N_atom
    C = M[:, :, -1] + T.identity_like(M[:, :, -1])

    # get bond tensor: N_atom * N_atom * (N_features-1)
    B_tmp = M[:, :, :-1] - A
    coeff = K.concatenate([M[:, :, -1:]] * self.inner_dim, axis=2)
    B = merge([B_tmp, coeff], mode="mul")

    # Get initial fingerprint
    presum_fp = self.attributes_to_fp_contribution(A, 0)
    fp_all_depth = presum_fp

    # Iterate through different depths, updating atom matrix each time
    A_new = A
    for depth in range(self.depth):
        temp = K.dot(K.dot(C, A_new) + K.sum(B, axis=1), self.W_inner[depth + 1, :, :]) \
            + self.b_inner[depth + 1, 0, :]
        if self.dropout_rate_inner != 0.0:
            mask = K.variable(np.ones(shape=(self.padding_final_size, self.inner_dim),
                                      dtype=np.float32))
            self.mask_inner.append(mask)
            n_atom = K.shape(temp)[0]
            temp *= mask[:n_atom, :]
        A_new = self.activation_inner(temp)
        presum_fp_new = self.attributes_to_fp_contribution(A_new, depth + 1)
        fp_all_depth = fp_all_depth + presum_fp_new

    fp = K.sum(fp_all_depth, axis=0)  # sum across atom contributions

    return fp
def compute(self, symmetric_double_encoder, params):
    regularization = 0

    for layer in symmetric_double_encoder:
        OutputLog().write('Adding orthonormal regularization for layer')

        Wy_Square = Tensor.dot(layer.Wy.T, layer.Wy)
        Wx_Square = Tensor.dot(layer.Wx.T, layer.Wx)

        regularization += Tensor.sum((Wy_Square - Tensor.identity_like(Wy_Square)) ** 2,
                                     dtype=Tensor.config.floatX)
        regularization += Tensor.sum((Wx_Square - Tensor.identity_like(Wx_Square)) ** 2,
                                     dtype=Tensor.config.floatX)

    OutputLog().write('Computing regularization')

    regularization -= self._zeroing_param

    return regularization * (self.weight / 2) * (regularization > 0)
def eig_pos_barrier(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
                    STA=Th.dvector('STA'), STC=Th.dmatrix('STC'),
                    U=Th.dmatrix('U'), V1=Th.dvector('V1'), **other):
    '''
    A barrier enforcing that the log-det of M should be > exp(-6),
    and all the eigenvalues of M > 0.  Returns true if barrier is violated.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    w, v = eig(ImM)
    return 1 - (Th.sum(Th.log(w)) > -250) * (Th.min(w) > 0) * (Th.min(V1.flatten()) > 0) \
def correlation(self, H1, H2):
    # H1 = self.output.T
    m = 10000

    H1bar = H1  # - (1.0/m)*T.dot(H1, T.shared(numpy.ones((m,m))))
    H2bar = H2  # - (1.0/m)*T.dot(H1, T.ones_like(numpy.ones((m,m))))

    SigmaHat12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)
    SigmaHat11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
    SigmaHat11 = SigmaHat11 + self.r1 * T.identity_like(SigmaHat11)
    SigmaHat22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
    SigmaHat22 = SigmaHat22 + self.r2 * T.identity_like(SigmaHat22)

    Tval = T.dot(SigmaHat11 ** (-0.5), T.dot(SigmaHat12, SigmaHat22 ** (-0.5)))

    corr = T.nlinalg.trace(T.dot(Tval.T, Tval)) ** (0.5)

    self.SigmaHat11 = SigmaHat11
    self.SigmaHat12 = SigmaHat12
    self.SigmaHat22 = SigmaHat22
    self.H1bar = H1bar
    self.H2bar = H2bar
    self.Tval = Tval

    return -1 * corr
def compute(self, symmetric_double_encoder, params):
    regularization = 0
    layer_number = len(symmetric_double_encoder)

    for ndx, layer in enumerate(symmetric_double_encoder):
        hidden_x = layer.output_forward_y
        hidden_y = layer.output_forward_x

        cov_x = Tensor.dot(hidden_x.T, hidden_x)
        cov_y = Tensor.dot(hidden_y.T, hidden_y)

        gama = (ndx / layer_number)

        regularization += gama * 0.5 * nlinalg.trace(cov_x - Tensor.identity_like(cov_x))
        regularization += (1 - gama) * 0.5 * nlinalg.trace(cov_y - Tensor.identity_like(cov_y))

    return regularization
def orthogonal_pools(W, pool_size):
    """
    Returns the orthogonality penalty ||W^T W - I||.

    :param W: T.matrix, storing filters in column format
    """
    (n_v, n_h) = W.shape
    n_pools = n_h / pool_size
    W3 = T.reshape(W.T, (n_pools, pool_size, n_v), ndim=3)
    W3T = W3.dimshuffle([0, 2, 1])
    WTW = blas.gpu_gemm_batched(W3, W3T)
    I = T.shape_padleft(T.identity_like(WTW[0]))
    penalty = T.sum((WTW - I) ** 2)
    return penalty
def predict(self, X1, y1, X2):
    cov_train = self.compute_cov_s(X1, self.N)
    cov_test = self.compute_cov_s(X2, self.M)
    cov_te_tr = self.compute_cov(X1, X2, self.N, self.M)
    cov_tr_te = cov_te_tr.T
    # matrix inverse of the (noisy) training covariance; note that T.inv is the
    # elementwise reciprocal, not the matrix inverse the GP posterior needs
    arg0 = T.nlinalg.matrix_inverse(cov_train + self.noise**2 * T.identity_like(cov_train))
    # arg0 = T.nlinalg.matrix_inverse(cov_train)
    arg1 = T.dot(cov_te_tr, arg0)
    mu = T.dot(arg1, y1)
    sigma = cov_test - T.dot(arg1, cov_tr_te)
    return mu, T.diag(sigma)
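# A compact numpy sketch (toy RBF kernel and data are assumptions) of the same GP
# posterior formulas: mu = K_*x (K_xx + s^2 I)^{-1} y and
# Sigma = K_** - K_*x (K_xx + s^2 I)^{-1} K_x*.
import numpy as np

def rbf(a, b, ell=1.0):
    d = a[:, None] - b[None, :]
    return np.exp(-0.5 * (d / ell) ** 2)

x = np.linspace(0, 5, 20)
y = np.sin(x)
xs = np.linspace(0, 5, 50)
noise = 0.1

Kxx = rbf(x, x) + noise ** 2 * np.eye(len(x))
Ksx = rbf(xs, x)
Kss = rbf(xs, xs)

mu = Ksx.dot(np.linalg.solve(Kxx, y))
var = np.diag(Kss - Ksx.dot(np.linalg.solve(Kxx, Ksx.T)))
print(mu[:3], var[:3])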
def LNLEP(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
          STA=Th.dvector('STA'), STC=Th.dmatrix('STC'),
          N_spike=Th.dscalar('N_spike'), **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M,
    without any barriers or priors.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    ldet = logdet(ImM)  # Th.log( det( ImM) )  # logdet(ImM)
    return -0.5 * N_spike * (ldet
                             - Th.sum(Th.dot(matrix_inverse(ImM), theta) * theta)
                             + 2. * Th.sum(theta * STA)
                             + Th.sum(M * (STC + Th.outer(STA, STA))))
def zero_diagonal(X):
    """Given a square matrix ``X``, return a theano variable with the diagonal
    of ``X`` set to zero.

    Parameters
    ----------

    X : theano 2d tensor

    Returns
    -------

    Y : theano 2d tensor"""
    thisid = T.identity_like(X)
    return (X - thisid * X)
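# A minimal check of zero_diagonal (the matrix values are arbitrary assumptions).
import numpy as np
import theano
import theano.tensor as T

X = T.dmatrix('X')
f = theano.function([X], zero_diagonal(X))
print(f(np.arange(9.0).reshape(3, 3)))   # same matrix with its diagonal zeroed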
def objective(Xt, yt):
    """ DeepLDA optimization target """

    # init groups
    groups = T.arange(0, n_classes)

    def compute_cov(group, Xt, yt):
        """ Compute class covariance matrix for group """
        Xgt = Xt[T.eq(yt, group).nonzero()]
        Xgt_bar = Xgt - T.mean(Xgt, axis=0)
        m = T.cast(Xgt_bar.shape[0], 'float32')
        return (1.0 / (m - 1)) * T.dot(Xgt_bar.T, Xgt_bar)

    # scan over groups
    covs_t, updates = theano.scan(fn=compute_cov, outputs_info=None,
                                  sequences=[groups], non_sequences=[Xt, yt])

    # compute average covariance matrix (within scatter)
    Sw_t = T.mean(covs_t, axis=0)

    # compute total scatter
    Xt_bar = Xt - T.mean(Xt, axis=0)
    m = T.cast(Xt_bar.shape[0], 'float32')
    St_t = (1.0 / (m - 1)) * T.dot(Xt_bar.T, Xt_bar)

    # compute between scatter
    Sb_t = St_t - Sw_t

    # cope for numerical instability (regularize)
    Sw_t += T.identity_like(Sw_t) * r

    # compute eigenvalues
    evals_t = slinalg.eigvalsh(Sb_t, Sw_t)

    # get eigenvalues
    top_k_evals = evals_t[-n_components:]

    # maximize variance between classes
    # (k smallest eigenvalues below threshold)
    thresh = T.min(top_k_evals) + 1.0
    top_k_evals = top_k_evals[(top_k_evals <= thresh).nonzero()]
    costs = -T.mean(top_k_evals)

    return costs
def get_updates(h, c, U, V, d, bias=1e-5, decomposition="svd", zca=True):
    updates = []
    checks = []

    # theano applies updates in parallel, so all updates are in terms
    # of the old values.  use this and assign the return value, i.e.
    # x = update(x, foo()).  x is then a non-shared variable that
    # refers to the updated value.
    def update(variable, new_value):
        updates.append((variable, new_value))
        return new_value

    # compute canonical parameters
    W = T.dot(U, V)
    b = d - T.dot(c, W)

    # update estimates of c, U
    c = update(c, h.mean(axis=0))
    U = update(U, whiten_by[decomposition](h - c, bias, zca))

    # check that the new covariance is indeed identity
    n = h.shape[0].astype(theano.config.floatX)
    covar = T.dot((h - c).T, (h - c)) / (n - 1)
    whiteh = T.dot(h - c, U)
    whitecovar = T.dot(whiteh.T, whiteh) / (n - 1)
    checks.append(
        PdbBreakpoint("correlated after whitening")
        (1 - T.allclose(whitecovar, T.identity_like(whitecovar), rtol=1e-3, atol=1e-3),
         c, U, covar, whitecovar, h)[0])

    # adjust V, d so that the total transformation is unchanged
    # (lstsq is much more stable than T.inv)
    V = update(V, util.lstsq()(U, W, -1)[0])
    d = update(d, b + T.nlinalg.matrix_dot(c, U, V))

    # check that the total transformation is unchanged
    before = b + T.dot(h, W)
    after = d + T.nlinalg.matrix_dot(h - c, U, V)
    checks.append(
        PdbBreakpoint("transformation changed")
        (1 - T.allclose(before, after, rtol=1e-3, atol=1e-3),
         T.constant(0.0), W, b, c, U, V, d, h, before, after)[0])

    return updates, checks
def quadratic_Poisson(theta=Th.dvector('theta'), M=Th.dmatrix('M'),
                      STA=Th.dvector('STA'), STC=Th.dmatrix('STC'),
                      N_spike=Th.dscalar('N_spike'), logprior=0, **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M,
    with a barrier on the log-det term and a prior.
    '''
    ImM = Th.identity_like(M) - (M + M.T) / 2
    ldet = logdet(ImM)  # Th.log( det( ImM) )  # logdet(ImM)
    return -0.5 * N_spike * (ldet + logprior
                             - 1. / (ldet + 250.)**2.
                             - Th.sum(Th.dot(matrix_inverse(ImM), theta) * theta)
                             + 2. * Th.sum(theta * STA)
                             + Th.sum(M * (STC + Th.outer(STA, STA))))
def test_grad_W(self):
    """tests that the gradient of the log probability with respect to W
    matches my analytical derivation"""

    # self.model.set_param_values(self.new_params)

    g = T.grad(self.prob, self.model.W, consider_constant=self.mf_obs.values())

    B = self.model.B
    W = self.model.W
    mean_hsv = self.stats.d['mean_hsv']
    mean_sq_hs = self.stats.d['mean_sq_hs']
    mean_HS = self.mf_obs['H_hat'] * self.mf_obs['S_hat']
    m = mean_HS.shape[0]

    outer_prod = T.dot(mean_HS.T, mean_HS)
    outer_prod.name = 'outer_prod<from_observations>'
    outer = outer_prod / m
    mask = T.identity_like(outer)
    second_hs = (1. - mask) * outer + alloc_diag(mean_sq_hs)

    term1 = (B * mean_hsv).T
    term2 = - B.dimshuffle(0, 'x') * T.dot(W, second_hs)

    analytical = term1 + term2

    f = function([], (g, analytical))

    gv, av = f()

    assert gv.shape == av.shape

    max_diff = np.abs(gv - av).max()

    if max_diff > self.tol:
        print "gv"
        print gv
        print "av"
        print av

        raise Exception("analytical gradient on W deviates from theano gradient on W by up to " + str(max_diff))
def dex_cost(self, I, dex_lam=0.00):
    """
    Simple exemplar-svm-like function to optimize.

    This loss is based on unnormalized grounded density estimation via
    Negative Sampling / Noise-Contrastive Estimation.
    """
    # assert(I.shape[0] == self.X_in.shape[0])
    Wt = T.take(self.W, I, axis=0)
    bt = T.take(self.b, I)
    k = I.size - 1
    F = T.dot(self.X_in, Wt.T) + bt
    # F = T.dot(self.X_in, self.X_in.T)
    mask = T.ones_like(F) - T.identity_like(F)
    dex_loss = T.sum((mask * F) + T.log(1.0 + k * T.exp(-F))) / (k + 1)
    reg_loss = dex_lam * T.sum(F ** 2.0) / (k + 1)
    C = dex_loss + reg_loss
    self.dW = T.grad(C, Wt)
    self.db = T.grad(C, bt)
    return C
def __call__(self, model, X, Y=None, dual=None, **kwargs):
    assert (Y is None) == (not self.supervised)

    WBW = T.dot(model.W.T * model.beta, model.W)
    target = T.identity_like(WBW)
    err = WBW - target
    penalty = T.sqr(err).sum()

    basic_cost = - model.log_likelihood(X).mean() + self.constraint_coeff * penalty

    if self.use_admm:
        if dual is None:
            if not hasattr(model, 'dual'):
                model.dual = sharedX(np.zeros((model.nhid, model.nhid)), 'lambda')
            dual = model.dual
        augmented_lagrangian = basic_cost + (dual * err).sum()
        return augmented_lagrangian
    else:
        return basic_cost

    assert False  # should be unreached
def get_gradients(self, model, X, Y=None, **kwargs):
    assert 'dual' not in kwargs

    updates = {}

    if self.use_admm:
        rho = self.constraint_coeff * 2.
        dual = model.dual
        WBW = T.dot(model.W.T * model.beta, model.W)
        target = T.identity_like(WBW)
        err = WBW - target
        new_dual = dual + rho * err
        new_dual = block_gradient(new_dual)
        kwargs['dual'] = new_dual
        updates[dual] = new_dual

    cost = self(model, X, Y, **kwargs)

    params = model.get_params()
    assert not isinstance(params, set)

    return dict(zip(params, T.grad(cost, params))), updates
def _build_conditional(self, Xnew, pred_noise, diag):
    Xs, y, sigma = self.Xs, self.y, self.sigma

    # Old points
    X = cartesian(*Xs)
    delta = y - self.mean_func(X)
    Kns = [f(x) for f, x in zip(self.cov_funcs, Xs)]
    eigs_sep, Qs = zip(*map(eigh, Kns))  # Unzip
    QTs = list(map(tt.transpose, Qs))
    eigs = kron_diag(*eigs_sep)  # Combine separate eigs
    if sigma is not None:
        eigs += sigma**2

    # New points
    Km = self.cov_func(Xnew, diag=diag)
    Knm = self.cov_func(X, Xnew)
    Kmn = Knm.T

    # Build conditional mu
    alpha = kron_dot(QTs, delta)
    alpha = alpha / eigs[:, None]
    alpha = kron_dot(Qs, alpha)
    mu = tt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)

    # Build conditional cov
    A = kron_dot(QTs, Knm)
    A = A / tt.sqrt(eigs[:, None])
    if diag:
        Asq = tt.sum(tt.square(A), 0)
        cov = Km - Asq
        if pred_noise:
            cov += sigma
    else:
        Asq = tt.dot(A.T, A)
        cov = Km - Asq
        if pred_noise:
            cov += sigma * tt.identity_like(cov)
    return mu, cov
def stabilize(K):
    """ adds small diagonal to a covariance matrix """
    return K + 1e-6 * tt.identity_like(K)
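# A brief numpy illustration of why this jitter helps (the matrix below is an
# assumption): a rank-deficient covariance makes the Cholesky factorization fail,
# while adding a small diagonal term restores positive definiteness.
import numpy as np

v = np.ones((3, 1))
K = v.dot(v.T)                              # rank-1, only positive semi-definite
try:
    np.linalg.cholesky(K)
except np.linalg.LinAlgError:
    print("Cholesky failed on the unstabilized matrix")
np.linalg.cholesky(K + 1e-6 * np.eye(3))    # succeeds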
def __call__(self, loss):
    loss += K.sum(K.square(self.p.dot(self.p.T) - T.identity_like(self.p))) * self.strength
    return loss