def get_model(data, K, alpha, sigma, sigma2, eta, *args, **kargs):
    """Assemble the mixture model with LKJ-correlated cluster locations.

    ``data`` is pivoted (rows ``MUTID``, columns ``SAMPLEID``) for the
    columns ``r``, ``R`` and ``VAF0``; each pivot gets a trailing singleton
    axis.  Mixture log-weights come from ``aux.stick_breaking_log`` applied
    to ``K - 1`` Beta(1, alpha) variables.  ``sigma2`` and the extra
    positional/keyword arguments are accepted but not used.

    Returns the constructed model object.
    """
    def _pivot3d(column):
        # MUTID x SAMPLEID table for one column, with a trailing length-1 axis.
        wide = data.pivot(index='MUTID', columns='SAMPLEID', values=column)
        return wide.values[:, :, None]

    r_obs = _pivot3d('r')
    r_tot = _pivot3d('R')
    vaf_base = _pivot3d('VAF0')

    n_samp = data.SAMPLEID.nunique()
    corr_idx = aux.corr_vector_to_matrix_indices(n_samp)
    scale = tns.eye(n_samp) * sigma**2

    with pmc.Model() as model:
        sticks = pmc.Beta('u', 1.0, alpha, shape=K - 1)
        log_w = pmc.Deterministic('lw', aux.stick_breaking_log(sticks))

        # Expand the LKJ vector into a full matrix with a unit diagonal.
        corr_vec = pmc.LKJCorr('C', eta=eta, n=n_samp)
        corr = tns.fill_diagonal(corr_vec[corr_idx], 1.0)
        cov = scale.dot(corr)

        psi = pmc.MvNormal('psi', mu=nmp.zeros(n_samp), cov=cov,
                           shape=(K, n_samp))
        phi = pmc.Deterministic('phi', pmc.invlogit(psi.T))
        theta = pmc.Deterministic('theta', vaf_base * phi[None, :, :])

        pmc.DensityDist('r', aux.binmixND_logp_fcn(r_tot, theta, log_w),
                        observed=r_obs)
    return model
def Kcost_nesterov(self, learning_rate=1e-2, epsilon=1, gamma=0.9):
    """Build the cost and update list for SGD with Nesterov momentum.

    The cost is evaluated at the look-ahead point ``param - gamma * v`` and
    differentiated with respect to that look-ahead expression.

    Returns ``(cost, updates)`` where ``updates`` holds the parameter
    updates followed by the velocity updates.
    """
    print('Using Nesterov with gamma = %f, learning rate = %f, epsilon = %f'
          % (gamma, learning_rate, epsilon))

    # Zero-initialised velocities; cast to float32 when self.gpu is truthy.
    use_f32 = self.gpu
    w_zeros = np.zeros(self.W.eval().shape)
    b_zeros = np.zeros(self.b.eval().shape)
    if use_f32:
        w_zeros = w_zeros.astype(np.float32)
        b_zeros = b_zeros.astype(np.float32)
    vW = theano.shared(w_zeros)
    vb = theano.shared(b_zeros)

    # Look-ahead parameters.
    ahead_W = self.W - gamma * vW
    ahead_b = self.b - gamma * vb

    activation = T.dot(self.x, T.fill_diagonal(ahead_W, 0)) + ahead_b
    cost = T.mean(T.exp((0.5 - self.x) * activation)) * epsilon

    grads = [T.grad(cost, ahead_W), T.grad(cost, ahead_b)]
    velocities = [vW, vb]

    velocity_updates = []
    for vel, grad in zip(velocities, grads):
        velocity_updates.append((vel, gamma * vel + learning_rate * grad))

    param_updates = []
    for param, vel in zip(self.params, velocities):
        param_updates.append((param, param - vel))

    return cost, param_updates + velocity_updates
def get_model(x, r, R, vaf0, K=10):
    """Assemble the mixture model with a matrix-normal prior on ``mu_psi``.

    ``r``, ``R`` and ``vaf0`` each receive a trailing singleton axis; the
    number of samples is read from ``r.shape[1]`` before that.  The row
    covariance comes from ``aux.cov_quad_exp(x, 1.0, rho)`` and the column
    covariance from an LKJ correlation vector expanded to a ``K x K``
    matrix with a unit diagonal.

    Returns the constructed model object.
    """
    n_samp = r.shape[1]
    r_obs = r[:, :, None]
    r_tot = R[:, :, None]
    vaf_base = vaf0[:, :, None]
    corr_idx = aux.corr_vector_to_matrix_indices(K)

    with pmc.Model() as model:
        weights = pmc.Dirichlet('w', nmp.ones(K))
        log_w = tns.log(weights)

        rho = pmc.Gamma('rho', 1.0, 1.0)
        corr_vec = pmc.LKJCorr('C', eta=2.0, n=K)
        col_cov = tns.fill_diagonal(corr_vec[corr_idx], 1.0)
        row_cov = aux.cov_quad_exp(x, 1.0, rho)

        mu_psi = pmc.MatrixNormal('mu_psi', mu=nmp.zeros((n_samp, K)),
                                  rowcov=row_cov, colcov=col_cov,
                                  shape=(n_samp, K))
        psi = pmc.Normal('psi', mu=mu_psi, sd=0.1, shape=(n_samp, K))
        phi = pmc.Deterministic('phi', pmc.invlogit(psi))
        theta = pmc.Deterministic('theta', vaf_base * phi[None, :, :])

        pmc.DensityDist('r', aux.binmixND_logp_fcn(r_tot, theta, log_w),
                        observed=r_obs)
    return model
def cost_var(X, Y, sigma, Adj, l_kl, l_e, l_c, l_r, r_eps):
    """Per-row cost: weighted KL, edge-contraction, compression and repulsion.

    Each ``l_*`` weight is divided by the sum of all four weights before
    being applied.  Probabilities are clamped from below by the
    module-level ``epsilon`` inside the logarithm only.
    """
    N = X.shape[0]
    num_edges = 0.5 * T.sum(Adj)
    l_sum = l_kl + l_e + l_c + l_r

    p_ij = p_ij_sym_var(p_ij_conditional_var(X, sigma))
    q_ij = q_ij_student_t_var(Y)
    log_ratio = T.log(T.maximum(p_ij, epsilon) / T.maximum(q_ij, epsilon))

    # Kullback-Leibler term.
    kl = T.sum(p_ij * log_ratio, axis=1)

    # Edge contraction term, summed over adjacent pairs.
    edge_contraction = (1 / (2 * num_edges)) * T.sum(
        Adj * sqeuclidean_var(Y), axis=1)

    # Compression term.
    compression = (1 / (2 * N)) * T.sum(Y**2, axis=1)

    # Repulsion term: negative log distance, diagonal entries zeroed.
    log_dist = T.fill_diagonal(T.log(euclidean_var(Y) + r_eps), 0)
    repulsion = -(1 / (2 * N**2)) * T.sum(log_dist, axis=1)

    cost = (l_kl / l_sum) * kl + (l_e / l_sum) * edge_contraction + (
        l_c / l_sum) * compression + (l_r / l_sum) * repulsion
    return cost
def _results_inner(self, n, x):
    """Scan over ``x`` and return ``(n - 1) * log(det(M))`` for every entry.

    ``M`` is ``x_in[self.tri_index]`` multiplied by the mask
    ``self.tri_index != -1``, with its diagonal set to 1.
    """
    def _scaled_logdet(x_in):
        masked = x_in[self.tri_index] * (self.tri_index != -1)
        corr = tt.fill_diagonal(masked, 1)
        return (n - 1.) * tt.log(tt.nlinalg.det(corr))

    out, _ = theano.scan(_scaled_logdet, sequences=[x])
    return out
def softmax_neg(self, X):
    """Row-wise softmax of ``X`` with some entries forced to zero weight.

    If the instance has a ``hack_matrix`` attribute, ``X`` and the
    exponentials are both multiplied by it; otherwise the diagonal of the
    exponentials is set to 0.
    """
    has_mask = hasattr(self, 'hack_matrix')
    if has_mask:
        X = X * self.hack_matrix

    # Subtract the row maximum before exponentiating.
    shifted_exp = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))

    if has_mask:
        weights = shifted_exp * self.hack_matrix
    else:
        weights = T.fill_diagonal(shifted_exp, 0)

    row_totals = weights.sum(axis=1).dimshuffle(0, 'x')
    return weights / row_totals
def cost_var(X, Y, sigma, l_kl, l_c, l_r, r_eps):
    """Per-row cost: weighted KL, compression and repulsion terms.

    Each ``l_*`` weight is divided by the sum of the three weights before
    being applied.  Probabilities are clamped from below by the
    module-level ``epsilon`` inside the logarithm only.
    """
    N = X.shape[0]
    l_sum = l_kl + l_c + l_r

    p_ij = p_ij_sym_var(p_ij_conditional_var(X, sigma))
    q_ij = q_ij_student_t_var(Y)
    log_ratio = T.log(T.maximum(p_ij, epsilon) / T.maximum(q_ij, epsilon))

    # Kullback-Leibler term.
    kl = T.sum(p_ij * log_ratio, axis=1)

    # Compression term.
    compression = (1 / (2 * N)) * T.sum(Y**2, axis=1)

    # Repulsion term: negative log distance, diagonal entries zeroed.
    log_dist = T.fill_diagonal(T.log(euclidean_var(Y) + r_eps), 0)
    repulsion = -(1 / (2 * N**2)) * T.sum(log_dist, axis=1)

    cost = (l_kl / l_sum) * kl + (l_c / l_sum) * compression + (
        l_r / l_sum) * repulsion
    return cost
def logp(self, x):
    """Bounded log-probability of ``x``, for one or several matrices.

    When ``self.s > 1`` the per-matrix terms are produced by
    ``self._results_inner`` and positive-definiteness is checked with
    ``self._check_pos_def``; otherwise a single matrix is built from
    ``x[self.tri_index]`` and checked with ``matrix_pos_def``.
    """
    n, p, s = self.n, self.p, self.s

    if s > 1:
        X = self._x_creation(x)
        result = self._normalizing_constant(n, p, s) + self._results_inner(n, x)
        pos_def = self._check_pos_def(x)
    else:
        X = tt.fill_diagonal(x[self.tri_index], 1)
        result = self._normalizing_constant(n, p, s)
        result += (n - 1.) * tt.log(tt.nlinalg.det(X))
        pos_def = matrix_pos_def(X)

    return pm.dist_math.bound(result,
                              tt.all(X <= 1), tt.all(X >= -1),
                              pos_def,
                              n > 0)
def fill_correlation_matrix(c_vec):
    """
    Create a Theano tensor object representing a correlation matrix of a
    multivariate normal distribution.

    The vector is read in row-major upper-triangular order, mirrored into
    the lower triangle, and the diagonal is set to 1.

    The matrix dimension ``n`` is recovered from the vector length
    ``n * (n - 1) / 2``.  Using the vector length itself as the dimension is
    only correct for 3x3 matrices.

    :param c_vec: PyMC3 model variable corresponding to the `LKJCorr` prior
        on elements of the correlation matrix; its ``tag.test_value`` is
        used to read the vector length
    :return: correlation matrix as a Theano tensor object
    :raises ValueError: if the vector length is not a positive triangular
        number ``n * (n - 1) / 2``
    """
    n_elem = int(c_vec.tag.test_value.shape[0])

    # Invert n_elem = n * (n - 1) / 2 for the matrix dimension n.
    n = int(round((1 + np.sqrt(1 + 8 * n_elem)) / 2))
    if n_elem < 1 or n * (n - 1) // 2 != n_elem:
        raise ValueError(
            'c_vec has %d elements, which is not n * (n - 1) / 2 for any '
            'matrix dimension n >= 2' % n_elem)

    # tri_index[i, j] holds the position in c_vec of the (i, j) correlation.
    rows, cols = np.triu_indices(n, k=1)
    tri_index = np.zeros((n, n), dtype=int)
    tri_index[rows, cols] = np.arange(n_elem)
    tri_index[cols, rows] = np.arange(n_elem)

    return tt.fill_diagonal(c_vec[tri_index], 1.)
def Kcost_momentum(self, learning_rate=1e-2, epsilon=1, gamma=0.9):
    """Build the cost and update list for SGD with classical momentum.

    Returns ``(cost, updates)`` where ``updates`` holds the parameter
    updates followed by the velocity updates.
    """
    print('Using Momentum with gamma = %f, learning rate = %f, epsilon = %f'
          % (gamma, learning_rate, epsilon))

    activation = T.dot(self.x, T.fill_diagonal(self.W, 0)) + self.b
    cost = T.mean(T.exp((0.5 - self.x) * activation)) * epsilon
    grads = T.grad(cost, self.params)

    # Zero-initialised velocities; cast to float32 when self.gpu is truthy.
    use_f32 = self.gpu
    w_zeros = np.zeros(self.W.eval().shape)
    b_zeros = np.zeros(self.b.eval().shape)
    if use_f32:
        w_zeros = w_zeros.astype(np.float32)
        b_zeros = b_zeros.astype(np.float32)
    velocities = [theano.shared(w_zeros), theano.shared(b_zeros)]

    velocity_updates = []
    for vel, grad in zip(velocities, grads):
        velocity_updates.append((vel, gamma * vel + learning_rate * grad))

    param_updates = []
    for param, vel in zip(self.params, velocities):
        param_updates.append((param, param - vel))

    return cost, param_updates + velocity_updates
def Kcost_adagrad(self, learning_rate=1e-2, epsilon=1, smoothingterm=1):
    """Build the cost and update list for SGD with Adagrad scaling.

    One shared accumulator of squared gradients is created per parameter.
    The parameter step divides by the square root of the accumulator plus
    ``smoothingterm``.

    Returns ``(cost, updates)`` where ``updates`` holds the parameter
    updates followed by the accumulator updates.
    """
    print('Using Adagrad with smoothing term = %.9f, learning rate = %f, epsilon = %f'
          % (smoothingterm, learning_rate, epsilon))

    accumulators = []
    for param in self.params:
        zeros = np.zeros(param.get_value().shape, dtype=theano.config.floatX)
        accumulators.append(
            theano.shared(zeros, borrow=True, name='grad_hist_' + param.name))

    activation = T.dot(self.x, T.fill_diagonal(self.W, 0)) + self.b
    cost = T.mean(T.exp((0.5 - self.x) * activation)) * epsilon
    grads = T.grad(cost, self.params)

    accumulator_updates = []
    for acc, grad in zip(accumulators, grads):
        accumulator_updates.append((acc, acc + grad ** 2))

    param_updates = []
    for param, acc, grad in zip(self.params, accumulators, grads):
        step = learning_rate * grad / (T.sqrt(acc + smoothingterm))
        param_updates.append((param, param - step))

    return cost, param_updates + accumulator_updates
def logp(self, x):
    """Bounded log-probability of the correlation vector ``x``.

    ``x`` is expanded through ``self.tri_index`` into a matrix whose
    diagonal is set to 1; the result is the normalizing constant plus
    ``(n - 1) * log(det(X))``.
    """
    n, p = self.n, self.p

    corr = T.fill_diagonal(x[self.tri_index], 1)
    log_density = self._normalizing_constant(n, p)
    log_density += (n - 1.) * T.log(det(corr))

    return bound(log_density,
                 T.all(corr <= 1), T.all(corr >= -1),
                 n > 0)
def logp(self, x):
    """Bounded log-probability of the correlation vector ``x``.

    ``x`` is expanded through ``self.tri_index`` into a matrix whose
    diagonal is set to 1; the result is the normalizing constant plus
    ``(n - 1) * log(det(X))``.
    """
    n, p = self.n, self.p

    corr = T.fill_diagonal(x[self.tri_index], 1)
    log_density = self._normalizing_constant(n, p)
    log_density += (n - 1.0) * T.log(det(corr))

    in_range = (T.all(corr <= 1), T.all(corr >= -1))
    return bound(log_density, in_range[0], in_range[1], n > 0)
def p_ij_conditional_var(X, sigma):
    """Row-normalised Gaussian affinities computed from ``X ** 2``.

    Each row is scaled by ``2 * sigma ** 2`` (reshaped to a column), the
    diagonal is zeroed, and every row is divided by its sum.
    """
    N = X.shape[0]
    two_var = (2 * (sigma**2)).reshape((N, 1))
    affinities = T.fill_diagonal(T.exp(-(X**2) / two_var), 0)
    row_totals = T.sum(affinities, axis=1).reshape((N, 1))
    # No guard against a zero row sum.
    return affinities / row_totals
def logp(self, x):
    """Bounded log-probability of the correlation vector ``x``.

    Same density as the plain variant, with an additional
    ``matrix_pos_def`` condition on the expanded matrix.
    """
    n, p = self.n, self.p

    corr = tt.fill_diagonal(x[self.tri_index], 1)
    log_density = self._normalizing_constant(n, p)
    log_density += (n - 1.) * tt.log(det(corr))

    return bound(log_density,
                 tt.all(corr <= 1), tt.all(corr >= -1),
                 matrix_pos_def(corr),
                 n > 0)
def _results_inner(self, n, p, s, x):
    """Return ``s`` times the normalizing constant plus per-entry log-det terms.

    The second term is obtained by scanning over ``x`` and computing
    ``(n - 1) * log(det(M))`` where ``M`` is ``x_min[self.tri_index]`` with
    its diagonal set to 1.
    """
    def _scaled_logdet(x_min):
        corr = tt.fill_diagonal(x_min[self.tri_index], 1)
        return (n - 1) * tt.log(tt.nlinalg.det(corr))

    constant_part = self._normalizing_constant(n, p) * s
    logdet_part, _ = theano.scan(_scaled_logdet, sequences=[x])
    return constant_part + logdet_part
def loss_forcedsymmetry(self, learning_rate=1e-2, epsilon=1):
    """Build the cost and plain-SGD updates with a symmetrised ``W`` step.

    The updated ``W`` is the average of the gradient step and its
    transpose, with the diagonal set to 0.

    Returns ``(cost, updates)``.
    """
    activation = T.dot(self.x, self.W) + self.b
    cost = T.mean(T.exp((0.5 - self.x) * activation)) * epsilon

    Wgrad = T.grad(cost, self.W)
    bgrad = T.grad(cost, self.b)

    stepped_W = self.W - learning_rate * Wgrad
    new_W = T.fill_diagonal(0.5 * (stepped_W + stepped_W.T), 0)
    new_b = self.b - learning_rate * bgrad

    return cost, [(self.W, new_W), (self.b, new_b)]
def logp(self, x):
    """Bounded LKJ log-probability of the correlation vector ``x``.

    The result is ``_lkj_normalizing_constant(eta, n)`` plus
    ``(eta - 1) * log(det(X))``, where ``X`` is ``x`` expanded through
    ``self.tri_index`` with a unit diagonal.
    """
    n, eta = self.n, self.eta

    corr = tt.fill_diagonal(x[self.tri_index], 1)
    log_density = _lkj_normalizing_constant(eta, n)
    log_density += (eta - 1.) * tt.log(det(corr))

    return bound(log_density,
                 tt.all(corr <= 1), tt.all(corr >= -1),
                 matrix_pos_def(corr),
                 eta > 0,
                 broadcast_conditions=False)
def logp(self, x):
    """Bounded LKJ log-probability of the correlation vector ``x``.

    The result is ``_lkj_normalizing_constant(eta, n)`` plus
    ``(eta - 1) * log(det(X))``, where ``X`` is ``x`` expanded through
    ``self.tri_index`` with a unit diagonal.
    """
    n, eta = self.n, self.eta

    corr = tt.fill_diagonal(x[self.tri_index], 1)
    log_density = _lkj_normalizing_constant(eta, n)
    log_density += (eta - 1.) * tt.log(det(corr))

    conditions = (
        tt.all(corr <= 1),
        tt.all(corr >= -1),
        matrix_pos_def(corr),
        eta > 0,
    )
    return bound(log_density, *conditions, broadcast_conditions=False)
def p_Xp_given_X_var(X, sigma, metric):
    """Row-normalised Gaussian affinities with a zeroed diagonal.

    ``metric`` selects the squared distances: ``'euclidean'`` calls
    ``sqeuclidean_var(X)``, ``'precomputed'`` uses ``X ** 2``.  Any other
    value raises ``Exception('Invalid metric')``.
    """
    N = X.shape[0]

    if metric == 'euclidean':
        sq_dist = sqeuclidean_var(X)
    elif metric == 'precomputed':
        sq_dist = X**2
    else:
        raise Exception('Invalid metric')

    two_var = (2 * (sigma**2)).reshape((N, 1))
    affinities = T.fill_diagonal(T.exp(-sq_dist / two_var), 0)
    row_totals = T.sum(affinities, axis=1).reshape((N, 1))
    # No guard against a zero row sum.
    return affinities / row_totals
def p_Xp_given_X_var(X, sigma, metric):
    """Row-normalised Gaussian affinities with a zeroed diagonal.

    ``metric`` selects the squared distances: ``'euclidean'`` calls
    ``sqeuclidean_var(X)``, ``'precomputed'`` uses ``X ** 2``.  Any other
    value raises ``Exception('Invalid metric')``.
    """
    N = X.shape[0]

    if metric == 'euclidean':
        sq_dist = sqeuclidean_var(X)
    elif metric == 'precomputed':
        sq_dist = X**2
    else:
        raise Exception('Invalid metric')

    column = (N, 1)
    kernel = T.exp(-sq_dist / ((2 * (sigma**2)).reshape(column)))
    kernel_zd = T.fill_diagonal(kernel, 0)
    # No guard against a zero row sum.
    return kernel_zd / T.sum(kernel_zd, axis=1).reshape(column)
def calc_original_cond_prob(X, sigma, metric):
    """Row-normalised Gaussian affinities with a zeroed diagonal.

    ``metric`` selects the distances: ``'euclidean'`` calls
    ``calc_euclidean_norms(X)``, ``'precomputed'`` uses ``X ** 2``.  Any
    other value raises ``Exception('Invalid metric')``.
    """
    N = X.shape[0]

    if metric == 'euclidean':
        dist = calc_euclidean_norms(X)
    elif metric == 'precomputed':
        dist = X**2
    else:
        raise Exception('Invalid metric')

    two_var = (2 * (sigma**2)).reshape((N, 1))
    affinities = T.fill_diagonal(T.exp(-dist / two_var), 0)
    row_totals = T.sum(affinities, axis=1).reshape((N, 1))
    # No guard against a zero row sum.
    return affinities / row_totals
def pm_make_cov(sigma_priors, corr_coeffs, ndim):
    """Assemble a covariance matrix from standard deviations and correlations.

    Computes ``diag(sigma) * Corr * diag(sigma)``.  ``Corr`` is built by
    indexing ``corr_coeffs`` with a symmetric lookup table and setting its
    diagonal to 1.
    """
    # Citation: AM 207 lecture notes: http://am207.info/wiki/corr.html
    # A symmetric ndim x ndim matrix has ndim * (ndim - 1) / 2 distinct
    # off-diagonal elements.
    n_pairs = int(ndim * (ndim - 1) / 2)

    # lookup[i, j] == lookup[j, i] == position of the (i, j) coefficient.
    rows, cols = np.triu_indices(ndim, k=1)
    lookup = np.zeros([ndim, ndim], dtype=int)
    lookup[rows, cols] = np.arange(n_pairs)
    lookup[cols, rows] = np.arange(n_pairs)

    corr = tt.fill_diagonal(corr_coeffs[lookup], 1)
    scale = tt.nlinalg.diag(sigma_priors)
    return tt.nlinalg.matrix_dot(scale, corr, scale)
def Kcost(self, learning_rate=0.01):
    """Build the cost and plain-SGD updates with a symmetrised ``W`` step.

    The updated ``W`` is the average of the gradient step and its
    transpose, with the diagonal set to 0.

    Returns ``(cost, updates)``.
    """
    activation = T.dot(self.x, self.W) + self.b
    cost = T.mean(T.exp((0.5 - self.x) * activation))

    Wgrad = T.grad(cost, self.W)
    bgrad = T.grad(cost, self.b)

    stepped_W = self.W - learning_rate * Wgrad
    new_W = T.fill_diagonal(0.5 * (stepped_W + stepped_W.T), 0)
    new_b = self.b - learning_rate * bgrad

    return cost, [(self.W, new_W), (self.b, new_b)]
def logp(self, x):
    """Bounded log-probability of ``x``, for one or several matrices.

    When ``self.s != 1`` the matrices and the density terms come from
    ``self._X_inner_creation`` and ``self._results_inner``; otherwise a
    single matrix is built from ``x[self.tri_index]`` with a unit diagonal.
    """
    n, p, s = self.n, self.p, self.s

    if s != 1:
        X = self._X_inner_creation(x)
        result = self._results_inner(n, p, s, x)
    else:
        X = tt.fill_diagonal(x[self.tri_index], 1)
        result = self._normalizing_constant(n, p)
        result += (n - 1.) * tt.log(tt.nlinalg.det(X))

    return pm.dist_math.bound(result,
                              tt.all(X <= 1), tt.all(X >= -1),
                              n > 0)
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    ``inputs[0]`` is the matrix being factorised and ``gradients[0]`` the
    gradient with respect to the factor.  Returns a one-element list with
    the gradient with respect to the input.  If any diagonal entry of the
    factor is not strictly positive, the returned gradient is NaN.

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # `ok` is true only when every diagonal entry of the factor is positive.
    ok = tt.all(tt.nlinalg.diag(chol_x) > 0)
    # When not ok, substitute placeholder values so the expressions below
    # are built on a factor with a unit diagonal; the final switch discards
    # that result.
    chol_x = tt.switch(ok, chol_x, tt.fill_diagonal(chol_x, 1))
    dz = tt.switch(ok, dz, floatX(1))

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tt.tril(mtx) - tt.diag(tt.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        solve = tt.slinalg.Solve(A_structure="upper_triangular")
        return solve(outer.T, solve(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    # Symmetrise s and keep only the triangle matching self.lower; the
    # diagonal of s is subtracted once because s + s.T counts it twice.
    if self.lower:
        grad = tt.tril(s + s.T) - tt.diag(tt.diagonal(s))
    else:
        grad = tt.triu(s + s.T) - tt.diag(tt.diagonal(s))

    # NaN gradient when the positivity check failed.
    return [tt.switch(ok, grad, floatX(np.nan))]
def cost(self, lr=1e-2, epsilon=1):
    """Build the cost and plain-SGD updates, keeping ``W`` symmetric.

    Prints a banner with the input size, temperature and learning rate.
    The updated ``W`` is the average of the gradient step and its
    transpose, with the diagonal set to 0.

    Returns ``(cost, updates)``.
    """
    rule = 51 * '='
    print(rule)
    print(24 * '#' + 'MPF' + 24 * '#')
    print(rule)
    print('Input size: {0}'.format(self.n))
    print('Learning temperature: {0}'.format(self.temperature))
    print('Learning rate: {0}'.format(lr))

    activation = T.dot(self.x, self.W) + self.b
    cost = epsilon * T.mean(
        T.exp((0.5 - self.x) * activation / self.temperature))

    Wgrad = T.grad(cost, self.W)
    bgrad = T.grad(cost, self.b)

    stepped_W = self.W - lr * Wgrad
    new_W = T.fill_diagonal(0.5 * (stepped_W + stepped_W.T), 0)
    new_b = self.b - lr * bgrad

    return cost, [(self.W, new_W), (self.b, new_b)]
def _X_inner_creation(self, x):
    """Scan over ``x`` and expand every entry into a unit-diagonal matrix.

    Each entry is indexed with ``self.tri_index`` and multiplied by the
    mask ``self.tri_index != -1`` before the diagonal is set to 1.
    """
    def _expand(x_min):
        # Positions where tri_index is -1 are multiplied by 0.
        masked = x_min[self.tri_index] * (self.tri_index != -1)
        return tt.fill_diagonal(masked, 1)

    matrices, _ = theano.scan(_expand, sequences=[x])
    return matrices
def logp(self, x):
    """Log-density of ``x`` under a normal with block-equicorrelated correlation.

    Closed-form determinant and inverse of each equicorrelated block are
    used (see the derivation below) instead of a general matrix inverse.
    When ``self.nonzero`` is truthy, the log-density is replaced by 0 for
    every row of ``x`` that contains an exact zero.  The result is bounded
    by ``-1 < corr < 1`` and ``std > 0``.
    """
    # -1/2 (x-mu) @ Sigma^-1 @ (x-mu)^T - 1/2 log(2pi^k|Sigma|)
    # Sigma = diag(std) @ Corr @ diag(std)
    # Sigma^-1 = diag(std^-1) @ Corr^-1 @ diag(std^-1)
    # Corr is a block matrix of special form
    #           +----------+
    # Corr = [[ | 1, b1, b1|,  0,  0,  0,..., 0]
    #         [ |b1,  1, b1|,  0,  0,  0,..., 0]
    #         [ |b1, b1,  1|,  0,  0,  0,..., 0]
    #           +-----------+----------+
    #         [  0,  0,  0, | 1, b2, b2|,..., 0]
    #         [  0,  0,  0, |b2,  1, b2|,..., 0]
    #         [  0,  0,  0, |b2, b2,  1|,..., 0]
    #                       +----------+
    #         [            ...                 ]
    #         [  0,  0,  0,   0,  0,  0 ,..., 1]]
    #
    # Corr = [[B1,  0,  0, ...,  0]
    #         [ 0, B2,  0, ...,  0]
    #         [ 0,  0, B3, ...,  0]
    #         [        ...        ]
    #         [ 0,  0,  0, ..., Bk]]
    #
    # Corr^-1 = [[B1^-1,     0,     0, ...,     0]
    #            [    0, B2^-1,     0, ...,     0]
    #            [    0,     0, B3^-1, ...,     0]
    #            [              ...              ]
    #            [    0,     0,     0, ..., Bk^-1]]
    #
    # |B| matrix of rank r is easy
    # https://math.stackexchange.com/a/1732839
    # Let D = eye(r) * (1-b)
    # Then B = D + b * ones((r, r))
    # |B| = (1-b) ** r + b * r * (1-b) ** (r-1)
    # |B| = (1.-b) ** (r-1) * (1. + b * (r - 1))
    # log(|B|) = log(1-b)*(r-1) + log1p(b*(r-1))
    #
    # Inverse B^-1 is easy as well
    # https://math.stackexchange.com/a/1766118
    # let
    # c = 1/b + r*1/(1-b)
    # (B^-1)ii = 1/(1-b) - 1/(c*(1-b)**2)
    # (B^-1)ij =         - 1/(c*(1-b)**2)
    #
    # assuming
    # z = (x - mu) / std
    # we have det fix
    # detfix = -sum(log(std))
    #
    # now we need to compute z @ Corr^-1 @ z^T
    # note that B can be unique per timestep
    # so we need z_t @ Corr_t^-1 @ z_t^T in perfect
    # z_t @ Corr_t^-1 @ z_t^T is a sum of block terms
    # quad = z_ct @ B_ct^-1 @ z_ct^T
    #      = (B^-1)_iict * sum(z_ct**2)
    #        + (B^-1)_ijct * sum_{i!=j}(z_ict * z_jct)
    #
    # finally all terms are computed explicitly
    # logp = detfix - 1/2 * ( quad + log(pi*2) * k + log(|B|) )

    x = tt.as_tensor_variable(x)
    # Unique cluster ids, the inverse mapping (cluster of each column) and
    # the size of each cluster.
    clust_ids, clust_pos, clust_counts = \
        tt.extra_ops.Unique(return_inverse=True, return_counts=True)(self.clust)
    # Permutation that sorts columns by cluster, making blocks contiguous.
    clust_order = tt.argsort(clust_pos)
    mu = self.mu
    corr = self.corr[..., clust_ids]
    std = self.std
    # Promote a scalar std to one value per column, then to a leading axis.
    if std.ndim == 0:
        std = tt.repeat(std, x.shape[-1])
    if std.ndim == 1:
        std = std[None, :]
    if corr.ndim == 1:
        corr = corr[None, :]
    z = (x - mu) / std
    z = z[..., clust_order]
    detfix = -tt.log(std).sum(-1)
    # following the notation above
    r = clust_counts
    b = corr
    # detB = (1.-b) ** (r-1) * (1. + b * (r - 1))
    logdetB = tt.log1p(-b) * (r - 1) + tt.log1p(b * (r - 1))
    c = 1 / b + r / (1. - b)
    invBij = -1. / (c * (1. - b)**2)
    invBii = 1. / (1. - b) + invBij
    # Expand the per-cluster values to one value per column.
    invBij = tt.repeat(invBij, clust_counts, axis=-1)
    invBii = tt.repeat(invBii, clust_counts, axis=-1)

    # to compute (Corr^-1)_ijt*sum_{i!=j}(z_it * z_jt)
    # we use masked cross products
    mask = tt.arange(x.shape[-1])[None, :]
    mask = tt.repeat(mask, x.shape[-1], axis=0)
    # mask[i, j] = max(i, j)
    mask = tt.maximum(mask, mask.T)
    # Exclusive end position of the block each column belongs to.
    block_end_pos = tt.cumsum(r)
    block_end_pos = tt.repeat(block_end_pos, clust_counts)
    # Keep (i, j) only when max(i, j) lies before the block end of both
    # column i and column j.
    mask = tt.lt(mask, block_end_pos)
    mask = tt.and_(mask, mask.T)
    # Drop the diagonal: those terms are handled by invBii below.
    mask = tt.fill_diagonal(mask.astype('float32'), 0.)  # type: tt.TensorVariable

    invBiizizi_sum = ((z**2) * invBii).sum(-1)
    invBijzizj_sum = (
        (z.dimshuffle(0, 1, 'x') * mask.dimshuffle('x', 0, 1) *
         z.dimshuffle(0, 'x', 1)) * invBij.dimshuffle(0, 1, 'x')).sum(
             [-1, -2])
    quad = invBiizizi_sum + invBijzizj_sum
    k = pm.floatX(x.shape[-1])
    logp = (detfix - .5 * (quad + pm.floatX(np.log(np.pi * 2)) * k +
                           logdetB.sum(-1)))
    if self.nonzero:
        # Rows containing an exact zero contribute a log-density of 0.
        logp = tt.switch(tt.eq(x, 0).any(-1), 0., logp)
    return bound(logp,
                 tt.gt(corr, -1.),
                 tt.lt(corr, 1.),
                 tt.gt(std, 0.),
                 broadcast_conditions=False)
def _X_inner_creation(self, x):
    """Scan over ``x`` and expand every entry into a unit-diagonal matrix.

    Each entry is indexed with ``self.tri_index`` and its diagonal set to 1.
    """
    def _expand(x_min):
        return tt.fill_diagonal(x_min[self.tri_index], 1)

    matrices, _ = theano.scan(_expand, sequences=[x])
    return matrices
def q_ij_student_t_var(Y):
    """Globally normalised ``1 / (1 + squared distance)`` affinities of ``Y``.

    The diagonal is zeroed before dividing by the sum over all entries.
    """
    kernel = 1 / (sqeuclidean_var(Y) + 1)
    kernel_zd = T.fill_diagonal(kernel, 0)
    total = kernel_zd.sum()
    return kernel_zd / total
def cov_funct_special(lkj, sigma, tri_index):
    """Scan over paired ``lkj``/``sigma`` entries and build covariances.

    For each pair the result is ``diag(s) . C . diag(s)`` where ``C`` is
    ``l[tri_index]`` masked by ``tri_index != -1`` with a unit diagonal.
    """
    def _one_cov(l, s):
        # Positions where tri_index is -1 are multiplied by 0.
        corr = tt.fill_diagonal(l[tri_index] * 1 * (tri_index != -1), 1)
        return tt.diag(s).dot(corr).dot(tt.diag(s))

    covs, _ = theano.scan(_one_cov, sequences=[lkj, sigma])
    return covs
def _check_pos_def(self, x):
    """Return whether every matrix expanded from ``x`` has positive eigenvalues.

    Each entry of ``x`` is indexed with ``self.tri_index``, masked by
    ``self.tri_index != -1`` and given a unit diagonal before ``eigh`` is
    applied.
    """
    def _all_eigs_positive(x_in):
        masked = x_in[self.tri_index] * (self.tri_index != -1)
        eigvals = tt.nlinalg.eigh(tt.fill_diagonal(masked, 1))[0]
        return tt.all(eigvals > 0)

    flags, _ = theano.scan(_all_eigs_positive, sequences=[x])
    return tt.all(flags)
def _x_creation(self, x):
    """Scan over ``x`` and expand every entry into a unit-diagonal matrix.

    Each entry is indexed with ``self.tri_index`` and multiplied by the
    mask ``self.tri_index != -1`` before the diagonal is set to 1.
    """
    def _expand(x_in):
        masked = x_in[self.tri_index] * (self.tri_index != -1)
        return tt.fill_diagonal(masked, 1)

    matrices, _ = theano.scan(_expand, sequences=[x])
    return matrices
def _Tcov(sigma, rho):
    """Build a 2x2 covariance matrix ``diag(sigma) . C . diag(sigma)``.

    ``C`` has ``rho`` off the diagonal and 1 on it.
    """
    corr = T.fill_diagonal(T.alloc(rho, 2, 2), 1.)
    scale = T.diag(sigma)
    return T.nlinalg.matrix_dot(scale, corr, scale)
def __init__(self,
             dimension,
             mu_data,
             tau_data,
             prior="Gaussian",
             parameters={
                 "location": None,
                 "scale": None,
                 "corr": False
             },
             hyper_alpha=None,
             hyper_beta=None,
             hyper_gamma=None,
             hyper_delta=None,
             transformation=None,
             parametrization="non-central",
             name='',
             model=None):
    """Declare the hierarchical model variables inside this model instance.

    Parameters
    ----------
    dimension : int
        Must be 3, 5 or 6.
    mu_data, tau_data
        Observed values and precision passed to the ``obs`` likelihood.
        ``len(mu_data) / dimension`` gives the number of sources.
    prior : {"Gaussian", "GMM"}
        Prior on ``source``.  "GMM" uses ``hyper_delta`` as Dirichlet
        concentration and one component per entry.
    parameters : dict
        Keys ``"location"``, ``"scale"`` and ``"corr"``.  A ``None``
        location/scale means the quantity is inferred (using
        ``hyper_alpha`` / ``hyper_beta``); a truthy ``"corr"`` adds an
        LKJ correlation prior with ``eta=hyper_gamma``.  The default dict
        is only read, never mutated.
    transformation : {"mas", "pc"}
        "mas" uses ``Iden``; "pc" picks a transformation by dimension.
        With dimension 5 and "pc" the internal dimension becomes 6.
    parametrization, name
        Accepted but not used in this method.
    model
        Forwarded to the parent constructor.

    Comparisons against string/int literals use ``==`` rather than ``is``:
    identity of equal literals is an implementation detail and fails for
    strings that are built at run time (e.g. read from a config file).
    """
    assert isinstance(dimension, int), "dimension must be integer!"
    assert dimension in [3, 5, 6], "Not a valid dimension!"
    D = dimension

    # Call the parent constructor first; the model name is "<D>D".
    super().__init__(str(D) + "D", model)

    #------------------- Data ---------------------------------------------
    N = int(len(mu_data) / D)
    if N == 0:
        sys.exit(
            "Data has length zero!. You must provide at least one data point"
        )

    #============= Transformations ========================================
    if transformation == "mas":
        Transformation = Iden
    elif transformation == "pc":
        if D == 3:
            Transformation = cartesianToSpherical
        elif D == 6:
            Transformation = phaseSpaceToAstrometry_and_RV
        elif D == 5:
            Transformation = phaseSpaceToAstrometry
            # From here on the model works with six dimensions.
            D = 6
    else:
        sys.exit("Transformation is not accepted")

    #================ Hyper-parameters ====================================
    # One set of hyper-parameters per mixture component (1 if no mixture).
    if hyper_delta is None:
        shape = 1
    else:
        shape = len(hyper_delta)

    #--------- Location ----------------------------------
    if parameters["location"] is None:
        location = [
            pm.Normal("loc_{0}".format(i),
                      mu=hyper_alpha[i][0],
                      sigma=hyper_alpha[i][1],
                      shape=shape) for i in range(D)
        ]
        #--------- Join variables --------------
        mu = pm.math.stack(location, axis=1)
    else:
        mu = parameters["location"]

    #------------- Scale --------------------------
    if parameters["scale"] is None:
        scale = [
            pm.Gamma("scl_{0}".format(i),
                     alpha=2.0,
                     beta=2.0 / hyper_beta[i][0],
                     shape=shape) for i in range(D)
        ]
    else:
        scale = parameters["scale"]

    #----------------------- Correlation ----------------------------------
    if parameters["corr"]:
        pm.LKJCorr('chol_corr', eta=hyper_gamma, n=D)
        # NOTE(review): the index matrix is all zeros, so every
        # off-diagonal entry reads element 0 of chol_corr -- confirm this
        # is intended rather than a triangular index lookup.
        C = tt.fill_diagonal(
            self.chol_corr[np.zeros((D, D), dtype=np.int64)], 1.)
    else:
        C = np.eye(D)

    #-------------------- Covariance -------------------------
    sigma_diag = pm.math.stack(scale, axis=1)
    cov = theano.shared(np.zeros((shape, D, D)))
    for i in range(shape):
        sigma = tt.nlinalg.diag(sigma_diag[i])
        covi = tt.nlinalg.matrix_dot(sigma, C, sigma)
        cov = tt.set_subtensor(cov[i], covi)

    #===================== True values ====================================
    if prior == "Gaussian":
        pm.MvNormal("source", mu=mu, cov=cov[0], shape=(N, D))
    elif prior == "GMM":
        pm.Dirichlet("weights", a=hyper_delta, shape=shape)
        comps = [
            pm.MvNormal.dist(mu=mu[i], cov=cov[i]) for i in range(shape)
        ]
        pm.Mixture("source",
                   w=self.weights,
                   comp_dists=comps,
                   shape=(N, D))
    else:
        sys.exit("The specified prior is not supported")

    #----------------------- Transformation -------------------------------
    transformed = Transformation(self.source)

    #------------ Flatten -------------------------------------------------
    true = pm.math.flatten(transformed)

    #----------------------- Likelihood -----------------------------------
    pm.MvNormal('obs', mu=true, tau=tau_data, observed=mu_data)
def covariance(sigma, rho):
    """Return the 2x2 matrix ``diag(sigma) . C . diag(sigma)``.

    ``C`` has ``rho`` off the diagonal and 1 on it.
    """
    corr = T.alloc(rho, 2, 2)
    corr = T.fill_diagonal(corr, 1.)
    scale = T.diag(sigma)
    return scale.dot(corr).dot(scale)
def q_ij_gaussian_var(Y):
    """Globally normalised ``exp(-squared distance)`` affinities of ``Y``.

    The diagonal is zeroed before dividing by the sum over all entries.
    """
    kernel = T.exp(-sqeuclidean_var(Y))
    kernel_zd = T.fill_diagonal(kernel, 0)
    total = kernel_zd.sum()
    return kernel_zd / total
# The LKJ prior yields only the upper-triangular correlation values.  To
# expand them into a full symmetric matrix we build an index matrix whose
# (i, j) and (j, i) entries both point at the same vector position.
n_elem = int(n_var * (n_var - 1) / 2)
_upper_rows, _upper_cols = np.triu_indices(n_var, k=1)
tri_index = np.zeros([n_var, n_var], dtype=int)
tri_index[_upper_rows, _upper_cols] = np.arange(n_elem)
tri_index[_upper_cols, _upper_rows] = np.arange(n_elem)

with pm.Model() as model:

    mu = pm.Normal('mu', mu=0, sd=1, shape=n_var)

    # Separate priors for the standard deviations and the correlations.
    sigma = pm.Uniform('sigma', shape=n_var)
    corr_triangle = pm.LKJCorr('corr', n=1, p=n_var)

    # Full correlation matrix with a unit diagonal.
    corr_matrix = tt.fill_diagonal(corr_triangle[tri_index], 1)

    # cov = diag(sigma) . corr . diag(sigma)
    cov_matrix = tt.diag(sigma).dot(corr_matrix.dot(tt.diag(sigma)))

    like = pm.MvNormal('likelihood', mu=mu, cov=cov_matrix, observed=dataset)


def run(n=1000):
    """Sample the model with NUTS started at the MAP estimate.

    ``n`` is the number of draws; the string ``"short"`` stands for 50.
    Returns the trace.
    """
    draws = 50 if n == "short" else n
    with model:
        map_estimate = pm.find_MAP()
        sampler = pm.NUTS(scaling=map_estimate)
        return pm.sample(draws, step=sampler, start=map_estimate)
# In order to convert the upper triangular correlation values to a complete
# correlation matrix, we need to construct an index matrix whose (i, j) and
# (j, i) entries both point at the same position of the LKJ vector:
n_elem = int(n_var * (n_var - 1) / 2)
tri_index = np.zeros([n_var, n_var], dtype=int)
tri_index[np.triu_indices(n_var, k=1)] = np.arange(n_elem)
tri_index[np.triu_indices(n_var, k=1)[::-1]] = np.arange(n_elem)

with Model() as model:

    mu = Normal('mu', mu=0, tau=1**-2, shape=n_var)

    # We can specify separate priors for sigma and the correlation matrix:
    sigma = Uniform('sigma', shape=n_var)
    corr_triangle = LKJCorr('corr', n=1, p=n_var)

    # Expand the vector to a full matrix and put ones on the diagonal.
    corr_matrix = corr_triangle[tri_index]
    corr_matrix = tt.fill_diagonal(corr_matrix, 1)

    # cov = diag(sigma) . corr . diag(sigma); the likelihood takes its inverse.
    cov_matrix = tt.diag(sigma).dot(corr_matrix.dot(tt.diag(sigma)))

    like = MvNormal('likelihood', mu=mu, tau=inv(cov_matrix), observed=dataset)


def run(n=1000):
    """Sample the model with NUTS started at the MAP estimate.

    ``n`` is the number of draws; the string ``"short"`` stands for 50.
    Returns the trace so the samples are not discarded.
    """
    if n == "short":
        n = 50
    with model:
        start = find_MAP()
        step = NUTS(scaling=start)
        tr = sample(n, step=step, start=start)
    return tr
def p_Yp_Y_var(Y):
    """Globally normalised ``1 / (1 + squared distance)`` affinities of ``Y``.

    The diagonal is zeroed before dividing by the sum over all entries.
    """
    kernel = 1 / (sqeuclidean_var(Y) + 1)
    kernel_zd = T.fill_diagonal(kernel, 0)
    # No guard against a zero total.
    return kernel_zd / kernel_zd.sum()
x = np.random.uniform(0, 10, size=N) y = np.random.normal(np.sin(x), np.sqrt(0.01)) plt.plot(x, y, 'o') plt.xlabel('$x$', fontsize=16) plt.ylabel('$f(x)$', fontsize=16, rotation=0) with pm.Model() as GP: mu = np.zeros(N) eta = pm.HalfCauchy('eta', 0.1) rho = pm.HalfCauchy('rho', 1) sigma = pm.HalfCauchy('sigma', 1) D = squared_distance(x, x) #SED(x,x) K = tt.fill_diagonal(eta * pm.math.exp(-rho * D), eta + sigma) #(K(x, x) + σ I) obs = pm.MvNormal('obs', mu, cov=K, observed=y) test_points = np.linspace(0, 10, 100) D_pred = squared_distance(test_points, test_points) #SED(x*,x*) D_off_diag = squared_distance(x, test_points) #SED(x,x*) n * N K_oo = eta * pm.math.exp(-rho * D_pred) #K(x*,x*) K_o = eta * pm.math.exp(-rho * D_off_diag) #K(x,x*) inv_K = tt.nlinalg.matrix_inverse(K) mu_post = pm.Deterministic('mu_post', pm.math.dot(pm.math.dot(K_o.T, inv_K), y)) SIGMA_post = pm.Deterministic(
def build_mod_bpmf_model(train, alpha=2, dim=10, std=0.01):
    r"""Build the modified BPMF model using pymc3.

    The original model uses Wishart priors on the covariance matrices.
    Unfortunately, the Wishart distribution in pymc3 is currently not
    suitable for sampling. This version decomposes the covariance matrix
    into:

        diag(sigma) \dot corr_matrix \dot diag(sigma)

    We use uniform priors on the standard deviations (sigma) and LKJCorr
    priors on the correlation matrices (corr_matrix):

        sigma ~ Uniform
        corr_matrix ~ LKJCorr(n=1, p=dim)

    :param train: 2-D float array of ratings; NaN entries are replaced by
        the mean of the non-NaN entries on a copy
    :param alpha: precision of the rating likelihood
    :param dim: number of latent features
    :param std: scale of the random initial test values
    :return: the pymc3 model
    """
    n, m = train.shape
    beta_0 = 1  # scaling factor for lambdas; unclear on its use

    # Mean value imputation on training data.
    train = train.copy()
    nan_mask = np.isnan(train)
    train[nan_mask] = train[~nan_mask].mean()

    # We will use separate priors for sigma and correlation matrix.
    # In order to convert the upper triangular correlation values to a
    # complete correlation matrix, we need to construct an index matrix.
    # Integer division keeps n_elem an int: np.random.randn rejects floats.
    n_elem = dim * (dim - 1) // 2
    tri_index = np.zeros([dim, dim], dtype=int)
    tri_index[np.triu_indices(dim, k=1)] = np.arange(n_elem)
    tri_index[np.triu_indices(dim, k=1)[::-1]] = np.arange(n_elem)

    logging.info('building the BPMF model')
    with pm.Model() as bpmf:
        # Specify user feature matrix
        sigma_u = pm.Uniform('sigma_u', shape=dim)
        corr_triangle_u = pm.LKJCorr(
            'corr_u', n=1, p=dim,
            testval=np.random.randn(n_elem) * std)

        corr_matrix_u = corr_triangle_u[tri_index]
        corr_matrix_u = t.fill_diagonal(corr_matrix_u, 1)
        cov_matrix_u = t.diag(sigma_u).dot(corr_matrix_u.dot(t.diag(sigma_u)))
        lambda_u = t.nlinalg.matrix_inverse(cov_matrix_u)

        mu_u = pm.Normal(
            'mu_u', mu=0, tau=beta_0 * t.diag(lambda_u), shape=dim,
            testval=np.random.randn(dim) * std)
        U = pm.MvNormal(
            'U', mu=mu_u, tau=lambda_u,
            shape=(n, dim), testval=np.random.randn(n, dim) * std)

        # Specify item feature matrix
        sigma_v = pm.Uniform('sigma_v', shape=dim)
        corr_triangle_v = pm.LKJCorr(
            'corr_v', n=1, p=dim,
            testval=np.random.randn(n_elem) * std)

        corr_matrix_v = corr_triangle_v[tri_index]
        corr_matrix_v = t.fill_diagonal(corr_matrix_v, 1)
        cov_matrix_v = t.diag(sigma_v).dot(corr_matrix_v.dot(t.diag(sigma_v)))
        lambda_v = t.nlinalg.matrix_inverse(cov_matrix_v)

        mu_v = pm.Normal(
            'mu_v', mu=0, tau=beta_0 * t.diag(lambda_v), shape=dim,
            testval=np.random.randn(dim) * std)
        V = pm.MvNormal(
            'V', mu=mu_v, tau=lambda_v,
            shape=(m, dim), testval=np.random.randn(m, dim) * std)

        # Specify rating likelihood function
        R = pm.Normal(
            'R', mu=t.dot(U, V.T), tau=alpha * np.ones((n, m)),
            observed=train)

    logging.info('done building the BPMF model')
    return bpmf
w = Dirichlet('w', a=pm.floatX(alpha), shape=(n_components, )) # Impose sparse structure onto mean with off-diagonal elements all being the same, because background should be the same throughout. mus_signal = MvNormal( 'mus_signal', mu=pm.floatX(signalMean_priorMean), tau=pm.floatX(np.eye(n_dimensions) / signalMean_priorSD**2), shape=n_dimensions) mus_background = MvNormal('mus_background', mu=pm.floatX(backgroundMean_priorMean), tau=pm.floatX( np.eye(n_dimensions) / backgroundMean_priorSD**2), shape=n_dimensions) mus = tt.fill_diagonal( tt.reshape(tt.tile(mus_background, n_components), (n_components, n_dimensions)), 0) + tt.eye(n_components, n_dimensions) * mus_signal # Impose structure for covariance as well, with off-diagonal elements being zero, just because that model is easier to fit. sigmas_signal = pm.Gamma('sigmas_signal', mu=pm.floatX(signalSD_priorMean), sd=pm.floatX(signalSD_priorSD), shape=n_dimensions) sigmas_background = pm.Gamma('sigmas_background', mu=pm.floatX(backgroundSD_priorMean), sd=pm.floatX(backgroundSD_priorSD), shape=n_dimensions) sigmas = tt.fill_diagonal( tt.reshape(tt.tile(sigmas_background, n_components), (n_components, n_dimensions)), 0) + tt.eye(n_components, n_dimensions) * sigmas_signal