def data_likelihood_exact(genotype, observed_alleles):
    """'Exact' data likelihood: sum of sampling probability * joint Q score
    for the observed alleles over all possible underlying 'true allele'
    combinations."""
    observation_count = len(observed_alleles)
    ploidy = sum(count for allele, count in genotype)
    allele_probs = [count / float(ploidy) for allele, count in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(
            observation_count, [x[0] for x in genotype]):
        for true_allele_permutation in multiset.permutations(
                true_allele_combination):
            # this mapping allows us to use sampling_prob the same way as we
            # do when we use JSON allele observation records
            true_alleles = [{'alt': allele}
                            for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            observations = []
            for allele, count in genotype:
                if allele in allele_groups:
                    observations.append(len(allele_groups[allele]))
                else:
                    observations.append(0)
            lnsampling_prob = multinomialln(allele_probs, observations)
            prob = lnsampling_prob + likelihood_given_true_alleles(
                observed_alleles, true_alleles)
            probs.append(prob)
    # sum the individual probabilities of all combinations in log space
    return logsumexp(probs)
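# The final logsumexp call sums probabilities that live in log space, where a
# naive sum of exponentials would underflow. A minimal, self-contained sketch
# of the identity, using scipy.special.logsumexp rather than whatever local
# helper the snippet above has in scope:
import numpy as np
from scipy.special import logsumexp

log_probs = np.array([-1000.0, -1001.0, -1002.0])  # exp() of these underflows

# logsumexp shifts by the max before exponentiating, so it stays finite
stable = logsumexp(log_probs)
manual = log_probs.max() + np.log(np.exp(log_probs - log_probs.max()).sum())
assert np.isclose(stable, manual)   # both are about -999.59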
import numpy as np
from scipy.special import logsumexp


def logitMnPred(model, X):
    """Predict labels and class posteriors with a trained multinomial
    logistic regression model (weights in model.W, one column per class)."""
    W = model.W
    d, n = np.shape(X)
    # append a bias row of ones to match the trained weight dimension
    X = np.concatenate((X, np.ones((1, n))), 0)
    A = W.transpose().dot(X)
    # softmax computed in log space for numerical stability
    P = np.exp(A - logsumexp(A, 0))
    y = np.argmax(P, axis=0)
    return y, P
from scipy.sparse import csr_matrix


def newtonRaphson(X, t, Lambda):
    """Fit L2-regularized multinomial logistic regression by Newton's method.
    X is (d, n), t holds 1-based class labels, Lambda is the ridge weight."""
    d, n = np.shape(X)
    k = np.max(t)
    tol = 1e-4
    maxiter = 100
    llh = np.full(maxiter, -np.inf)
    dk = d * k
    # flat indices of the diagonal of the (dk, dk) Hessian
    dg = sub2ind((dk, dk), np.arange(dk), np.arange(dk))
    # one-hot target matrix T, shape (k, n)
    T = csr_matrix((np.ones(n), ((t - 1).reshape(n), np.arange(n))),
                   shape=(k, n)).toarray()
    W = np.zeros((d, k))
    HT = np.zeros((d, k, d, k))
    for it in range(1, maxiter):
        A = W.transpose().dot(X)
        logY = A - logsumexp(A, 0)
        llh[it] = np.multiply(T, logY).sum() \
            - 0.5 * Lambda * np.multiply(W, W).sum()
        if llh[it] - llh[it - 1] < tol:
            break
        Y = np.exp(logY)
        # blockwise Hessian of the penalized negative log-likelihood
        for i in range(k):
            for j in range(k):
                # r can be negative, so it cannot be folded in via sqrt
                r = Y[i, ] * ((i == j) - Y[j, ])
                HT[:, i, :, j] = (X * r).dot(X.transpose())
        G = X.dot((Y - T).transpose()) + Lambda * W
        H = np.reshape(HT, (dk, dk))
        Hi = H.flatten()
        Hi[dg] = Hi[dg] + Lambda   # add the ridge term to the diagonal
        H = Hi.reshape(H.shape)
        Wi = W.flatten() - mldivide(H, G.flatten())   # Newton step
        W = Wi.reshape(W.shape)
    llh = llh[1:it + 1]   # keep only the values actually computed
    return W, llh
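# A hedged end-to-end sketch: train with newtonRaphson, predict with
# logitMnPred, on toy data. sub2ind and mldivide are project helpers not
# shown above; the NumPy stand-ins below are assumptions about their behavior
# (MATLAB-style linear indexing and a linear solve, respectively).
from types import SimpleNamespace


def sub2ind(shape, rows, cols):       # hypothetical stand-in
    return np.ravel_multi_index((rows, cols), shape)


def mldivide(A, b):                   # hypothetical stand-in for A \ b
    return np.linalg.solve(A, b)


rng = np.random.RandomState(0)
X = np.hstack([rng.randn(2, 20) - 2, rng.randn(2, 20) + 2])  # (d=2, n=40)
t = np.array([1] * 20 + [2] * 20)                            # 1-based labels

Xb = np.vstack([X, np.ones((1, 40))])        # bias row, as logitMnPred expects
W, llh = newtonRaphson(Xb, t, Lambda=1e-2)
y, P = logitMnPred(SimpleNamespace(W=W), X)  # y is 0-based here
print((y == t - 1).mean())                   # training accuracy, ~1.0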
def get_LL(y_hat, y, tau):
    """Predictive log-likelihood over n_mc Monte Carlo dropout samples;
    eqn (8) of Gal & Ghahramani, https://arxiv.org/pdf/1506.02142.pdf"""
    n_mc = len(y_hat)
    return logsumexp(-0.5 * tau * (y_hat - y) ** 2) - np.log(n_mc) \
        - 0.5 * np.log(2 * np.pi) - 0.5 * np.log(tau ** -1)
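# Quick sanity check for get_LL: draw MC "predictions" around the true value
# with precision tau and evaluate the predictive log-likelihood. The numbers
# are illustrative only, not from the paper.
tau = 4.0
y_true = 0.7
rng = np.random.RandomState(1)
y_hat = y_true + rng.randn(1000) / np.sqrt(tau)

# For well-matched samples this approaches the log-density of a Gaussian
# with variance 2/tau at zero error (about -0.57 here).
print(get_LL(y_hat, y_true, tau))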
            else:
                marginals[name][gstr] = [combo_prob]
        genotype_combo_probs.append([combo, combo_prob])

    genotype_combo_probs = sorted(genotype_combo_probs,
                                  key=lambda c: c[1], reverse=True)

    # sum over all combos in log space; this normalizer lets us apply
    # Bayes' rule below
    posterior_normalizer = logsumexp(
        [prob for combo, prob in genotype_combo_probs])

    # normalize the per-sample marginals
    for sample, genotype_probs in marginals.items():
        for genotype, probs in genotype_probs.items():
            marginals[sample][genotype] = logsumexp(probs) - posterior_normalizer

    best_genotype_combo = genotype_combo_probs[0][0]
    best_genotype_combo_prob = genotype_combo_probs[0][1]
    best_genotype_probability = math.exp(
        best_genotype_combo_prob - posterior_normalizer)
    position['best_genotype_combo'] = [
        [name, genotype_str(genotype),
         math.exp(marginals[name][genotype_str(genotype)])]
        for name, (genotype, prob) in best_genotype_combo]
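# The normalization pattern above in isolation: unnormalized log posteriors
# are shifted by their logsumexp so the exponentiated values sum to one.
# Toy numbers only.
log_post = np.array([-12.3, -10.1, -15.8])    # unnormalized log posteriors
posteriors = np.exp(log_post - logsumexp(log_post))
assert np.isclose(posteriors.sum(), 1.0)
best_prob = posteriors.max()                  # probability of the best combo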
def gaussian_dpmixture_gibbsstep(X, assignments, prior, post):
    """One Gibbs sweep for a Dirichlet-process Gaussian mixture with a
    Gaussian-Wishart base measure. `assignments` is an (N, n_components)
    one-hot matrix; `post` holds per-component posterior hyperparameters,
    with post['Chols'] a list of upper-triangular Cholesky factors."""
    N, dim = np.shape(X)
    n_components = np.shape(assignments)[1]
    for n in range(N):
        # component that sample n is currently assigned to
        cur_z = int(np.nonzero(assignments[n, :])[0][0])
        assignments[n, cur_z] = 0
        if post['ns'][cur_z] == 1:
            # the component becomes empty: delete it
            post['Chols'].pop(cur_z)
            assignments = np.delete(assignments, cur_z, axis=1)
            post['ms'] = np.delete(post['ms'], cur_z, axis=0)
            post['ns'] = np.delete(post['ns'], cur_z)
            post['rs'] = np.delete(post['rs'], cur_z)
            post['nus'] = np.delete(post['nus'], cur_z)
            post['alphas'] = np.delete(post['alphas'], cur_z)
            n_components -= 1
        else:
            # remove sample n from its component by downdating the posterior
            # hyperparameters (cholupdate: rank-one Cholesky update/downdate)
            post['Chols'][cur_z] = cholupdate(
                post['Chols'][cur_z],
                np.sqrt(post['rs'][cur_z]) * post['ms'][cur_z, :], '+')
            post['ms'][cur_z, :] = post['ms'][cur_z, :] \
                * (prior['r'] + post['ns'][cur_z]) - X[n, :]
            post['ns'][cur_z] -= 1
            post['rs'][cur_z] -= 1
            post['nus'][cur_z] -= 1
            post['ms'][cur_z, :] /= prior['r'] + post['ns'][cur_z]
            post['Chols'][cur_z] = cholupdate(post['Chols'][cur_z], X[n, :], '-')
            post['Chols'][cur_z] = cholupdate(
                post['Chols'][cur_z],
                np.sqrt(post['rs'][cur_z]) * post['ms'][cur_z, :], '-')
            post['alphas'][cur_z] -= 1

        # Dirichlet-multinomial part: existing-component counts, then the
        # concentration parameter for a fresh component
        prob = np.append(np.log(assignments.sum(axis=0)),
                         np.log(prior['alpha']))

        # Gaussian-Wishart part: marginal likelihood of assigning n to each z
        for z in range(n_components):
            # posterior hyperparameters if sample n were assigned to z
            Chol = cholupdate(post['Chols'][z],
                              np.sqrt(post['rs'][z]) * post['ms'][z, :], '+')
            m = post['ms'][z, :] * (prior['r'] + post['ns'][z]) + X[n, :]
            num = post['ns'][z] + 1
            r = post['rs'][z] + 1
            nu = post['nus'][z] + 1
            m = m / (prior['r'] + num)
            Chol = cholupdate(Chol, X[n, :], '+')
            Chol = cholupdate(Chol, np.sqrt(r) * m, '-')
            prob[z] += loglikelihood(dim, post['ns'][z], post['rs'][z],
                                     post['nus'][z], post['Chols'][z],
                                     num, r, nu, Chol)

        # ... and of opening a new component seeded by sample n
        newm = (prior['r'] * prior['m'] + X[n, :]) / (prior['r'] + 1)
        newr = prior['r'] + 1
        newnu = prior['nu'] + 1
        newS = prior['S'] + np.outer(X[n, :], X[n, :]) \
            + prior['r'] * np.outer(prior['m'], prior['m']) \
            - newr * np.outer(newm, newm)
        newChol = sl.cholesky(newS)
        prob[-1] += loglikelihood(dim, 0, prior['r'], prior['nu'],
                                  prior['Chol'], 1, newr, newnu, newChol)

        # normalize so the assignment probabilities sum to one
        prob = np.exp(prob - logsumexp(prob))

        # sample the new assignment
        newassignment = np.random.multinomial(
            1, np.append(prob[:-1], 1.0 - prob[:-1].sum()))
        if np.isnan(newassignment).any():
            # FIXME: numerical fallback -- assign to the most probable slot
            newassignment = np.zeros(n_components + 1)
            newassignment[np.argmax(prob)] = 1
        cur_z = int(np.nonzero(newassignment)[0][0])

        if newassignment[-1] == 1:
            # a new component was opened: initialize its posteriors
            assignments = np.hstack([assignments, np.zeros((N, 1))])
            assignments[n, cur_z] = 1
            post['ns'] = np.append(post['ns'], 1)
            post['rs'] = np.append(post['rs'], prior['r'] + 1)
            post['nus'] = np.append(post['nus'], prior['nu'] + 1)
            post['ms'] = np.vstack(
                [post['ms'],
                 (prior['r'] * prior['m'] + X[n, :]) / (prior['r'] + 1)])
            S = prior['S'] + np.outer(X[n, :], X[n, :]) \
                + prior['r'] * np.outer(prior['m'], prior['m']) \
                - post['rs'][cur_z] * np.outer(post['ms'][cur_z, :],
                                               post['ms'][cur_z, :])
            post['Chols'].append(sl.cholesky(S))
            post['alphas'] = np.append(post['alphas'], prior['alpha'] + 1)
            n_components += 1
        else:
            # update the hyperparameters of the sampled existing component
            assignments[n, cur_z] = 1
            post['Chols'][cur_z] = cholupdate(
                post['Chols'][cur_z],
                np.sqrt(post['rs'][cur_z]) * post['ms'][cur_z, :], '+')
            post['ms'][cur_z, :] = post['ms'][cur_z, :] \
                * (prior['r'] + post['ns'][cur_z]) + X[n, :]
            post['ns'][cur_z] += 1
            post['rs'][cur_z] += 1
            post['nus'][cur_z] += 1
            post['ms'][cur_z, :] /= prior['r'] + post['ns'][cur_z]
            post['Chols'][cur_z] = cholupdate(post['Chols'][cur_z], X[n, :], '+')
            post['Chols'][cur_z] = cholupdate(
                post['Chols'][cur_z],
                np.sqrt(post['rs'][cur_z]) * post['ms'][cur_z, :], '-')
            post['alphas'][cur_z] += 1   # restore the count removed above

    # joint log-likelihood of the current state
    L = 0.0
    # Dirichlet-process part (lgamma avoids the overflow of log(gamma(.)))
    L += n_components * math.log(prior['alpha'])
    for z in range(n_components):
        L += math.lgamma(post['ns'][z])
    L -= math.lgamma(prior['alpha'] + N)
    L += math.lgamma(prior['alpha'])
    # Gaussian-Wishart part
    for z in range(n_components):
        L -= 0.5 * post['ns'][z] * dim * math.log(math.pi)
        L -= 0.5 * dim * math.log(post['rs'][z])
        L -= post['nus'][z] * np.sum(np.log(np.diag(post['Chols'][z])))
        for d in range(dim):
            L += math.lgamma(0.5 * (post['nus'][z] + 1 - d))
    return L, assignments, post
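# The Gibbs step above leans on cholupdate, a MATLAB built-in with no direct
# NumPy equivalent. A minimal pure-NumPy sketch of the rank-one Cholesky
# update/downdate it is assumed to perform (R upper triangular, A = R.T @ R):
import numpy as np
import scipy.linalg as sl


def cholupdate(R, x, sign='+'):
    """Return the Cholesky factor of A + x x^T ('+') or A - x x^T ('-')."""
    R, x = R.copy(), np.asarray(x, dtype=float).copy()
    for k in range(len(x)):
        rkk = R[k, k]
        r = np.hypot(rkk, x[k]) if sign == '+' else np.sqrt(rkk**2 - x[k]**2)
        c, s = r / rkk, x[k] / rkk
        R[k, k] = r
        if sign == '+':
            R[k, k+1:] = (R[k, k+1:] + s * x[k+1:]) / c
        else:
            R[k, k+1:] = (R[k, k+1:] - s * x[k+1:]) / c
        x[k+1:] = c * x[k+1:] - s * R[k, k+1:]
    return R


# sanity check on a small SPD matrix
A = np.array([[4.0, 1.0], [1.0, 3.0]])
x = np.array([0.5, 1.0])
R = sl.cholesky(A)                      # upper triangular
R_up = cholupdate(R, x, '+')
assert np.allclose(R_up.T @ R_up, A + np.outer(x, x))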