Esempio n. 1
0
def data_likelihood_exact(genotype, observed_alleles):
    """Return the 'exact' data likelihood of the observations under a genotype.

    Every ordered assignment of 'true' alleles (drawn, with repetition, from
    the genotype's alleles) to the observations is enumerated.  For each one
    the term ``multinomialln(...) + likelihood_given_true_alleles(...)`` is
    computed, and all terms are combined with ``logsumexp``.

    Args:
        genotype: sequence of ``(allele, count)`` pairs; the counts are summed
            to obtain the ploidy.
        observed_alleles: sequence of allele observation records.  Only its
            length is used directly here; the records themselves are handed
            to ``likelihood_given_true_alleles``.

    Returns:
        ``logsumexp`` of the per-assignment terms.
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for _, count in genotype)
    allele_probs = [count / float(ploidy) for _, count in genotype]
    # Loop-invariant: the alleles of the genotype, in genotype order.
    genotype_alleles = [entry[0] for entry in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(observation_count, genotype_alleles):
        for true_allele_permutation in multiset.permutations(true_allele_combination):
            # Wrapping each allele in a record lets the helpers treat the
            # hypothesised alleles like JSON allele observation records.
            true_alleles = [{'alt': allele} for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # Per-genotype-allele counts; `in` replaces dict.has_key, which
            # no longer exists on Python 3 dictionaries.
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele in genotype_alleles
            ]
            lnsampling_prob = multinomialln(allele_probs, observations)
            prob = lnsampling_prob + likelihood_given_true_alleles(observed_alleles, true_alleles)
            probs.append(prob)
    # sum the individual probability of all combinations
    return logsumexp(probs)
Esempio n. 2
0
def logitMnPred(model, X):
    """Predict classes with a multinomial-logit model.

    Args:
        model: object exposing the weight array as ``model.W``.
        X: 2-D array whose shape unpacks as ``(d, n)``; a row of ones is
            appended before it is multiplied by ``W`` transposed.

    Returns:
        ``(y, P)`` where ``P`` is ``exp(A - logsumexp(A, 0))`` for the score
        array ``A`` and ``y`` is the arg-max of ``P`` along axis 0.
    """
    _, sample_count = np.shape(X)
    ones_row = np.ones((1, sample_count))
    augmented = np.concatenate((X, ones_row), 0)
    scores = model.W.transpose().dot(augmented)
    # Subtracting the log-sum-exp over axis 0 normalises in log space.
    log_norm = logsumexp(scores, 0)
    P = np.exp(scores - log_norm)
    return np.argmax(P, axis=0), P
Esempio n. 3
0
def newtonRaphson(X, t, Lambda):
    """Fit penalised multiclass logistic-regression weights by Newton-Raphson.

    Args:
        X: array whose shape unpacks as ``(d, n)`` (one column per sample).
        t: array of ``n`` integer class labels starting at 1; ``t - 1`` is
            used as a row index and ``max(t)`` as the class count ``k``.
        Lambda: penalty weight; it scales ``0.5 * sum(W * W)`` in the
            objective and is added to the Hessian diagonal.

    Returns:
        ``(W, llh)`` where ``W`` is the ``(d, k)`` weight array and ``llh``
        holds the penalised log-likelihood of every iteration that was
        evaluated, including the final one that stopped the loop.
    """
    (d, n) = np.shape(X)
    k = np.max(t)
    tol = 1e-4
    maxiter = 100
    inf = 1000000
    # Slot 0 stays at the large negative sentinel so the first comparison
    # against llh[step - 1] is well defined.
    llh = np.ones(maxiter) * (-inf)
    dk = d * k
    idx = np.array(range(0, dk))
    if len(idx) == 0:
        idx = 0
    # Flat indices of the diagonal of a (dk, dk) array.
    dg = sub2ind((dk, dk), idx, idx)
    # k-by-n indicator array: entry (t[j] - 1, j) is 1 for every sample j.
    T = csr_matrix(
        (np.ones(n, ), ((t - 1).reshape(n, ), np.array(range(0, n)))),
        shape=(k, n)).toarray()
    W = np.zeros((d, k))
    HT = np.zeros((d, k, d, k))
    # `step` (not `iter`) so the builtin iter() is not shadowed.
    for step in range(1, maxiter):
        A = W.transpose().dot(X)
        logY = A - logsumexp(A, 0)
        llh[step] = np.multiply(
            T, logY).sum() - 0.5 * Lambda * np.multiply(W, W).sum()
        # Stop once the objective improves by less than tol (or decreases).
        if (llh[step] - llh[step - 1] < tol):
            break
        Y = np.exp(logY)
        for i in range(0, k):
            for j in range(0, k):
                r = Y[i, ] * ((i == j) - Y[j, ]
                              )  # r has negative value, so cannot use sqrt
                HT[:, i, :, j] = (X * r).dot(X.transpose())
        G = X.dot((Y - T).transpose()) + Lambda * W
        H = np.reshape(HT, (dk, dk))
        # Add the penalty to the Hessian diagonal through the flat indices.
        Hi = H.flatten()
        Hi[dg] = Hi[dg] + Lambda
        H = Hi.reshape(H.shape)
        Wi = W.flatten() - mldivide(H, G.flatten())
        W = Wi.reshape(W.shape)

    # Python slices exclude the end index, so `step + 1` is needed to keep
    # the last evaluated value (the previous `llh[1:iter]` dropped it).
    llh = llh[1:step + 1]
    return W, llh
Esempio n. 4
0
def data_likelihood_exact(genotype, observed_alleles):
    """Return the 'exact' data likelihood of the observations under a genotype.

    Every ordered assignment of 'true' alleles (drawn, with repetition, from
    the genotype's alleles) to the observations is enumerated.  For each one
    the term ``multinomialln(...) + likelihood_given_true_alleles(...)`` is
    computed, and all terms are combined with ``logsumexp``.

    Args:
        genotype: sequence of ``(allele, count)`` pairs; the counts are summed
            to obtain the ploidy.
        observed_alleles: sequence of allele observation records.  Only its
            length is used directly here; the records themselves are handed
            to ``likelihood_given_true_alleles``.

    Returns:
        ``logsumexp`` of the per-assignment terms.
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for _, count in genotype)
    allele_probs = [count / float(ploidy) for _, count in genotype]
    # Loop-invariant: the alleles of the genotype, in genotype order.
    genotype_alleles = [entry[0] for entry in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(
            observation_count, genotype_alleles):
        for true_allele_permutation in multiset.permutations(
                true_allele_combination):
            # Wrapping each allele in a record lets the helpers treat the
            # hypothesised alleles like JSON allele observation records.
            true_alleles = [{
                'alt': allele
            } for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # Per-genotype-allele counts; `in` replaces dict.has_key, which
            # no longer exists on Python 3 dictionaries.
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele in genotype_alleles
            ]
            lnsampling_prob = multinomialln(allele_probs, observations)
            prob = lnsampling_prob + likelihood_given_true_alleles(
                observed_alleles, true_alleles)
            probs.append(prob)
    # sum the individual probability of all combinations
    return logsumexp(probs)
Esempio n. 5
0
            genotype_combo_probs.append([combo, combo_prob])

        genotype_combo_probs = sorted(genotype_combo_probs,
                                      key=lambda c: c[1],
                                      reverse=True)
        #for line in [json.dumps({'prob':prior_probability_of_genotype, 'combo':combo}) for combo, prior_probability_of_genotype in genotype_combo_probs]:
        #    print line

        # sum, use to normalize
        # apply bayes rule

        #print genotype_combo_probs
        #print [prob for combo, prob in genotype_combo_probs]
        #for combo, prob in genotype_combo_probs:
        #    print prob
        posterior_normalizer = logsumexp(
            [prob for combo, prob in genotype_combo_probs])

        # handle marginals
        for sample, genotype_probs in marginals.iteritems():
            for genotype, probs in genotype_probs.iteritems():
                marginals[sample][genotype] = logsumexp(
                    probs) - posterior_normalizer

        best_genotype_combo = genotype_combo_probs[0][0]
        best_genotype_combo_prob = genotype_combo_probs[0][1]

        #best_genotype_probability = math.exp(sum([prob for name, (genotype, prob) in best_genotype_combo]) \
        #        + allele_frequency_probabilityln(count_frequencies([genotype for name, (genotype, prob) in best_genotype_combo])) \
        #        - posterior_normalizer)
        best_genotype_probability = math.exp(best_genotype_combo_prob -
                                             posterior_normalizer)
def get_LL(y_hat, y, tau):
    """Return the Monte-Carlo log-likelihood estimate for one target.

    This is eqn (8) from https://arxiv.org/pdf/1506.02142.pdf (Gal).

    Args:
        y_hat: array of sampled predictions; its length is the sample count.
        y: target value subtracted from every prediction.
        tau: precision scaling the squared errors.

    Returns:
        ``logsumexp(-0.5 * tau * (y_hat - y)**2) - log(len(y_hat))
        - 0.5 * log(2 * pi) - 0.5 * log(1 / tau)``.
    """
    sample_count = len(y_hat)
    squared_error = (y_hat - y)**2
    log_mc_sum = logsumexp(-.5 * tau * squared_error)
    gaussian_const = .5 * np.log(2 * np.pi)
    precision_term = .5 * np.log(tau**-1)
    return log_mc_sum - np.log(sample_count) - gaussian_const - precision_term
Esempio n. 7
0
                else:
                    marginals[name][gstr] = [combo_prob]
            genotype_combo_probs.append([combo, combo_prob])

        genotype_combo_probs = sorted(genotype_combo_probs, key=lambda c: c[1], reverse=True)
        #for line in [json.dumps({'prob':prior_probability_of_genotype, 'combo':combo}) for combo, prior_probability_of_genotype in genotype_combo_probs]:
        #    print line

        # sum, use to normalize
        # apply bayes rule

        #print genotype_combo_probs
        #print [prob for combo, prob in genotype_combo_probs]
        #for combo, prob in genotype_combo_probs:
        #    print prob
        posterior_normalizer = logsumexp([prob for combo, prob in genotype_combo_probs])

        # handle marginals
        for sample, genotype_probs in marginals.iteritems():
            for genotype, probs in genotype_probs.iteritems():
                marginals[sample][genotype] = logsumexp(probs) - posterior_normalizer

        best_genotype_combo = genotype_combo_probs[0][0]
        best_genotype_combo_prob = genotype_combo_probs[0][1]

        #best_genotype_probability = math.exp(sum([prob for name, (genotype, prob) in best_genotype_combo]) \
        #        + allele_frequency_probabilityln(count_frequencies([genotype for name, (genotype, prob) in best_genotype_combo])) \
        #        - posterior_normalizer)
        best_genotype_probability = math.exp(best_genotype_combo_prob - posterior_normalizer)
        position['best_genotype_combo'] = [[name, genotype_str(genotype), math.exp(marginals[name][genotype_str(genotype)])] 
                                                  for name, (genotype, prob) in best_genotype_combo]
def gaussian_dpmixture_gibbsstep(X, assignments, prior, post):
    # Visits every row of X once: removes the row from its current component,
    # scores each existing component plus a new one, samples a new
    # assignment and updates `post`; afterwards tallies a likelihood value L.
    # Returns (L, assignments, post).
    #
    # NOTE(review): this is a partial MATLAB-to-Python port.  Several
    # statements below are still MATLAB syntax (`post.Chols(:,:,z)`, the
    # trailing `'` transpose, `prob(0:end-1)`, `[a,b]=max(...)`), so the
    # function does not parse as Python until they are translated.  The
    # signatures of `cholupdate` and `loglikelihood` are not visible here.
    #
    # Arguments, as used by the code below:
    #   X           -- 2-D data; np.shape(X) is unpacked as (N, dim).
    #   assignments -- 2-D indicator array indexed [sample, component]; its
    #                  column count is n_components.  Presumably one-hot per
    #                  row -- TODO confirm.
    #   prior       -- mapping read with keys 'r', 'm', 'nu', 'S', 'Chol'
    #                  and 'alpha'.
    #   post        -- per-component statistics under 'Chols', 'ms', 'ns',
    #                  'rs', 'nus', 'alphas'.  Accessed both as post['ns']
    #                  and as post.ns -- TODO settle on one form.
    ##post: posteriors
    
    #N sample
    (N,dim) = np.shape(X)
    #assignments = np.zeros((N,n_components))
    n_components = np.shape(assignments)[1]

    for n in range(0,N):
        #return indices of nonzero element
        # NOTE(review): if `assignments` is a 2-D ndarray, assignments[n,:]
        # is 1-D and np.nonzero returns a 1-tuple, so this two-name unpacking
        # would fail; it only works for np.matrix rows -- confirm the type.
        (row,cur_z) = np.nonzero(assignments[n,:])

        assignments[n,cur_z] = 0
        # Nc = 0
        # NOTE(review): `post.ns(cur_z)` is MATLAB-style indexing; the rest
        # of this branch uses post['ns'][cur_z].
        if post.ns(cur_z) == 1 :
            ## delete not assigned component
#high dimension matrix delete????
            post['Chols'].pop(cur_z)
            assignments = np.delete(assignments, (cur_z), axis=1)
            post['ms'] = np.delete(post['ms'], (cur_z), axis=0)
            post['ns'] = np.delete(post['ns'], cur_z)
            post['rs'] = np.delete(post['rs'], cur_z)
            post['nus'] = np.delete(post['nus'], cur_z)
            post['alphas'] = np.delete(post['alphas'], cur_z)
            n_components = n_components-1
         # NOTE(review): this `else:` is indented one column deeper than its
         # `if`, which is an IndentationError.
         else:
            ## omit the current sample  ??????
            #cholupdate function ???
            # NOTE(review): untranslated MATLAB (colon indexing, `'`
            # transpose, bare `sqrt`).
            post['Chols'][cur_z] = cholupdate(post.Chols(:,:,cur_z),sqrt(post.rs(cur_z))*post.ms(cur_z,:)','+')
            # Un-normalise the mean, subtract sample n, decrement the
            # counters, then re-normalise with the reduced count.
            post['ms'][cur_z,:] = post['ms'][cur_z,:]*(prior['r']+post['ns'][cur_z])-X[n,:]
            post['ns'][cur_z] = post['ns'][cur_z]-1
            post['rs'][cur_z] = post['rs'][cur_z]-1
            post['nus'][cur_z] = post['nus'][cur_z]-1
            post['ms'][cur_z,:] = post['ms'][cur_z,:]/(prior['r']+post['ns'][cur_z])
            # NOTE(review): the next three statements are untranslated MATLAB.
            post.Chols(:,:,cur_z) = cholupdate(post.Chols(:,:,cur_z),X(n,:)','-');
            post.Chols(:,:,cur_z) = cholupdate(post.Chols(:,:,cur_z),sqrt(post.rs(cur_z))*post.ms(cur_z,:)','-')
            post.alphas(cur_z) = post.alphas(cur_z)-1;
      
    
        ## Dirichlet-multinomial part
        # sum(assignments) = Nc
        # NOTE(review): this builds a two-element Python list, whereas the
        # MATLAB line kept below concatenates per-component values with the
        # alpha term; prob[z] is indexed per component further down.
        prob = [np.log(sum(assignments)),np.log(prior['alpha'])]
        ##prob = [log(post.ns(:,1)'),log(prior.alpha)];
    
        ## Gaussian-Wishart part
        for z in range(0, n_components):
           ## calculate posteriors when sample n is assigned to z
            # NOTE(review): untranslated MATLAB.
            Chol = cholupdate(post.Chols(:,:,z),sqrt(post.rs(z))*post.ms(z,:)','+');
            # Candidate statistics for component z with sample n added.
            m = post['ms'][z,:]*(prior['r']+post['ns'][z])+X[n,:]
            num = post['ns'][z]+1
            r = post['rs'][z]+1
            nu = post['nus'][z]+1
            m = m/(prior['r']+num)
            # NOTE(review): the next two statements are untranslated MATLAB.
            Chol = cholupdate(Chol,X(n,:)','+')
            Chol = cholupdate(Chol,sqrt(r)*m','-')
            ##prob(z) = prob(z) + loglikelihood(num,dim,r,nu,Chol);
            # Passes the current statistics followed by the candidate ones.
            prob[z] = prob[z] + loglikelihood(dim,post['ns'][z],post['rs'][z],post['nus'][z],post['Chols'][z],num,r,nu,Chol)
   
        # Statistics of a brand-new component that would hold only sample n.
        newm = (prior['r']*prior['m']+X[n,:])/(prior['r']+1)
        newr = prior['r']+1
        newnu = prior['nu']+1
        newS = prior['S']+np.matrix(X[n,:]).T*X[n,:]+prior['r']*(np.matrix(prior['m']).T*prior['m'])-newr*(np.matrix(newm).T*newm)
        newChol = sl.cholesky(newS)
        ##prob(end) = prob(end) + loglikelihood(1,dim,newr,newnu,newChol);
        ##prob(end) = prob(end) + loglikelihood(dim, 1,dim,newr,newnu,newChol);
        prob[-1] = prob[-1] + loglikelihood(dim,0,prior['r'],prior['nu'],prior['Chol'],1,newr,newnu,newChol)
    
        ## normalize so as to sum to one
        # NOTE(review): the second argument 2 is carried over from MATLAB's
        # 1-based dimension; confirm what this logsumexp expects.
        psum = logsumexp(prob,2)
        prob = np.exp(prob-psum)
        end = len(prob)
        ## sampling assignment
        ##newassignment = mnrnd(1,[prob(0:end-1),1-sum(prob(0:end-1))])
        # NOTE(review): `prob(0:end-1)` is MATLAB slicing, not valid Python.
        newassignment = np.random.multinomial(1, [prob(0:end-1),1-sum(prob(0:end-1))], size=1)
        # NOTE(review): `isnan` is used unqualified, and the fallback below
        # is still MATLAB (`[ignore,maxind]=max(prob)` and call-style
        # assignment `newassignment(maxind) = 1`).
        if isnan(newassignment) : ## FIXME
            newassignment = np.zeros((1,n_components+1))
            [ignore,maxind]=max(prob)
            newassignment(maxind) = 1
    
        # NOTE(review): np.nonzero returns a tuple of index arrays, so cur_z
        # is a tuple here rather than a single index.
        cur_z = np.nonzero(newassignment)

        # NOTE(review): with size=1 the multinomial draw has shape
        # (1, n_outcomes), so newassignment[-1] is a whole row and this
        # comparison yields an array, not a scalar.
        if newassignment[-1] == 1:
            ##initialize posteriors
            # NOTE(review): this creates a two-element Python list instead of
            # appending a zero column to `assignments`.
            assignments = [assignments,np.zeros((N,1))]
            assignments[n,cur_z] = 1
            # NOTE(review): attribute-style access (post.ns) here, key-style
            # (post['rs']) a few lines below.
            post.ns[cur_z] = 1
            post.rs[cur_z] = prior['r']+1
            post.nus[cur_z] = prior['nu']+1
            post.ms[cur_z,:] = (prior['r']*prior['m']+X[n,:])/(prior['r']+1)
            S = prior['S']+np.matrix(X[n,:]).T*X[n,:]+prior['r']*(np.matrix(prior['m']).T*prior['m'])-post['rs'][cur_z]*(np.matrix(post['ms'][cur_z,:]).T*post['ms'][cur_z,:])
            # NOTE(review): `sl.cholskey` looks like a typo for the
            # `sl.cholesky` used above when building newChol.
            post['Chols'][cur_z] = sl.cholskey(S)
            post['alphas'][cur_z] = prior['alpha']+1
            n_components = n_components+1
        else:
            ## update the hyperparameters according to sampled assignment
            assignments[n,cur_z] = 1
            # NOTE(review): untranslated MATLAB arguments.
            post['Chols'][cur_z] = cholupdate(post.Chols(:,:,cur_z),sqrt(post.rs(cur_z))*post.ms(cur_z,:)','+')
            # Mirror of the removal step: un-normalise the mean, add sample
            # n, increment the counters, re-normalise.
            post['ms'][cur_z,:] = post['ms'][cur_z,:]*(prior['r']+post['ns'][cur_z])+X[n,:]
            post['ns'][cur_z] = post['ns'][cur_z]+1
            post['rs'][cur_z] = post['rs'][cur_z]+1
            post['nus'][cur_z] = post['nus'][cur_z]+1
            post['ms'][cur_z,:] = post['ms'][cur_z,:]/(prior['r']+post['ns'][cur_z])
            # NOTE(review): the next two statements have untranslated MATLAB
            # arguments.
            post['Chols'][cur_z] = cholupdate(post.Chols(:,:,cur_z),X(n,:)','+');
            post['Chols'][cur_z] = cholupdate(post.Chols(:,:,cur_z),sqrt(post.rs(cur_z))*post.ms(cur_z,:)','-');
            # NOTE(review): self-assignment is a no-op; the removal branch
            # subtracts 1 from alphas, so presumably `+1` is missing here --
            # TODO confirm.
            post['alphas'][cur_z] = post['alphas'][cur_z]
    
        ## check updates
    ##{
    ##for z = 1:n_components    
        ##fprintf('---------------\n');
        ##Xz = X(find(assignments(:,z)==1),:);
        ##post.ms(z,:)
        ##(prior.r*prior.m+sum(Xz,1))/(prior.r+post.ns(z))
        ##S = prior.S+Xz'*Xz+prior.r*(prior.m'*prior.m)-post.rs(z)*(post.ms(z,:)'*post.ms(z,:));
        ##post.Chols(:,:,z)
        ##cholcov(S)
    ##end
    


    # Accumulate the value L that is returned alongside the updated state.
    L = 0
    ## likelihood for Dirichlet process
    L = L+n_components*math.log(prior['alpha'])
    for z in range(0, n_components):
        # NOTE(review): math.gamma raises OverflowError for large arguments;
        # math.lgamma computes the log directly.
        L = L+math.log(math.gamma(post['ns'][z]))

    L = L-math.log(math.gamma(prior['alpha']+N))
    L = L+math.log(math.gamma(prior['alpha']))

    ## likelihood for Gaussian-Wishart
    for z in range(0, n_components):
        L = L - 0.5*post['ns'][z]*dim*math.log(math.pi)
        L = L - 0.5*dim*math.log(post['rs'][z])
        # NOTE(review): `log` and `diag` are unqualified here, unlike the
        # math./np. calls elsewhere -- verify they are in scope.
        L = L - post['nus'][z]*sum(log(diag(post['Chols'][z])))
        for d in range(0, dim):
           L = L + math.log(math.gamma(0.5*(post['nus'][z]+1-d)))



    return (L, assignments, post)