def update_Estep(self, cascade, scores, alpha, beta, tau, pi, gamma, mu=None, B=None, omega=None, omegao=None):
    """Recompute ``self.estim`` from the current parameter estimates.

    Column 1 of ``self.estim`` receives a log-odds score assembled from
    the prior term (``beta.estim`` and ``scores``), the per-scale
    footprint terms (``gamma``, ``pi`` and the likelihoods returned by
    ``likelihoodAB``) and the read-count terms (``alpha``, ``tau`` and
    ``self.total``).  Column 0 is set to zero.  Every row is then shifted
    by its maximum, exponentiated and divided by ``insum(..., [1])``
    (presumably the row sum -- confirm against ``utils.insum``).

    ``B``, ``mu``, ``omega`` and ``omegao`` are forwarded to
    ``likelihoodAB`` depending on ``gamma.model``.

    Raises:
        ValueError: when the result contains NaN or Inf; a short message
            is printed before raising.
    """
    footprint_logodds = np.zeros((self.N, 1), dtype=float)

    model_name = gamma.model
    if model_name == 'modelA':
        lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=model_name)
    elif model_name == 'modelB':
        lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=model_name)
    elif model_name == 'modelC':
        # NOTE(review): the third likelihood is unpacked but never used
        # in this method -- confirm that is intended for modelC.
        lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=model_name)

    for j in xrange(pi.J):
        g = gamma.value[j]
        p = pi.estim[j]
        likelihood_term = (1 - g) * (lhoodB.value[j] - lhoodA.value[j])
        on_term = g * (nplog(p) - nplog(g))
        off_term = (1 - g) * (nplog(1 - p) - nplog(1 - g))
        footprint_logodds += insum(likelihood_term + on_term + off_term, [1])

    a0, a1 = alpha.estim[0], alpha.estim[1]
    t0, t1 = tau.estim[0], tau.estim[1]

    # Terms are accumulated one at a time, in the original left-to-right
    # order, so the floating-point result is unchanged.
    logodds = beta.estim[0] + beta.estim[1] * scores
    logodds = logodds + footprint_logodds
    logodds = logodds + gammaln(self.total + a1)
    logodds = logodds - gammaln(self.total + a0)
    logodds = logodds + gammaln(a0)
    logodds = logodds - gammaln(a1)
    logodds = logodds + a1 * nplog(t1)
    logodds = logodds - a0 * nplog(t0)
    logodds = logodds + self.total * (nplog(1 - t1) - nplog(1 - t0))

    self.estim[:, 1:] = logodds
    self.estim[:, 0] = 0.
    # Only +inf is clamped here; -inf is left as is.
    self.estim[self.estim == np.inf] = MAX

    row_max = np.max(self.estim, 1).reshape(self.N, 1)
    self.estim = np.exp(self.estim - row_max)
    self.estim = self.estim / insum(self.estim, [1])

    for is_bad, message in ((np.isnan, "Nan in Eta"), (np.isinf, "Inf in Eta")):
        if is_bad(self.estim).any():
            print(message)
            raise ValueError
def __init__(self, cascade, totalreads, scores, gamma=None, beta=None,
             pi=None, mu=None, B=None, omega=None, omegao=None, alpha=None, tau=None):
    """Build the ``(N, 2)`` array ``self.estim``.

    Two modes, selected by ``alpha``:

    * ``alpha is None``: rows are ranked by ``totalreads``.  Column 1 is
      set to ``-MAX`` for the lower half and ``+MAX`` for the upper
      half, then each row is shifted by its maximum, exponentiated and
      divided by ``insum(..., [1])``.
    * otherwise: column 1 is filled with the log-odds computed from the
      supplied ``gamma``, ``beta``, ``pi``, ``alpha`` and ``tau`` (plus
      the model-specific ``mu`` / ``B`` / ``omega`` / ``omegao``
      forwarded to ``likelihoodAB``) and divided by ``np.log(10)``.  No
      normalisation is applied in this mode.

    ``self.N`` is taken from ``cascade.N`` and ``self.total`` is
    ``totalreads`` reshaped to ``(N, 1)``.
    """
    self.N = cascade.N
    self.total = totalreads.reshape(self.N, 1)
    self.estim = np.zeros((self.N, 2), dtype=float)

    if alpha is None:
        # Sort once and split at the midpoint; floor division keeps the
        # split index an integer under any division semantics.
        order = np.argsort(self.total.ravel())
        half = self.N // 2
        self.estim[order[:half], 1:] = -MAX
        self.estim[order[half:], 1:] = MAX

        self.estim[self.estim == np.inf] = MAX
        self.estim = np.exp(self.estim - np.max(self.estim, 1).reshape(self.N, 1))
        self.estim = self.estim / insum(self.estim, [1])
    else:
        footprint_logodds = np.zeros((self.N, 1), dtype=float)
        if gamma.model == 'modelA':
            lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=gamma.model)
        elif gamma.model == 'modelB':
            lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=gamma.model)
        elif gamma.model == 'modelC':
            lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=gamma.model)

        for j in xrange(pi.J):
            g = gamma.value[j]
            # Fixed: this test used the bare name ``model``, which is not
            # defined in this scope; the model name lives on ``gamma``.
            if gamma.model == 'modelC':
                footprint_logodds += insum(
                    g * lhoodA.value[j] - lhoodC.value[j] + (1 - g) * lhoodB.value[j], [1])
            else:
                footprint_logodds += insum(
                    (1 - g) * (lhoodB.value[j] - lhoodA.value[j]), [1])
            footprint_logodds += insum(
                g * (nplog(pi.estim[j]) - nplog(g))
                + (1 - g) * (nplog(1 - pi.estim[j]) - nplog(1 - g)), [1])

        self.estim[:, 1:] = beta.estim[0] + beta.estim[1] * scores + footprint_logodds \
            + gammaln(self.total + alpha.estim[1]) - gammaln(self.total + alpha.estim[0]) \
            + gammaln(alpha.estim[0]) - gammaln(alpha.estim[1]) \
            + alpha.estim[1] * nplog(tau.estim[1]) - alpha.estim[0] * nplog(tau.estim[0]) \
            + self.total * (nplog(1 - tau.estim[1]) - nplog(1 - tau.estim[0]))

        # Rescale column 1 by 1/ln(10); column 0 stays at zero.
        self.estim[:, 1:] = self.estim[:, 1:] / np.log(10)
def update_Estep(self, cascade, scores, alpha, beta, tau, pi, gamma, mu=None, B=None, omega=None, omegao=None):
    """Recompute ``self.estim`` from the current parameter estimates.

    Column 1 of ``self.estim`` receives a log-odds score assembled from
    the prior term (``beta.estim`` and ``scores``), the per-scale
    footprint terms (``gamma``, ``pi`` and the likelihoods returned by
    ``likelihoodAB``) and the read-count terms (``alpha``, ``tau`` and
    ``self.total``).  Column 0 is set to zero.  Every row is then shifted
    by its maximum, exponentiated and divided by ``insum(..., [1])``
    (presumably the row sum -- confirm against ``utils.insum``).

    ``B``, ``mu``, ``omega`` and ``omegao`` are forwarded to
    ``likelihoodAB`` depending on ``gamma.model``.

    Raises:
        ValueError: when the result contains NaN or Inf; a short message
            is printed before raising.
    """
    footprint_logodds = np.zeros((self.N, 1), dtype=float)

    model_name = gamma.model
    if model_name == 'modelA':
        lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=model_name)
    elif model_name == 'modelB':
        lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=model_name)
    elif model_name == 'modelC':
        # NOTE(review): the third likelihood is unpacked but never used
        # in this method -- confirm that is intended for modelC.
        lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=model_name)

    for j in xrange(pi.J):
        g = gamma.value[j]
        p = pi.estim[j]
        likelihood_term = (1 - g) * (lhoodB.value[j] - lhoodA.value[j])
        on_term = g * (nplog(p) - nplog(g))
        off_term = (1 - g) * (nplog(1 - p) - nplog(1 - g))
        footprint_logodds += insum(likelihood_term + on_term + off_term, [1])

    a0, a1 = alpha.estim[0], alpha.estim[1]
    t0, t1 = tau.estim[0], tau.estim[1]

    # Terms are accumulated one at a time, in the original left-to-right
    # order, so the floating-point result is unchanged.
    logodds = beta.estim[0] + beta.estim[1] * scores
    logodds = logodds + footprint_logodds
    logodds = logodds + gammaln(self.total + a1)
    logodds = logodds - gammaln(self.total + a0)
    logodds = logodds + gammaln(a0)
    logodds = logodds - gammaln(a1)
    logodds = logodds + a1 * nplog(t1)
    logodds = logodds - a0 * nplog(t0)
    logodds = logodds + self.total * (nplog(1 - t1) - nplog(1 - t0))

    self.estim[:, 1:] = logodds
    self.estim[:, 0] = 0.
    # Only +inf is clamped here; -inf is left as is.
    self.estim[self.estim == np.inf] = MAX

    row_max = np.max(self.estim, 1).reshape(self.N, 1)
    self.estim = np.exp(self.estim - row_max)
    self.estim = self.estim / insum(self.estim, [1])

    for is_bad, message in ((np.isnan, "Nan in Eta"), (np.isinf, "Inf in Eta")):
        if is_bad(self.estim).any():
            print(message)
            raise ValueError
def likelihood(cascade, scores, eta, gamma, pi, alpha, beta, tau, mu=None, B=None, omega=None, omegao=None):
    """Return the scalar objective summed over all rows.

    The value combines, per row, a "footprint" term ``P_1`` and a "null"
    term ``P_0`` weighted by the two columns of ``eta.estim``, the prior
    term built from ``beta.estim`` and ``scores``, and the term
    ``-insum(eta.estim * nplog(eta.estim), [1])``.  ``B``, ``mu``,
    ``omega`` and ``omegao`` are forwarded to ``likelihoodAB`` depending
    on ``gamma.model``.

    Raises:
        ValueError: when the summed value is NaN or infinite; a short
            message is printed before raising.
    """
    def with_count_terms(base, k):
        # Add the read-count terms for component ``k`` to ``base`` in the
        # original left-to-right order, then clamp +/-inf to +/-MAX.
        out = base + gammaln(eta.total + alpha.estim[k])
        out = out - gammaln(alpha.estim[k])
        out = out + alpha.estim[k] * nplog(tau.estim[k])
        out = out + eta.total * nplog(1 - tau.estim[k])
        out[out == np.inf] = MAX
        out[out == -np.inf] = -MAX
        return out

    apriori = beta.estim[0] + beta.estim[1] * scores

    model_name = gamma.model
    if model_name == 'modelA':
        lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=model_name)
    elif model_name == 'modelB':
        lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=model_name)
    elif model_name == 'modelC':
        lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=model_name)

    footprint = np.zeros((cascade.N, 1), dtype=float)
    for j in xrange(pi.J):
        g = gamma.value[j]
        p = pi.estim[j]
        per_scale = g * lhoodA.value[j] + (1 - g) * lhoodB.value[j]
        per_scale = per_scale + g * (nplog(p) - nplog(g))
        per_scale = per_scale + (1 - g) * (nplog(1 - p) - nplog(1 - g))
        footprint += insum(per_scale, [1])
    P_1 = with_count_terms(footprint, 1)

    null = np.zeros((cascade.N, 1), dtype=float)
    for j in xrange(cascade.J):
        # modelC has its own null likelihood; the other models reuse lhoodA.
        if model_name == 'modelC':
            null = null + insum(lhoodC.value[j], [1])
        else:
            null = null + insum(lhoodA.value[j], [1])
    P_0 = with_count_terms(null, 0)

    weight_null = eta.estim[:, :1]
    per_row = P_0 * weight_null
    per_row = per_row + insum(P_1 * eta.estim[:, 1:], [1])
    per_row = per_row + apriori * (1 - weight_null)
    per_row = per_row - nplog(1 + np.exp(apriori))
    per_row = per_row - insum(eta.estim * nplog(eta.estim), [1])
    L = per_row.sum()

    if np.isnan(L):
        print("Nan in LogLike")
        raise ValueError
    if np.isinf(L):
        print("Inf in LogLike")
        raise ValueError
    return L
def F(x):
    """Return the negated sum of ``arg*w - nplog(1 + exp(arg))``.

    ``arg`` is ``x[0] + x[1]*scores`` and ``w`` is
    ``insum(eta.estim[:, 1:], 1)``; ``scores`` and ``eta`` come from the
    enclosing scope.  ``np.inf`` is returned in place of a NaN or
    infinite result.
    """
    linear = x[0] + x[1] * scores
    weights = insum(eta.estim[:, 1:], 1)
    per_row = linear * weights - nplog(1 + np.exp(linear))
    value = -1. * per_row.sum()
    # A non-finite value (NaN or +/-inf) is reported as +inf.
    return value if np.isfinite(value) else np.inf
def F(x):
    """Return the negated sum of ``arg*w - nplog(1 + exp(arg))``.

    ``arg`` is ``x[0] + x[1]*scores`` and ``w`` is
    ``insum(eta.estim[:, 1:], 1)``; ``scores`` and ``eta`` come from the
    enclosing scope.  ``np.inf`` is returned in place of a NaN or
    infinite result.
    """
    linear = x[0] + x[1] * scores
    weights = insum(eta.estim[:, 1:], 1)
    per_row = linear * weights - nplog(1 + np.exp(linear))
    value = -1. * per_row.sum()
    # A non-finite value (NaN or +/-inf) is reported as +inf.
    return value if np.isfinite(value) else np.inf
def Fprime(x):
    """Return the two-element array ``-[sum(d), sum(d*scores)]``.

    ``d`` is ``insum(eta.estim[:, 1:], 1) - logistic(-arg)`` with
    ``arg = x[0] + x[1]*scores``; ``scores`` and ``eta`` come from the
    enclosing scope.  Presumably the gradient companion of ``F`` --
    confirm at the call site.

    NOTE(review): when any component is NaN or infinite the scalar
    ``np.inf`` is returned rather than an array.
    """
    linear = x[0] + x[1] * scores
    residual = insum(eta.estim[:, 1:], 1) - logistic(-linear)
    scaled_residual = residual * scores
    grad = -1. * np.array([residual.sum(), scaled_residual.sum()])
    if not np.isfinite(grad).all():
        return np.inf
    return grad
def Fprime(x):
    """Return the two-element array ``-[sum(d), sum(d*scores)]``.

    ``d`` is ``insum(eta.estim[:, 1:], 1) - logistic(-arg)`` with
    ``arg = x[0] + x[1]*scores``; ``scores`` and ``eta`` come from the
    enclosing scope.  Presumably the gradient companion of ``F`` --
    confirm at the call site.

    NOTE(review): when any component is NaN or infinite the scalar
    ``np.inf`` is returned rather than an array.
    """
    linear = x[0] + x[1] * scores
    residual = insum(eta.estim[:, 1:], 1) - logistic(-linear)
    scaled_residual = residual * scores
    grad = -1. * np.array([residual.sum(), scaled_residual.sum()])
    if not np.isfinite(grad).all():
        return np.inf
    return grad
def __init__(self, cascade, totalreads, scores, gamma=None, beta=None,
             pi=None, mu=None, B=None, omega=None, omegao=None, alpha=None, tau=None):
    """Build the ``(N, 2)`` array ``self.estim``.

    Two modes, selected by ``alpha``:

    * ``alpha is None``: rows are ranked by ``totalreads``.  Column 1 is
      set to ``-MAX`` for the lower half and ``+MAX`` for the upper
      half, then each row is shifted by its maximum, exponentiated and
      divided by ``insum(..., [1])``.
    * otherwise: column 1 is filled with the log-odds computed from the
      supplied ``gamma``, ``beta``, ``pi``, ``alpha`` and ``tau`` (plus
      the model-specific ``mu`` / ``B`` / ``omega`` / ``omegao``
      forwarded to ``likelihoodAB``) and divided by ``np.log(10)``.  No
      normalisation is applied in this mode.

    ``self.N`` is taken from ``cascade.N`` and ``self.total`` is
    ``totalreads`` reshaped to ``(N, 1)``.
    """
    self.N = cascade.N
    self.total = totalreads.reshape(self.N, 1)
    self.estim = np.zeros((self.N, 2), dtype=float)

    if alpha is None:
        # Sort once and split at the midpoint; floor division keeps the
        # split index an integer under any division semantics.
        order = np.argsort(self.total.ravel())
        half = self.N // 2
        self.estim[order[:half], 1:] = -MAX
        self.estim[order[half:], 1:] = MAX

        self.estim[self.estim == np.inf] = MAX
        self.estim = np.exp(self.estim - np.max(self.estim, 1).reshape(self.N, 1))
        self.estim = self.estim / insum(self.estim, [1])
    else:
        footprint_logodds = np.zeros((self.N, 1), dtype=float)
        if gamma.model == 'modelA':
            lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=gamma.model)
        elif gamma.model == 'modelB':
            lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=gamma.model)
        elif gamma.model == 'modelC':
            lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=gamma.model)

        for j in xrange(pi.J):
            g = gamma.value[j]
            # Fixed: this test used the bare name ``model``, which is not
            # defined in this scope; the model name lives on ``gamma``.
            if gamma.model == 'modelC':
                footprint_logodds += insum(
                    g * lhoodA.value[j] - lhoodC.value[j] + (1 - g) * lhoodB.value[j], [1])
            else:
                footprint_logodds += insum(
                    (1 - g) * (lhoodB.value[j] - lhoodA.value[j]), [1])
            footprint_logodds += insum(
                g * (nplog(pi.estim[j]) - nplog(g))
                + (1 - g) * (nplog(1 - pi.estim[j]) - nplog(1 - g)), [1])

        self.estim[:, 1:] = beta.estim[0] + beta.estim[1] * scores + footprint_logodds \
            + gammaln(self.total + alpha.estim[1]) - gammaln(self.total + alpha.estim[0]) \
            + gammaln(alpha.estim[0]) - gammaln(alpha.estim[1]) \
            + alpha.estim[1] * nplog(tau.estim[1]) - alpha.estim[0] * nplog(tau.estim[0]) \
            + self.total * (nplog(1 - tau.estim[1]) - nplog(1 - tau.estim[0]))

        # Rescale column 1 by 1/ln(10); column 0 stays at zero.
        self.estim[:, 1:] = self.estim[:, 1:] / np.log(10)
def getnull(self, locations, sample='', width=200):
    """Return the row-normalised null profile for ``locations``.

    For each location a window is sliced out of
    ``self.genome[loc[0]]`` around an anchor position, ``loc[1]`` when
    ``loc[3] == '+'`` and ``loc[2]`` otherwise.  The window spans
    ``width`` positions plus ``self.k // 2`` extra on the left and
    ``self.k // 2 - 1`` on the right.  The sequences, a 0/1 strand
    indicator, the cut rate, ``width`` and ``self.k`` are handed to
    ``sequence_null.getnull``.

    Args:
        locations: iterable of records indexed as ``loc[0]`` (key into
            ``self.genome``), ``loc[1]`` / ``loc[2]`` (positions,
            converted with ``int``) and ``loc[3]`` (strand, ``'+'`` or
            other).
        sample: key into ``self.cutrate``; the empty string means use
            ``self.cutrate`` itself.
        width: window width passed on to ``sequence_null.getnull``.

    Returns:
        The array from ``sequence_null.getnull`` with exact zeros
        replaced by ``1e-8`` and then divided by
        ``utils.insum(null, [1])``.
    """
    # Floor division keeps the slice bounds integral regardless of the
    # interpreter's division semantics; for ints it matches the former
    # classic ``/``.
    half_width = width // 2
    left = self.k // 2
    right = self.k // 2 - 1

    cutrate = self.cutrate if sample == '' else self.cutrate[sample]

    strand = np.array([1 if loc[3] == '+' else 0 for loc in locations])

    windows = []
    for loc in locations:
        # '-' strand anchors on loc[2] with no +1 offset (an earlier +1
        # was removed, per the original comment).
        anchor = int(loc[1]) if loc[3] == '+' else int(loc[2])
        start = anchor - half_width - left
        stop = anchor + half_width + right
        windows.append(utils.makestr(self.genome[loc[0]][start:stop]))
    sequences = np.array(windows)

    null = sequence_null.getnull(sequences, strand, cutrate, width, self.k)
    # Replace exact zeros before normalising.
    null[null == 0] = 1e-8
    null = null / utils.insum(null, [1])
    return null
def getnull(self, locations, sample='', width=200):
    """Return the row-normalised null profile for ``locations``.

    For each location a window is sliced out of
    ``self.genome[loc[0]]`` around an anchor position, ``loc[1]`` when
    ``loc[3] == '+'`` and ``loc[2]`` otherwise.  The window spans
    ``width`` positions plus ``self.k // 2`` extra on the left and
    ``self.k // 2 - 1`` on the right.  The sequences, a 0/1 strand
    indicator, the cut rate, ``width`` and ``self.k`` are handed to
    ``sequence_null.getnull``.

    Args:
        locations: iterable of records indexed as ``loc[0]`` (key into
            ``self.genome``), ``loc[1]`` / ``loc[2]`` (positions,
            converted with ``int``) and ``loc[3]`` (strand, ``'+'`` or
            other).
        sample: key into ``self.cutrate``; the empty string means use
            ``self.cutrate`` itself.
        width: window width passed on to ``sequence_null.getnull``.

    Returns:
        The array from ``sequence_null.getnull`` with exact zeros
        replaced by ``1e-8`` and then divided by
        ``utils.insum(null, [1])``.
    """
    # Floor division keeps the slice bounds integral regardless of the
    # interpreter's division semantics; for ints it matches the former
    # classic ``/``.
    half_width = width // 2
    left = self.k // 2
    right = self.k // 2 - 1

    cutrate = self.cutrate if sample == '' else self.cutrate[sample]

    strand = np.array([1 if loc[3] == '+' else 0 for loc in locations])

    windows = []
    for loc in locations:
        # '-' strand anchors on loc[2] with no +1 offset (an earlier +1
        # was removed, per the original comment).
        anchor = int(loc[1]) if loc[3] == '+' else int(loc[2])
        start = anchor - half_width - left
        stop = anchor + half_width + right
        windows.append(utils.makestr(self.genome[loc[0]][start:stop]))
    sequences = np.array(windows)

    null = sequence_null.getnull(sequences, strand, cutrate, width, self.k)
    # Replace exact zeros before normalising.
    null[null == 0] = 1e-8
    null = null / utils.insum(null, [1])
    return null
def logposteriorodds_multinomial(reads, footprint, null):
    """Return the flattened difference of two read-weighted log sums.

    The first sum is ``insum(reads * nplog(footprint.ravel()), [1])``,
    the second ``insum(reads * nplog(null), [1])``; the result is their
    difference, flattened with ``ravel``.
    """
    footprint_score = insum(reads * nplog(footprint.ravel()), [1])
    null_score = insum(reads * nplog(null), [1])
    difference = footprint_score - null_score
    return difference.ravel()
import numpy as np
import scipy.optimize as opt
from scipy.special import digamma, gammaln
from utils import insum, outsum, nplog, EPS, MAX
import cPickle
import time
import math
import pdb


def logistic(x):
    """Return ``1 / (1 + insum(exp(x), [1]))``."""
    return 1. / (1 + insum(np.exp(x), [1]))


def newlogistic(x):
    """Return ``1 / (1 + exp(x))`` element-wise."""
    return 1. / (1 + np.exp(x))


class Cascade:
    """Container for a length-``L`` profile, where ``L`` should be a power of 2.

    ``J`` is the binary exponent of ``L`` minus one (``log2(L)`` when
    ``L`` is a power of two).  ``data`` records whether reads have been
    attached and ``value`` starts as an empty dict.
    """

    def __init__(self, L):
        self.L = L
        # frexp returns a mantissa of exactly 0.5 only for powers of two.
        mantissa = math.frexp(self.L)[0]
        if mantissa != 0.5:
            print("profile size is not a power of 2")
            pdb.set_trace()
        self.J = math.frexp(self.L)[1] - 1
        self.data = False
        self.value = {}

    def setreads(self, reads):
        """Attach a 2-D ``reads`` array and run ``self.transform`` on it.

        ``reads.shape`` is unpacked as ``(N, L)``; ``N`` is stored on the
        instance and ``L`` is checked against ``self.L``.
        """
        self.data = True
        num_rows, num_cols = reads.shape
        self.N = num_rows
        if num_cols != self.L:
            print("data dimensions do not match")
            pdb.set_trace()
        # NOTE(review): ``transform`` is not defined in the visible part
        # of this class -- presumably it appears further down the file.
        self.transform(reads)
def likelihood(cascade, scores, eta, gamma, pi, alpha, beta, tau, mu=None, B=None, omega=None, omegao=None):
    """Return the scalar objective summed over all rows.

    The value combines, per row, a "footprint" term ``P_1`` and a "null"
    term ``P_0`` weighted by the two columns of ``eta.estim``, the prior
    term built from ``beta.estim`` and ``scores``, and the term
    ``-insum(eta.estim * nplog(eta.estim), [1])``.  ``B``, ``mu``,
    ``omega`` and ``omegao`` are forwarded to ``likelihoodAB`` depending
    on ``gamma.model``.

    Raises:
        ValueError: when the summed value is NaN or infinite; a short
            message is printed before raising.
    """
    def with_count_terms(base, k):
        # Add the read-count terms for component ``k`` to ``base`` in the
        # original left-to-right order, then clamp +/-inf to +/-MAX.
        out = base + gammaln(eta.total + alpha.estim[k])
        out = out - gammaln(alpha.estim[k])
        out = out + alpha.estim[k] * nplog(tau.estim[k])
        out = out + eta.total * nplog(1 - tau.estim[k])
        out[out == np.inf] = MAX
        out[out == -np.inf] = -MAX
        return out

    apriori = beta.estim[0] + beta.estim[1] * scores

    model_name = gamma.model
    if model_name == 'modelA':
        lhoodA, lhoodB = likelihoodAB(cascade, B=B, model=model_name)
    elif model_name == 'modelB':
        lhoodA, lhoodB = likelihoodAB(cascade, mu=mu, model=model_name)
    elif model_name == 'modelC':
        lhoodA, lhoodB, lhoodC = likelihoodAB(cascade, B=B, omega=omega, omegao=omegao, model=model_name)

    footprint = np.zeros((cascade.N, 1), dtype=float)
    for j in xrange(pi.J):
        g = gamma.value[j]
        p = pi.estim[j]
        per_scale = g * lhoodA.value[j] + (1 - g) * lhoodB.value[j]
        per_scale = per_scale + g * (nplog(p) - nplog(g))
        per_scale = per_scale + (1 - g) * (nplog(1 - p) - nplog(1 - g))
        footprint += insum(per_scale, [1])
    P_1 = with_count_terms(footprint, 1)

    null = np.zeros((cascade.N, 1), dtype=float)
    for j in xrange(cascade.J):
        # modelC has its own null likelihood; the other models reuse lhoodA.
        if model_name == 'modelC':
            null = null + insum(lhoodC.value[j], [1])
        else:
            null = null + insum(lhoodA.value[j], [1])
    P_0 = with_count_terms(null, 0)

    weight_null = eta.estim[:, :1]
    per_row = P_0 * weight_null
    per_row = per_row + insum(P_1 * eta.estim[:, 1:], [1])
    per_row = per_row + apriori * (1 - weight_null)
    per_row = per_row - nplog(1 + np.exp(apriori))
    per_row = per_row - insum(eta.estim * nplog(eta.estim), [1])
    L = per_row.sum()

    if np.isnan(L):
        print("Nan in LogLike")
        raise ValueError
    if np.isinf(L):
        print("Inf in LogLike")
        raise ValueError
    return L
import numpy as np
import scipy.optimize as opt
from scipy.special import digamma, gammaln
from utils import insum, outsum, nplog, EPS, MAX
import cPickle
import time
import math
import pdb


def logistic(x):
    """Return ``1 / (1 + insum(exp(x), [1]))``."""
    return 1. / (1 + insum(np.exp(x), [1]))


def newlogistic(x):
    """Return ``1 / (1 + exp(x))`` element-wise."""
    return 1. / (1 + np.exp(x))


class Cascade:
    """Container for a length-``L`` profile, where ``L`` should be a power of 2.

    ``J`` is the binary exponent of ``L`` minus one (``log2(L)`` when
    ``L`` is a power of two).  ``data`` records whether reads have been
    attached and ``value`` starts as an empty dict.
    """

    def __init__(self, L):
        self.L = L
        # frexp returns a mantissa of exactly 0.5 only for powers of two.
        mantissa = math.frexp(self.L)[0]
        if mantissa != 0.5:
            print("profile size is not a power of 2")
            pdb.set_trace()
        self.J = math.frexp(self.L)[1] - 1
        self.data = False
        self.value = {}

    def setreads(self, reads):
        """Attach a 2-D ``reads`` array and run ``self.transform`` on it.

        ``reads.shape`` is unpacked as ``(N, L)``; ``N`` is stored on the
        instance and ``L`` is checked against ``self.L``.
        """
        self.data = True
        num_rows, num_cols = reads.shape
        self.N = num_rows
        if num_cols != self.L:
            print("data dimensions do not match")
            pdb.set_trace()
        # NOTE(review): ``transform`` is not defined in the visible part
        # of this class -- presumably it appears further down the file.
        self.transform(reads)
def logposteriorodds_multinomial(reads, footprint, null):
    """Return the flattened difference of two read-weighted log sums.

    The first sum is ``insum(reads * nplog(footprint.ravel()), [1])``,
    the second ``insum(reads * nplog(null), [1])``; the result is their
    difference, flattened with ``ravel``.
    """
    footprint_score = insum(reads * nplog(footprint.ravel()), [1])
    null_score = insum(reads * nplog(null), [1])
    difference = footprint_score - null_score
    return difference.ravel()