def loglik(self,arr,working):
    """Return the summed array log-likelihood of ``arr`` under this object's parameters.

    Parameters
    ----------
    arr : array indexed as ``arr[:,0]`` / ``arr[:,1]`` (two intensity columns).
    working : dict of pre-allocated scratch arrays.  This method writes
        ``working['arrfail_lik']`` and ``working['arrlik']`` and hands
        ``working['workarray']`` to ``chiamante_statfunc.dmvt`` as scratch.

    Returns
    -------
    The sum over samples of the accumulated per-sample log terms.

    Notes
    -----
    * The failure density is written to ``working['arrfail_lik']``, the same
      initialisation ``monomorphic_fit`` performs.  The previous code wrote the
      density and the low-intensity mask into ``working['arrfail']`` and then
      scaled ``working['arrfail_lik']`` in place, so the mask never reached the
      likelihood and the buffer shrank by ``eta_arr`` on every call.
    * ``working['arrfail_lik']`` is left holding the unscaled density; the
      ``eta_arr`` weighting is applied to a temporary.
    * NOTE(review): every component with a positive weight contributes its own
      ``log`` term (a sum of logs, not the log of a mixture).  That matches a
      one-hot ``p`` such as the one ``monomorphic_fit`` sets up -- confirm
      before using it with several positive weights.
    """
    fail_density = working['arrfail_lik']
    fail_density[:] = self.afaildens
    # samples with both intensities below 5 are treated as certain failures
    fail_density[np.logical_and(arr[:,0]<5,arr[:,1]<5)] = 1.
    weighted_fail = self.eta_arr*fail_density

    total = np.zeros(len(arr))
    for j in range(3):
        # dmvt fills working['arrlik'][:,j] in place
        chiamante_statfunc.dmvt(arr,self.mu[j],self.sigma[j],self.df[j],working['arrlik'][:,j],working['workarray'])
        if self.p[0][j]>0:
            total += np.log(self.p[0][j]) + np.log(weighted_fail+(1-self.eta_arr)*working['arrlik'][:,j])
    return total.sum()
def chiamante_qc(parameters,loglik,logpp,i,
                 arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df,
                 hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods):
    """Quality-check a finished fit and re-seed / re-fit when it looks implausible.

    The checks run in order; the first one that triggers returns the result of
    a fresh ``chiamante_mainloop`` call (always with ``seqfaildens*10`` and
    ``C=False``), otherwise the original fit is returned unchanged.

    1. Cluster separation (``retry<4``): if the het centroid is too close to the
       dominant homozygote centroid, re-seed ``start['mu']`` either from the
       sequence likelihoods (when both intensity columns correlate with the
       sequence dosage) or from the median of bright samples, then re-fit.
    2. Missing homozygote class (``retry<3``): if a homozygote class is empty
       although more than one such sample is expected, move that centroid and
       re-fit.
    3. Mis-ordered centroids (``retry<4``): mirror the opposite homozygote
       centroid and run one more iteration.
    4. Monomorphic checks: if the calls look inconsistent with three clusters,
       fit a single cluster with ``monomorphic_fit``; with ``retry<5`` its
       centroids seed one more iteration, otherwise the monomorphic fit itself
       is returned (``niteration=-1``).

    Parameters mirror those of ``chiamante_mainloop`` plus the state of the
    finished fit: ``parameters`` (fitted values), ``loglik`` / ``logpp``
    (per-iteration traces, sliced to ``[:i]`` on return) and ``i`` (reported as
    ``niteration``).  ``calculate_logpp``, ``C`` and ``g_corrected`` are
    accepted for signature compatibility; only ``g_corrected`` is forwarded.

    Side effects: may modify ``start['mu']``, ``parameters['mu']``,
    ``parameters['sigma']`` and the arrays in ``working`` in place.

    Returns a dict with keys ``parameters``, ``loglik``, ``logpp``, ``gprobs``,
    ``gl``, ``array_fail``, ``seq_fail``, ``u`` and ``niteration``.
    """
    # identity test: ``seq != None`` is elementwise on an ndarray in current NumPy
    doseq = seq is not None

    def _refit(init, n_iter, retry_level):
        # every re-fit shares these settings; only seed, iteration budget and retry level vary
        return chiamante_mainloop(arr,seq,prior,init,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=n_iter,
                                  hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=retry_level,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods)

    def _result(fitted, n_iter):
        return dict(parameters=fitted, loglik=loglik[:i],logpp=logpp[:i]
                    ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail']
                    ,u=working['u'],niteration=n_iter)

    # expected number of non-failed samples per genotype class
    ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)]
    arr2 = np.power(2,arr)
    theta = 2. * np.arctan2(arr2[:,0],arr2[:,1]) / np.pi

    mu = parameters['mu']
    if not (mu[0][0]>mu[2][0] and mu[2][1]>mu[0][1]):
        # homozygote centroids are not in the expected order: force the separation check to fail
        zstat1 = -2
        zstat2 = -2
    elif ngeno[0]>ngeno[2]:
        zstat1 = (parameters['mu'][1][1]-parameters['mu'][0][1])/np.sqrt(parameters['sigma'][0][1,1])
        zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][0],parameters['sigma'][0])
    else:
        zstat1 = (parameters['mu'][1][0]-parameters['mu'][2][0])/np.sqrt(parameters['sigma'][2][0,0])
        zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][2],parameters['sigma'][2])

    if ngeno[0]>1 or ngeno[2]>1:
        threshold1=1
        threshold2=3
    else:
        threshold1=.5
        threshold2=2

    if genotype_likelihoods:
        gl = working['gl']
    else:
        gl = None

    # ---- 1. het cluster not separated from the dominant homozygote cluster
    if retry<4 and (zstat1<threshold1 or zstat2<threshold2):
        if doseq and retry<3:
            ii = np.logical_and(arr.max(1)>6,np.logical_not(np.isnan(seq[:,0])))
            dosage = seq[ii,1:].sum(1)
            pval1 = pearsonr(arr[ii,0],dosage)[1]
            pval2 = pearsonr(arr[ii,1],dosage)[1]
            if pval1<.0001 and pval2<.0001:
                # both channels track the sequence dosage: seed centroids from sequence-weighted means
                for j in range(3):
                    wt = seq[ii,j]
                    start['mu'][j] = (arr[ii].T*wt).sum(1)/wt.sum()
                return _refit(start, niteration, 3)

        # Seed one homozygote centroid from the median of bright samples,
        # falling back to all samples when none is bright.
        bright = arr[arr.max(1)>6]
        if len(bright)>0:
            newmu = np.median(bright,0)
        else:
            newmu = np.median(arr,0)
        muidx = newmu.argmax()*2
        if retry<3:
            nrit=niteration
            retry=3
        else:
            nrit=1
            retry=4
        start['mu'][muidx] = newmu
        expected_mean(muidx,start['mu'],prior,parameters['model'])
        tmpsigma = start['sigma'][muidx]
        if (arr.max(1)>7).sum()>3:
            start['sigma'][muidx] = np.cov(arr[arr.max(1)>7].T)
        try:
            return _refit(start, nrit, retry)
        finally:
            # the caller's starting covariance is only borrowed for this re-fit
            start['sigma'][muidx] = tmpsigma

    # ---- 2. an empty homozygote class although more than one sample is expected
    if ngeno[0]<1 and retry<3:
        ng = sum(ngeno)
        af = ngeno[1]/(ng*2)
        eg = (af**2)*ng
        if eg>1:
            parameters['mu'][0][1] = np.min(arr[arr.max(1)>6,1])
            return _refit(parameters, niteration, 3)

    if ngeno[2]<1 and retry<3:
        ng = sum(ngeno)
        af = ngeno[1]/(ng*2)
        eg = (af**2)*ng
        if eg>1:
            parameters['mu'][2][0] = np.min(arr[arr.max(1)>6,0])
            return _refit(parameters, niteration, 3)

    # ---- 3. homozygote centroid on the wrong side of the het centroid
    if mu[0][0]<mu[1][0] and retry <4:
        parameters['mu'][0] = parameters['mu'][2][[1,0]]
        return _refit(parameters, 1, 4)

    if mu[2][1]<mu[1][1] and retry<4:
        parameters['mu'][2] = parameters['mu'][0][[1,0]]
        return _refit(parameters, 1, 4)

    # ---- 4. monomorphic checks
    mono = False
    calls = working['new_g'].argmax(1)
    calls[working['new_g'].max(1)<0.9]=3   # 3 marks "no confident call"
    calls[working['arrfail']>0.1]=3
    theta_het = 2.*np.arctan2(2.**parameters['mu'][1][0],2.**parameters['mu'][1][1])/np.pi

    if (calls==0).sum()>0:
        # a class-0 call lying beyond the het centroid in theta
        if theta[calls==0].min() < theta_het:
            mono = True

    if (calls==2).sum()>0:
        if theta[calls==2].max() > theta_het:
            mono = True

    # very low het centroid (only judged when there is no sequence data)
    if not doseq and (parameters['mu'][1].min() < 6 or (parameters['mu'][1].min() < 8 and ngeno[1]<4)):
        mono = True

    if ngeno[0]<1 or ngeno[2]<1:
        if ngeno[0]>ngeno[2]:
            het_shift = (parameters['mu'][1][0]-parameters['mu'][0][0])
        else:
            het_shift = (parameters['mu'][1][1]-parameters['mu'][2][1])
        if het_shift < -3:
            # implausibly low het centroid relative to the populated homozygote
            mono = True

    if mono:
        # site looks monomorphic (or very low MAF)
        monofit = monomorphic_fit(prior,start,arr,seq,working,arrfaildens,df=df,niteration=niteration,tol=.1)
        if retry < 5:
            for j in range(3):
                parameters['mu'][j] = monofit['parameters']['mu'][j]
                parameters['sigma'][j] = monofit['parameters']['sigma'][j]
            return _refit(parameters, 1, 5)
        else:
            parameters = monofit['parameters']
            # ``model`` was previously read here without ever being defined in
            # this function; use the same source as chiamante_mainloop.
            model = prior['model']
            for j in range(3):
                if model==4:
                    if df is None:
                        working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100)
                    else:
                        working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j])
                elif df is None:
                    working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j])
                else:
                    chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray'])

            if not doseq:
                chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl)
            else:
                chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],
                                doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl)

            return _result(monofit['parameters'], -1)

    # everything looks fine: return the original fit
    return _result(parameters, i)
def monomorphic_fit(prior,parameters,arr,seqlik,working,arrfaildens,df=None,niteration=30,tol=0.01):
    """Fit a single (monomorphic) cluster plus a failure class to ``arr`` by EM.

    The cluster is assigned to genotype class ``j``: 0 when the 99% quantile of
    ``arr[:,0]`` exceeds that of ``arr[:,1]``, otherwise 2.  Only that class is
    estimated; the other two classes receive hard-coded covariance matrices and
    means filled in by ``expected_mean``.

    Parameters
    ----------
    prior : dict read for ``isigma_mu``, ``mu0``, ``s0``, ``v0``,
        ``arrfail_alpha`` and ``arrfail_beta`` (and ``seqfail_alpha`` /
        ``seqfail_beta`` when sequence data is given).
    parameters : dict; only ``parameters['eta_seq']`` is read, and only when
        ``seqlik`` is given.
    arr : two-column intensity array.
    seqlik : sequence likelihood array or None.
    working : dict of scratch arrays (``arrfail_lik``, ``arrfail``, ``arrlik``,
        ``new_g``, ``old_g``, ``u``, ``workarray``, plus ``seqfail_lik`` and
        ``seqfail`` with sequence data); all are overwritten in place.
    arrfaildens : density assigned to the array-failure class.
    df : None, a scalar, or a length-3 sequence of degrees of freedom.
    niteration : maximum number of EM iterations.
    tol : stop once no posterior in column ``j`` moved by ``tol`` or more.

    Returns
    -------
    dict with ``parameters`` (``p``, ``df``, ``eta_array``, ``mu``, ``sigma``,
    ``model=3`` and, with sequence data, ``eta_seq``), ``gprobs``,
    ``niteration=-1`` and ``arrfail``.

    Raises
    ------
    ValueError
        If ``df`` is a sequence whose length is not 3.  The earlier check
        compared ``type(df)`` against the strings ``'int'`` / ``'list'`` and so
        never ran.
    """
    if df is not None:
        if np.isscalar(df):
            df = [df for idx in range(3)]
        elif len(df)!=3:
            raise ValueError("df is not a list of length 3 or a scalar")

    # identity test: ``seqlik != None`` is elementwise on an ndarray in current NumPy
    doseq = seqlik is not None
    nsample = len(arr)

    working['arrfail_lik'][:] = arrfaildens
    # samples with both intensities below 5 are treated as certain failures
    working['arrfail_lik'][np.logical_and(arr[:,0]<5,arr[:,1]<5)] = 1.
    working['u'][:] = 1.

    if mquantiles(arr[:,0],.99) > mquantiles(arr[:,1],.99):
        j = 0
    else:
        j = 2

    parm = Parameters(afaildens=arrfaildens,df=df,K=3)
    parm.mu[j] = np.median(arr[arr.max(1)>6],0)
    v1 = chiamante_statfunc.mad(arr[:,0])[1]
    v2 = chiamante_statfunc.mad(arr[:,1])[1]
    if v1<.1:
        v1=1
    if v2<.1:
        v2=1
    parm.sigma[j][:] = np.diag((v1,v2))
    parm.eta_arr=0.05
    parm.p[0][:] = 0.0
    parm.p[0][j] = 1.0

    workarray = working['workarray']
    arrfail = working['arrfail']
    g = working['new_g']
    u = working['u']
    # Views into the shared scratch array: column 0 holds the E-step
    # denominator and is then reused for the M-step weights; columns 1:3 hold
    # centred intensities.
    wtu = workarray[:,0]
    residuals = workarray[:,1:3]

    for i in range(niteration):
        isig = la.inv(parm.sigma[j])  # sigma is not modified before the M-step uses it

        # E-STEP
        chiamante_statfunc.dmvt(arr,parm.mu[j],parm.sigma[j],parm.df[j],working['arrlik'][:,j],working['workarray'])
        arrfail[:] = parm.eta_arr*working['arrfail_lik']
        g[:,j] = working['arrlik'][:,j]*(1-parm.eta_arr)
        workarray[:,0] = g[:,j]+arrfail  # denominator
        g[:,j]/=workarray[:,0]
        arrfail/=workarray[:,0]
        if parm.df is not None:
            residuals[:] = arr[:]
            residuals[:] -= parm.mu[j]
            u[:,j] = (df[j]+2)/(df[j]+(np.dot(residuals,isig)*residuals).sum(1))

        # M-STEP
        residuals[:] = arr[:]
        success = 1 - arrfail
        wtu[:] = (success*g[:,j])  # weights without u
        ntmp = wtu.sum()           # denominator for the covariance update
        wtu *= u[:,j]
        nu = wtu.sum()
        den = la.inv(prior['isigma_mu'][j]+isig*nu)
        parm.mu[j] = np.dot(den,(np.dot(prior['isigma_mu'][j],prior['mu0'][j]) + np.dot(isig,np.dot(wtu,arr))))
        residuals[:] -= parm.mu[j]
        parm.sigma[j] = (np.diag(np.diag(prior['s0'][j])) +
                         np.diag((wtu*np.power(residuals.T,2)).sum(1))) / (prior['v0'][j] + 4 + ntmp)
        parm.eta_arr = (prior['arrfail_alpha']+arrfail.sum()-1)/(prior['arrfail_beta']+prior['arrfail_alpha']+nsample-2)

        # Convergence: compare with the previous iteration's posteriors.
        # old_g always holds the previous posteriors (never a difference or a
        # stale buffer), so the test is well defined from the second iteration.
        if i>0:
            change = working['old_g'][:,j] - g[:,j]
            if change.max() < tol and change.min() > -tol:
                break
        working['old_g'][:,j]=g[:,j]

    if j==0:
        g[:,1:] = 0.0
    if j==2:
        g[:,:2] = 0.0

    expected_mean(j,parm.mu,prior,3)

    # hard-coded covariance matrices for the two classes that were not fitted
    default_sigma = [np.array([[0.0255294956024038,0.00467599907475694],[0.00467599907475694,0.327148026714643]]),
                     np.array([[0.0322203889783675,0.0167746786021707],[0.0167746786021707,0.0251180753954836]]),
                     np.array([[0.370072130849117,0.00512061278760211],[0.00512061278760211,0.0212080779437183]])]
    for genotype in range(3):
        if genotype!=j:
            parm.sigma[genotype] = default_sigma[genotype]

    # NOTE(review): the large weight (.95**2) lands on the class opposite to
    # the fitted class j -- confirm this orientation is intended.
    if j==2:
        p = np.array([.95**2,2*.05*.95,.05**2])
    else:
        p = np.array([.05**2,2*.05*.95,.95**2])

    fitted = dict(p=p,df=df,eta_array=parm.eta_arr,mu=parm.mu,sigma=parm.sigma,model=3)
    if doseq:
        seqfail_lik = working['seqfail_lik']
        seqfail = working['seqfail']
        seqfail[:] = 1.0
        lik2 = working['workarray'][:,1]
        seq_not_missing = np.logical_not(np.isnan(seqlik[:,0]))
        nseq = seq_not_missing.sum()
        seqfail[seq_not_missing] = seqfail_lik[seq_not_missing]*parameters['eta_seq']
        lik2[seq_not_missing] = (seqlik[seq_not_missing,j]*(1-parameters['eta_seq'])+seqfail[seq_not_missing])
        seqfail[seq_not_missing]/=(lik2[seq_not_missing])
        fitted['eta_seq'] = (prior['seqfail_alpha']+seqfail[seq_not_missing].sum()-1)/(prior['seqfail_beta']+prior['seqfail_alpha']+nseq-2)

    return dict(parameters=fitted,gprobs=g,niteration=-1,arrfail=arrfail)
def chiamante_mainloop(arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration=50,tolerance=1e-3,df=None,
                       hwe_prior=True,calculate_logpp=False,C=True,flip=False,retry=0,g_corrected=None,genotype_likelihoods=False):
    """Run the EM loop of the genotype-calling mixture model.

    Parameters
    ----------
    arr : array with exactly two columns of intensities.
    seq : sequence likelihood array, or None when only array data is used.
    prior : dict of prior settings; ``prior['model']`` selects the likelihood
        (``4`` uses ``chiamante_statfunc.dt`` on ``arr[:,1]``, anything else
        the bivariate normal / t density) and ``prior['raf_alpha']`` is length
        checked against ``popidx`` when ``hwe_prior`` is set.
    start : dict of starting values (``mu``, ``sigma``, ``alpha``,
        ``eta_array``, ``eta_seq``, ``p`` and, with ``hwe_prior``, ``raf``).
    seqfaildens, arrfaildens : failure-class densities.  ``seqfaildens`` is not
        read in this function.
    popidx : sequence with one entry per population.
    working : dict of pre-allocated scratch arrays, modified in place.
    niteration : maximum number of EM iterations (at least 1).
    tolerance : stop once the largest change in genotype posteriors between two
        iterations is below this value (tested from the third iteration on).
    df : None, a scalar, or three degrees of freedom; all zeros means "use the
        normal model" (``df`` becomes None).
    hwe_prior : whether genotype frequencies are tied to per-population
        reference allele frequencies.
    C : must be False.  The default (True) selects a C implementation that does
        not exist and terminates with ``SystemExit``.
    flip : when true, ``flip_raf_prior(prior)`` is applied on entry and again
        before returning.
    calculate_logpp, retry, g_corrected : only used by the disabled QC branch.
    genotype_likelihoods : when true ``working['gl']`` is passed to the E-step
        and returned.

    Returns
    -------
    dict with keys ``parameters``, ``loglik``, ``logpp``, ``gprobs``, ``gl``,
    ``array_fail``, ``seq_fail``, ``u`` and ``niteration`` (iterations run).

    Raises
    ------
    ValueError
        ``arr`` does not have two columns, ``df`` has the wrong length, or
        ``niteration`` is smaller than 1.
    SystemExit
        Inconsistent ``start['p']`` / ``start['raf']``, or ``C`` is true.
    """
    if not arr.shape[1] == 2:
        raise ValueError("Array does not have 2 columns.")
    if niteration < 1:
        raise ValueError("niteration must be at least 1")

    if df is not None:
        if np.isscalar(df):
            # The old scalar branch tested ``type(df)=='int'`` (never true) and
            # came after ``len(df)``, so a scalar df could not be used.
            df = [float(df) for idx in range(3)]
        elif len(df) != 3:
            raise ValueError("invalid degrees of freedom")
        else:
            df = [float(val) for val in df]
        if sum(df)==0:
            df = None

    if genotype_likelihoods:
        gl = working['gl']
    else:
        gl = None

    model = prior['model']
    nsample = len(arr)
    npop = len(popidx)

    working['arrfail_lik'][:] = arrfaildens
    arr2 = np.power(2,arr)
    r = arr2.sum(1)
    working['arrfail_lik'][r<36.] = 1.   # very low total intensity: certain failure
    working['u'][:] = 1.

    # identity test: ``seq == None`` is elementwise on an ndarray in current NumPy
    if seq is None:
        doseq=False
    else:
        doseq=True
        working['seq_missing'] = np.where(np.isnan(seq[:,0]))[0]
        working['seq_not_missing'] = np.where(np.logical_not(np.isnan(seq[:,0])))[0]
        nseq = len(working['seq_not_missing'])

    if hwe_prior and not len(prior['raf_alpha'])==npop:
        print("Length of raf_alpha not consistent with number of populations")

    if flip:
        flip_raf_prior(prior)

    logpp = np.zeros(niteration)
    loglik = np.zeros(niteration)

    parameters = dict(mu = deepcopy(start['mu']),
                      sigma = deepcopy(start['sigma']),
                      alpha = [start['alpha'] for i in range(npop)],
                      eta_array = deepcopy(start['eta_array']),
                      eta_seq = deepcopy(start['eta_seq']),df=df,model=model)

    if hwe_prior:
        if isinstance(start['p'],np.ndarray) and len(start['p'])==3:
            parameters['raf']= [start['raf'] for i in range(npop)]
            parameters['p'] = [np.array([1./3. for i in range(3)]) for idx in range(npop)]
        elif isinstance(start['p'],list) and len(start['p'])==npop:
            if len(start['raf']) != len(start['p']):
                raise SystemExit("len(start[raf]) != len(start[p])")
            parameters['p'] = deepcopy(start['p'])
            parameters['raf'] = deepcopy(start['raf'])
        else:
            raise SystemExit("Length of genotype frequencies does not match npop")
    else:
        if isinstance(start['p'],np.ndarray):
            parameters['p']=deepcopy(start['p'])
        else:
            raise SystemExit('start[p] dont look right')

    if C:
        # there was initially a C version of the EM step but it turned out to be no faster
        raise SystemExit("not implemented")

    for i in range(niteration):
        for j in range(3):
            if model==4:
                if df is None:
                    working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100)
                else:
                    working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j])
            elif df is None:
                working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j])
            else:
                chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray'])

        if not doseq:
            chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl)
            parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray'])
        else:
            chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],
                            doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl)
            parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray'],
                                         doseq,working['seq_not_missing'],nseq,working['seqfail'])

        if niteration==1:
            break
        elif i>1 and (abs(working['old_g']-working['new_g'])).max() < tolerance:
            break
        elif i+1 < niteration:
            # Swap buffers so the next E-step overwrites the older posteriors.
            # Skipped after the final iteration: swapping there would leave
            # working['new_g'] (returned as gprobs) one iteration out of date.
            tmp_g = working['new_g']
            working['new_g'] = working['old_g']
            working['old_g'] = tmp_g
    i+=1  # turn the last loop index into a count of iterations run

    ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)]

    # QC of the final fit (skipped for sites that converged to monomorphic) is
    # switched off by this constant guard, so chiamante_qc is never called here.
    if False:
        if niteration>1 and round(max(ngeno))<nsample:
            return chiamante_qc(parameters,loglik,logpp,i
                                ,arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df,hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods=genotype_likelihoods)

    if not hwe_prior:
        parameters['raf'] = parameters['p'][0] + .5*parameters['p'][1]

    if flip:
        # undo the flip applied on entry
        flip_raf_prior(prior)

    return dict(parameters=parameters, loglik=loglik[:i],logpp=logpp[:i]
                ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail']
                ,u=working['u'],niteration=i)