def chiamante_qc(parameters,loglik,logpp,i, arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df, hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods): if seq!=None: doseq=True else: doseq=False ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)] arr2 = np.power(2,arr) r = arr2.sum(1) theta = 2. * np.arctan2(arr2[:,0],arr2[:,1]) / np.pi mu = parameters['mu'] if not (mu[0][0]>mu[2][0] and mu[2][1]>mu[0][1]): zstat1 = -2 zstat2 = -2 elif ngeno[0]>ngeno[2]: zstat1 = (parameters['mu'][1][1]-parameters['mu'][0][1])/np.sqrt(parameters['sigma'][0][1,1]) zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][0],parameters['sigma'][0]) else: zstat1 = (parameters['mu'][1][0]-parameters['mu'][2][0])/np.sqrt(parameters['sigma'][2][0,0]) zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][2],parameters['sigma'][2]) if ngeno[0]>1 or ngeno[2]>1: threshold1=1 threshold2=3 else: threshold1=.5 threshold2=2 if genotype_likelihoods: gl = working['gl'] else: gl = None if retry<4 and (zstat1<threshold1 or zstat2<threshold2): # print retry,ngeno,zstat1,zstat2 if doseq and retry<3: ii = np.logical_and(arr.max(1)>6,np.logical_not(np.isnan(seq[:,0]))) dosage = seq[ii,1:].sum(1) # print pearsonr(arr[ii,0],dosage),pearsonr(arr[ii,1],dosage) pval1 = pearsonr(arr[ii,0],dosage)[1] pval2 = pearsonr(arr[ii,1],dosage)[1] if pval1<.0001 and pval2<.0001: for j in range(3): wt = seq[ii,j] start['mu'][j] = (arr[ii].T*wt).sum(1)/wt.sum() retry=3 nrit=niteration return chiamante_mainloop(arr,seq,prior,start,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=nrit, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=retry,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) try: newmu = np.median(arr[arr.max(1)>6],0) except: newmu = np.median(arr,0) muidx = newmu.argmax()*2 if retry<3: nrit=niteration retry=3 else: nrit=1 retry=4 start['mu'][muidx] = newmu#deepcopy(parameters['mu'][muidx]) expected_mean(muidx,start['mu'],prior,parameters['model']) tmpsigma = start['sigma'][muidx] if (arr.max(1)>7).sum()>3: start['sigma'][muidx] = np.cov(arr[arr.max(1)>7].T)# np.diag((1,1))*start['sigma'][muidx].max() ret = chiamante_mainloop(arr,seq,prior,start,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=nrit, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=retry,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) start['sigma'][muidx] = tmpsigma return ret if ngeno[0]<1 and retry<3: ng = sum(ngeno) af = ngeno[1]/(ng*2) eg = (af**2)*ng if eg>1: # print ngeno,"eg =",eg parameters['mu'][0][1] = np.min(arr[arr.max(1)>6,1]) return chiamante_mainloop(arr,seq,prior,parameters,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=niteration, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=3,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) if ngeno[2]<1 and retry<3: ng = sum(ngeno) af = ngeno[1]/(ng*2) eg = (af**2)*ng if eg>1: # print ngeno,"eg =",eg parameters['mu'][2][0] = np.min(arr[arr.max(1)>6,0]) return chiamante_mainloop(arr,seq,prior,parameters,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=niteration, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=3,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) if mu[0][0]<mu[1][0] and retry <4: # print "Fixing mu_0" parameters['mu'][0] = parameters['mu'][2][[1,0]] return chiamante_mainloop(arr,seq,prior,parameters,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=1, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=4,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) if mu[2][1]<mu[1][1] and retry<4: # print "Fixing mu_2" parameters['mu'][2] = parameters['mu'][0][[1,0]] return chiamante_mainloop(arr,seq,prior,parameters,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=1, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=4,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) # monomorphic checks mono = False calls = working['new_g'].argmax(1) calls[working['new_g'].max(1)<0.9]=3 calls[working['arrfail']>0.1]=3 theta_het = 2.*np.arctan2(2.**parameters['mu'][1][0],2.**parameters['mu'][1][1])/np.pi if (calls==0).sum()>0: # weird thetas if theta[calls==0].min() < theta_het: # print 'hom0 < het centroid',theta[calls==0].min(),theta_het mono = True if (calls==2).sum()>0: if theta[calls==2].max() > theta_het: # print 'hom2 > het centroid',theta[calls==2].max(),theta_het mono = True if not doseq and (parameters['mu'][1].min() < 6 or (parameters['mu'][1].min() < 8 and ngeno[1]<4)): # print "very low het centroid",parameters['mu'][0],parameters['mu'][1],parameters['mu'][2] mono = True if ngeno[0]<1 or ngeno[2]<1: if ngeno[0]>ngeno[2]: zstat1 = (parameters['mu'][1][0]-parameters['mu'][0][0])# /np.sqrt(parameters['sigma'][0][0,0]) else: zstat1 = (parameters['mu'][1][1]-parameters['mu'][2][1])# /np.sqrt(parameters['sigma'][2][1,1]) # print zstat1 if zstat1 < -3: # print 'ridiculuously low het, returning monomorphic fit',zstat1 mono = True if mono: # site looks monomorphic (or very low MAF) monofit = monomorphic_fit(prior,start,arr,seq,working,arrfaildens,df=df,niteration=niteration,tol=.1) if retry < 5: for j in range(3): parameters['mu'][j] = monofit['parameters']['mu'][j] parameters['sigma'][j] = monofit['parameters']['sigma'][j] return chiamante_mainloop(arr,seq,prior,parameters,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=1, hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=5,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods) else: parameters = monofit['parameters'] for j in range(3): if model==4: if df==None: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100) else: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j]) elif df==None: working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j]) else: chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray']) if not doseq: chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl) else: chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'], doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl) return dict(parameters=monofit['parameters'], loglik=loglik[:i],logpp=logpp[:i] ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail'] ,u=working['u'],niteration=-1) #everything looks fine! returning the original fit return dict(parameters=parameters, loglik=loglik[:i],logpp=logpp[:i] ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail'] ,u=working['u'],niteration=i)
def chiamante_mainloop(arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration=50,tolerance=1e-3,df=None, hwe_prior=True,calculate_logpp=False,C=True,flip=False,retry=0,g_corrected=None,genotype_likelihoods=False): if not arr.shape[1] == 2: raise ValueError("Array does not have 2 columns.") if df != None: if len(df) != 3: raise ValueError("invalid degrees of freedom") else: df = [float(val) for val in df] if df!=None: if sum(df)==0: df = None if genotype_likelihoods: gl = working['gl'] else: gl = None model = prior['model'] nsample = len(arr) npop = len(popidx) working['arrfail_lik'][:] = arrfaildens arr2 = np.power(2,arr) r = arr2.sum(1) working['arrfail_lik'][r<36.] = 1. working['u'][:] = 1. if seq==None: doseq=False else: doseq=True working['seq_missing'] = np.where(np.isnan(seq[:,0]))[0] working['seq_not_missing'] = np.where(np.logical_not(np.isnan(seq[:,0])))[0] nseq = len(working['seq_not_missing']) if hwe_prior and not len(prior['raf_alpha'])==npop: print "Length of raf_alpha not consistent with number of populations" if flip: flip_raf_prior(prior) K=3 #number of classes logpp = np.zeros(niteration) loglik = np.zeros(niteration) parameters = dict(mu = deepcopy(start['mu']), sigma = deepcopy(start['sigma']), alpha = [start['alpha'] for i in range(npop)], eta_array = deepcopy(start['eta_array']), eta_seq = deepcopy(start['eta_seq']),df=df,model=model) if hwe_prior: if type(start['p'])==np.ndarray and len(start['p'])==3: parameters['raf']= [start['raf'] for i in range(npop)] parameters['p'] = [np.array([1./3. for i in range(3)]) for idx in range(npop)] elif type(start['p'])==list and len(start['p'])==npop: if len(start['raf']) != len(start['p']): print "len(start[raf]) != len(start[p])" exit() parameters['p'] = deepcopy(start['p']) parameters['raf'] = deepcopy(start['raf']) else: print "Length of genotype frequencies does not match npop" exit() else: if type(start['p'])==np.ndarray: parameters['p']=deepcopy(start['p']) else: print 'start[p] dont look right' exit() if df != None: if type(df)=='int': df = [df for idx in range(3)] if type(df)=='list': if len(df)!=3: print "df is not a list of length 3 or a scalar" raise ValueError("df is not a list of length 3 or a scalar") for i in range(niteration): for j in range(3): if model==4: if df==None: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100) else: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j]) elif df==None: working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j]) else: chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray']) if C:#there was initially a C version for the EM step but it turned out to be no faster! print "not implemented" quit() else: if not doseq: chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl) parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray']) else: chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'], doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl) parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray'], doseq,working['seq_not_missing'],nseq,working['seqfail'])#M-STEP if niteration==1: break elif i>1 and (abs(working['old_g']-working['new_g'])).max() < tolerance: break else: tmp_g = working['new_g'] working['new_g'] = working['old_g'] working['old_g'] = tmp_g i+=1 ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)] #perform various QC checks if we are on the last iteration and if the site is not monomorphic(convergence to monomorphic tends to indicate nothing went wrong) if False: if niteration>1 and round(max(ngeno))<nsample: return chiamante_qc(parameters,loglik,logpp,i ,arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df,hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods=genotype_likelihoods) if not hwe_prior: parameters['raf'] = parameters['p'][0] + .5*parameters['p'][1] if flip: flip_raf_prior(prior) return dict(parameters=parameters, loglik=loglik[:i],logpp=logpp[:i] ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail'] ,u=working['u'],niteration=i)