def calculate_FETpvalues_readcounts(vcffile,poolstatustable,poolsize): maxMAF = 0.01; # maximum minor allele frequency, variants above this are not used poolDX = determine_poolphenotype(vcffile,poolstatustable); # phenotype status for each pool variants =0; File = open(vcffile); for line in File: if line[0] == '#': continue; variants +=1; variant = line.strip().split('\t'); samples = len(variant)-9; chrom = variant[0]; position = int(variant[1]); refallele = variant[3]; varalleles = variant[4].split(','); if len(varalleles) ==2: triallelic=1; else: triallelic=0; if len(varalleles) > 2: print >>sys.stderr, '##triallelic',chrom,position,refallele,varalleles; continue; # ignore quad or greater allelic variants for now #if triallelic ==0 or position != 64527465: continue; H0 = 0.0; H1 = 0.0; H2=0.0; D0 = 0.0; D1 = 0.0; D2=0.0; casepools=0; controlpools=0; for i in xrange(samples): if poolDX[i] == -1: continue; # ignore pool for case control analysis try: counts = variant[i+9].split(':'); readsf = counts[2].split(','); readsr = counts[3].split(','); total = int(readsf[0]) + int(readsf[1]) + int(readsr[0]) + int(readsr[1]); if triallelic==1: total += int(readsf[2])+int(readsr[2]); # tri-allelic alt = (float(readsf[1]) + float(readsr[1]))*poolsize; alt /= (total+0.0001); if triallelic==1: alt2 = (float(readsf[2]) + float(readsr[2]))*poolsize; alt2 /= (total+0.0001); else: alt2=0.0; sf = float(2*total)/(2*total + poolsize); if poolDX[i][0] == 0: #H0 += poolsize; H1 = alt; H0 += sf*float(poolsize); H1 += sf*alt; H2 += sf*alt2; controlpools +=1; # 0/1 bit of allelecounts stores case-control status... elif poolDX[i][0] >= 1: D0 += sf*float(poolsize); D1 += sf*alt; D2 += sf*alt2; casepools +=1; #D0 += poolsize; D1 += alt; except IndexError: print 'Exception',i,samples,variant; #if H1 + D1 < 10 or (H0-H1 + D0-D1 <10): lowcountvariants +=1; continue; pvalue = [0,0,0]; pvalue2=[0,0,0]; if H1 + D1 >= 4: pvalue = fet(int(round(H1,0)),int(round(H0,0)),int(round(D1,0)),int(round(D0,0))); print '%0.2f %.2f:%.2f %.2f:%.2f' %(pvalue[0],H0,H1,D0,D1),#'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), if triallelic ==1 and H2+D2 >=4: pvalue2 = fet(int(round(H2,0)),int(round(H0,0)),int(round(D2,0)),int(round(D0,0))); print '2nd-allele %0.2f %.2f:%.2f %.2f:%.2f' %(pvalue2[0],H0,H2,D0,D2), print '%3s %9s %10s %10s %20s ' %(variant[0],variant[1],variant[3],variant[4],variant[5]+':'+variant[6]), print variant[7];
def calculate_FETpvalues_allelecounts(vcffile, poolstatustable, poolsize): poolDX = determine_poolphenotype(vcffile, poolstatustable) # phenotype status for each pool poolDX_random = [] missing = [] for i in xrange(len(poolDX)): if poolDX[i][0] == -1: missing.append(i) poolDX_random.append(poolDX[i]) """ """ random.shuffle(poolDX_random) #print missing,len(poolDX_random); j = 0 for i in xrange(len(poolDX_random)): if poolDX_random[i][0] == -1 and j < len(missing): #print j,poolDX_random[i],i; poolDX_random[i] = poolDX_random[missing[j]] poolDX_random[missing[j]] = [-1] j += 1 for i in xrange(len(poolDX)): poolDX[i] = poolDX_random[i] print >> sys.stderr, poolDX_random variants = 0 trivariants = 0 File = open(vcffile) for line in File: if line[0] == '#': continue variant = line.strip().split('\t') samples = len(variant) - 9 chrom = variant[0] position = int(variant[1]) refallele = variant[3] varalleles = variant[4].split(',') if len(varalleles) == 2: triallelic = 1 trivariants += 1 else: triallelic = 0 variants += 1 if len(varalleles) >= 3 or len(varalleles) == 4: #print >>sys.stderr, '##triallelic',chrom,position,refallele,varalleles,variant[2]; continue # ignore multi-allelic variants for now ## healthy (controls), D (disease), E (early onset), DC (diabetic complications) H0 = 0.00001 H1 = 0.0 H2 = 0.0 D0 = 0.00001 D1 = 0.0 D2 = 0.0 E0 = 0.00001 E1 = 0.0 E2 = 0 DC0 = 0.00001 DC1 = 0 DC2 = 0 casepools = 0 controlpools = 0 for i in xrange(samples): if poolDX_random[i][0] == -1: continue # ignore pool for case control analysis try: genotypes = variant[i + 9].split(':') if triallelic == 0: MLAC = int(genotypes[0]) #meanAC = float(genotypes[2]); QAC = float(genotypes[1]) MLAC2 = 0 else: MLAC = int(genotypes[0].split(',')[0]) #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); MLAC2 = int(genotypes[0].split(',')[1]) #if genotypes[3] == '-inf': varAF = -0.1*QAC; #else: varAF = float(genotypes[3]); if poolDX_random[i][0] == 0: H0 += poolsize H1 += MLAC H2 += MLAC2 elif poolDX_random[i][0] >= 1: D0 += poolsize D1 += MLAC D2 += MLAC2 if poolDX_random[i][0] == 2: E0 += poolsize E1 += MLAC E2 += MLAC2 if len(poolDX_random[i]) >= 2 and poolDX_random[i][1] == 3: DC0 += poolsize DC1 += MLAC DC2 += MLAC2 except IndexError: print 'Exception', i, samples, variant nopvalue_calc = 0 if (H1 + D1 >= 5): pvalue = fet(int(round(H1, 0)), int(round(H0, 0)), int(round(D1, 0)), int(round(D0, 0))) else: pvalue = [0, 0, 0] nopvalue_calc = 1 ## calculate p-values between early onset and controls if (H1 + E1 >= 5): pvalue1 = fet(int(round(H1, 0)), int(round(H0, 0)), int(round(E1, 0)), int(round(E0, 0))) else: pvalue1 = [0, 0, 0] ## calculate p-value between early onset and late-onset if (D1 + E1 >= 5): pvalue2 = fet(int(round(D1, 0)), int(round(D0, 0)), int(round(E1, 0)), int(round(E0, 0))) else: pvalue2 = [0, 0, 0] if nopvalue_calc == 1: continue print '%0.2f %.1f/%.1f %.1f/%.1f %.1f/%.1f %.1f/%.1f' % ( pvalue[0], H1, H0, D1, D0, E1, E0, DC1, DC0), #'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), print '%0.2f %0.2f' % (pvalue1[0], pvalue2[0]), print '%0.4f %0.4f %0.4f %0.4f' % (H1 / H0, D1 / D0, E1 / E0, DC1 / DC0), if triallelic == 1: if H2 + D2 >= 4: pvalue_tri = fet(int(round(H2, 0)), int(round(H0, 0)), int(round(D2, 0)), int(round(D0, 0))) else: pvalue_tri = [0, 0, 0] print 'TRIALLELIC:%0.2f:%.1f:%.1f' % (pvalue_tri[0], H2, D2), else: print '-', print '%3s %9s %s %10s %10s %8s %10s' % ( variant[0], variant[1], variant[2], variant[3], variant[4], variant[5], variant[6]), print variant[7], if PRINTGENOTYPES == 1: for i in xrange(samples): if poolDX_random[i][0] == -1: continue # ignore pool for case control analysis genotypes = variant[i + 9].split(':') print variant[i + 9], #MLAC = int(genotypes[0]); meanAC = float(genotypes[2]); QAC = float(genotypes[1]); #if genotypes[3] == '-inf': varAF = -0.1*QAC; #else: varAF = float(genotypes[3]); #print '%2d:%0.2f:%0.2f' %(MLAC,meanAC,math.sqrt(pow(10,varAF))), print print >> sys.stderr, "variants evaluated", variants, "triallelic or more", trivariants
def calculate_FETpvalues_allelecounts(vcffile,poolstatustable): poolDX = determine_poolphenotype(vcffile,poolstatustable); # phenotype status for each pool variants =0; trivariants =0; File = open(vcffile); for line in File: if line[0] == '#': continue; variant = line.strip().split('\t'); samples = len(variant)-9; chrom = variant[0]; position = int(variant[1]); refallele = variant[3]; varalleles = variant[4].split(','); if len(varalleles) ==2: triallelic=1; trivariants +=1; else: triallelic=0; variants +=1; if len(varalleles) >= 3 or len(varalleles) ==4: #print >>sys.stderr, '##triallelic',chrom,position,refallele,varalleles,variant[2]; continue; # ignore multi-allelic variants for now ## healthy (controls), D (disease), E (early onset), DC (diabetic complications) H0 = 0.0; H1 = 0.0; H2 = 0.0; D0 = 0.0; D1 = 0.0; D2 = 0.0; E0 = 0.0; E1= 0.0; E2 = 0; casepools=0; controlpools=0; for i in xrange(samples): if poolDX[i][0] == -1 or variant[i+9].split(':')[0] == '.': continue; # ignore pool for case control analysis try: poolsize = poolDX[i][1] genotypes = variant[i+9].split(':'); if triallelic==0: MLAC = int(genotypes[0].split(',')[1]); #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); QAC = float(genotypes[1]); MLAC2 = 0; else: MLAC = int(genotypes[0].split(',')[1]); #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); QAC = float(genotypes[1]); MLAC2 = int(genotypes[0].split(',')[2]); #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); #else: varAF = float(genotypes[3]); if poolDX[i][0] == 0: H0 += poolsize; H1 += MLAC; H2 += MLAC2; elif poolDX[i][0] >= 1: D0 +=poolsize; D1 += MLAC; D2 += MLAC2; if poolDX[i][0] == 2: E0 += poolsize; E1 += MLAC; E2 += MLAC2; except IndexError: print 'Exception',i,samples,genotypes; if (H1 + D1 >= 4): pvalue = fet(int(round(H1,0)),int(round(H0,0)),int(round(D1,0)),int(round(D0,0))); else: pvalue = [0,0,0]; ## calculate p-values between early onset and controls if (H1+E1 >=4): pvalue1 = fet(int(round(H1,0)),int(round(H0,0)),int(round(E1,0)),int(round(E0,0))); else: pvalue1 = [0,0,0]; ## calculate p-value between early onset and late-onset if (D1+E1 >=4): pvalue2 = fet(int(round(D1,0)),int(round(D0,0)),int(round(E1,0)),int(round(E0,0))); else: pvalue2 = [0,0,0]; print '%0.2f %.1f/%.1f %.1f/%.1f %.1f/%.1f %.1f/%.1f' %(pvalue[0],H1,H0,D1,D0,E1,E0,DC1,DC0),#'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), print '%0.2f %0.2f' %(pvalue1[0],pvalue2[0]), print '%0.4f %0.4f %0.4f %0.4f' %(H1/(H0+epsilon),D1/(D0+epsilon),E1/(E0+epsilon),DC1/(DC0+epsilon)), #sys.exit() if triallelic ==1: if H2 + D2 >=4: pvalue_tri = fet(int(round(H2,0)),int(round(H0,0)),int(round(D2,0)),int(round(D0,0))); else: pvalue_tri = [0,0,0]; print 'TRIALLELIC:%0.2f:%.1f:%.1f' %(pvalue_tri[0],H2,D2), else: print '-', print '%3s %9s %s %10s %10s %8s %10s' %(variant[0],variant[1],variant[2],variant[3],variant[4],variant[5],variant[6]), print variant[7], print; print >>sys.stderr, "variants evaluated",variants,"triallelic or more",trivariants;
def calculate_FETpvalues_new(vcffile, poolstatustable, poolsize): maxMAF = 0.01 # maximum minor allele frequency, variants above this are not used poolDX = [] # phenotype status for each pool Variants = [] File = open(vcffile) for line in File: if line[0] == '#' and line[1] == '#': continue variant = line.strip().split('\t') if variant[0] == '#CHROM': for i in xrange(9, len(variant)): sampleid = variant[i].split('/')[-1].split('.')[0] #print sampleid; try: status = poolstatustable[sampleid] except KeyError: status = -1 poolDX.append(status) print >> sys.stderr, status, #samples = len(variant)-9; print 'samples',samples,len(variant); print >> sys.stderr continue samples = len(variant) - 9 chrom = variant[0] position = int(variant[1]) refallele = variant[3] varalleles = variant[4].split(',') if len(varalleles) == 2: triallelic = 1 else: triallelic = 0 if len(varalleles) > 2: continue # ignore multi-allelic variants for now #if triallelic ==0 or position != 64527465: continue; H0 = 0.0 H1 = 0.0 H2 = 0.0 D0 = 0.0 D1 = 0.0 D2 = 0.0 casepools = 0 controlpools = 0 for i in xrange(samples): if poolDX[i] == -1: continue # ignore pool for case control analysis try: counts = variant[i + 9].split(':') readsf = counts[2].split(',') readsr = counts[3].split(',') total = int(readsf[0]) + int(readsf[1]) + int(readsr[0]) + int( readsr[1]) if triallelic == 1: total += int(readsf[2]) + int(readsr[2]) # tri-allelic alt = (float(readsf[1]) + float(readsr[1])) * poolsize alt /= (total + 0.0001) if triallelic == 1: alt2 = (float(readsf[2]) + float(readsr[2])) * poolsize alt2 /= (total + 0.0001) else: alt2 = 0.0 sf = float(2 * total) / (2 * total + poolsize) if poolDX[i] == 0: #H0 += poolsize; H1 = alt; H0 += sf * float(poolsize) H1 += sf * alt H2 += sf * alt2 controlpools += 1 # 0/1 bit of allelecounts stores case-control status... elif poolDX[i] >= 1: D0 += sf * float(poolsize) D1 += sf * alt D2 += sf * alt2 casepools += 1 #D0 += poolsize; D1 += alt; except IndexError: print 'Exception', i, samples, variant #if H1 + D1 < 10 or (H0-H1 + D0-D1 <10): lowcountvariants +=1; continue; pvalue = [0, 0, 0] pvalue2 = [0, 0, 0] if H1 + D1 >= 4: pvalue = fet(int(round(H1, 0)), int(round(H0, 0)), int(round(D1, 0)), int(round(D0, 0))) print '%0.2f %.2f:%.2f %.2f:%.2f' % ( pvalue[0], H0, H1, D0, D1), #'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), if triallelic == 1 and H2 + D2 >= 4: pvalue2 = fet(int(round(H2, 0)), int(round(H0, 0)), int(round(D2, 0)), int(round(D0, 0))) print '2nd-allele %0.2f %.2f:%.2f %.2f:%.2f' % (pvalue2[0], H0, H2, D0, D2), print '%3s %9s %10s %10s %20s ' % (variant[0], variant[1], variant[3], variant[4], variant[5] + ':' + variant[6]), print variant[7]
def calculate_FETpvalues(vcffile, poolDX, poolsize): File = open(vcffile) for line in File: if line[0] == '#' and line[1] == '#': continue variant = line.strip().split() if variant[0] == '#CHROM': samples = len(variant) - 9 #samplelist = variant[9+offset:]; samples = len(samplelist); #print samplelist,samples; continue if 'SNP' in variant[2] or 'INDEL' in variant[2]: H0 = 0 H1 = float(0) D0 = 0 D1 = float(0) allelecounts = [] allelecountsD = [] poolsH = 0 poolsD = 0 for i in xrange(samples): if poolDX[i] == -1: continue # ignore pool for case control analysis counts = variant[i + 9].split(':') total = int(counts[0]) + int(counts[1]) #if total < 240: continue; alt = float(counts[1]) * poolsize alt /= (total + 0.01) #if alt < 0.5: alt = 0; #if alt > 0.5 and alt < 1: alt = 1; if poolDX[i] == 0: H0 += poolsize H1 += alt allelecounts.append([0, 0, int(counts[0]), int(counts[1])]) poolsH += 1 # 0/1 bit of allelecounts stores case-control status... elif poolDX[i] == 1: D0 += poolsize D1 += alt allelecounts.append([1, 1, int(counts[0]), int(counts[1])]) poolsD += 1 # print counts,total,alt; if H1 + D1 >= 5: #print allelecounts; allelecounts.sort() HAF = estimate_AF(allelecounts, poolsH, poolsize / 2, 0.001, 0) DAF = estimate_AF(allelecounts, poolsD, poolsize / 2, 0.001, poolsH) """ for p in xrange(HAF[2],HAF[3]): print '%3d %2.2f ' %(p,HAF[0][p]), print 'Binomial maxll',HAF[1],HAF[0][HAF[1]]; for p in xrange(DAF[2],DAF[3]): print '%3d %2.2f ' %(p,DAF[0][p]), print 'Binomial maxll',DAF[1],DAF[0][DAF[1]]; """ pvalue = fet(int(round(H1, 0)), int(H0), int(round(D1, 0)), int(D0)) if pvalue[0] < -2: pvalueperm = probabilisticFET(allelecounts, poolsH, poolsD, poolsize) # output from allele frequency estimation print 'PERM', else: pvalueperm = 1 print 'FET', print math.log( pvalueperm, 10 ), pvalue[0], variant[0], variant[1], variant[2], variant[ 3], variant[4], variant[5], variant[6], variant[7], print '%2.1f %2.1f %2.1f %2.1f' % ( H0, H1, D0, D1 ), #'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), if pvalue < 0.001: print 'LOW' else: print
def calculate_FETpvalues_allelecounts(vcffile,poolstatustable,poolsize,VCFfile2): [SAMPLES,VARLIST,SAMPLELIST] = read_phase3_VCF(VCFfile2); poolDX = determine_poolphenotype(vcffile,poolstatustable); # phenotype status for each pool variants =0; trivariants =0; File = open(vcffile); for line in File: if line[0] == '#': continue; variant = line.strip().split('\t'); samples = len(variant)-9; chrom = variant[0]; position = int(variant[1]); refallele = variant[3]; varalleles = variant[4].split(','); if len(varalleles) ==2: triallelic=1; trivariants +=1; else: triallelic=0; variants +=1; if len(varalleles) >= 3 or len(varalleles) ==4: #print >>sys.stderr, '##triallelic',chrom,position,refallele,varalleles,variant[2]; continue; # ignore multi-allelic variants for now ## healthy (controls), D (disease), E (early onset), DC (diabetic complications) H0 = 0.0; H1 = 0.0; H2 = 0.0; D0 = 0.0; D1 = 0.0; D2 = 0.0; E0 = 0.0; E1= 0.0; E2 = 0; DC0 = 0; DC1 = 0; DC2 = 0; casepools=0; controlpools=0; for i in xrange(samples): if poolDX[i][0] == -1: continue; # ignore pool for case control analysis try: genotypes = variant[i+9].split(':'); if triallelic==0: MLAC = int(genotypes[0]); #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); MLAC2 = 0; else: MLAC = int(genotypes[0].split(',')[0]); #meanAC = float(genotypes[2]); QAC = float(genotypes[1]); MLAC2 = int(genotypes[0].split(',')[1]); #if genotypes[3] == '-inf': varAF = -0.1*QAC; #else: varAF = float(genotypes[3]); if poolDX[i][0] == 0: H0 += poolsize; H1 += MLAC; H2 += MLAC2; elif poolDX[i][0] >= 1: D0 +=poolsize; D1 += MLAC; D2 += MLAC2; if poolDX[i][0] == 2: E0 += poolsize; E1 += MLAC; E2 += MLAC2; if len(poolDX[i]) >=2 and poolDX[i][1] == 3: DC0 += poolsize; DC1 += MLAC; DC2 += MLAC2; except IndexError: print 'Exception',i,samples,variant; ## calculate p-value between cases and controls if (H1 + D1 >= 4): pvalue = fet(int(round(H1,0)),int(round(H0,0)),int(round(D1,0)),int(round(D0,0))); else: pvalue = [0,0,0]; ## calculate p-values between early onset and controls if (H1+E1 >=4): pvalue1 = fet(int(round(H1,0)),int(round(H0,0)),int(round(E1,0)),int(round(E0,0))); else: pvalue1 = [0,0,0]; ## calculate p-value between early onset and late-onset if (D1+E1 >=4): pvalue2 = fet(int(round(D1,0)),int(round(D0,0)),int(round(E1,0)),int(round(E0,0))); else: pvalue2 = [0,0,0]; if (chrom,position,refallele,varalleles[0]) in VARLIST: VAR2 = VARLIST[(chrom,position,refallele,varalleles[0])]; print 'foundvar',VAR2, E11 = E1 - VAR2[1]; E01 = E0 - VAR2[1] - VAR2[0]; if E11 < 0: E11 = 0; D11 = D1 - VAR2[1]-VAR2[3]; if D11 < 0: D11 = 0; D01 = D0 - VAR2[1] - VAR2[0] - VAR2[2] -VAR2[3]; H01=H0; H11 = H1; if pvalue[0] < -0.5: ## low p-values new_pvalue = fet(int(round(H1,0)),int(round(H0,0)),int(round(D11,0)),int(round(D01,0))); else: new_pvalue = pvalue; if pvalue[0] < -0.5: ## low p-values new_pvalue1 = fet(int(round(H1,0)),int(round(H0,0)),int(round(E11,0)),int(round(E01,0))); else: new_pvalue1 = pvalue1; print ' %0.2f %0.2f %d/%d %d/%d %d/%d ' %(new_pvalue[0],new_pvalue1[0],int(H1),int(H0),int(D11),int(D01),int(E11),int(E01)); print 'corr-new %0.2f %0.2f %0.4f %0.4f %0.4f ' %(new_pvalue[0],new_pvalue1[0],H11/H01,D11/D01,E11/E01), else: #print 'missing'; ## 88 for early onset, 218 for late onset #E11 = E1; E01 = E0-88; D11 = D1; D01 = D0-218-88; #print 'corrected -\t-\t H11 = H1; H01 = H0; D11 = D1; D01 = D0; E11 = E1; E01 = E0; print 'corr-orig %0.2f %0.2f %0.4f %0.4f %0.4f ' %(pvalue[0],pvalue1[0],H11/H01,D11/D01,E11/E01), #print 'corrected -\t-\t-\t-', print '%0.2f %0.2f %.1f/%.1f %.1f/%.1f %.1f/%.1f' %(pvalue[0],pvalue1[0],H1,H0,D1,D0,E1,E0),#'Control',float(H1)/(H0+0.001),'Case',float(D1)/(D0+0.001), #print '%0.2f %0.2f' %(pvalue1[0],pvalue2[0]), #print '%0.4f %0.4f %0.4f %0.4f' %(H1/H0,D1/D0,E1/E0,DC1/DC0), print '%0.4f %0.4f %0.4f' %(H1/H0,D1/D0,E1/E0), if triallelic ==1: if H2 + D2 >=4: pvalue_tri = fet(int(round(H2,0)),int(round(H0,0)),int(round(D2,0)),int(round(D0,0))); else: pvalue_tri = [0,0,0]; print 'TRIALLELIC:%0.2f:%.1f:%.1f' %(pvalue_tri[0],H2,D2), else: print '-', print '%3s %9s %s %10s %10s %8s %10s' %(variant[0],variant[1],variant[2],variant[3],variant[4],variant[5],variant[6]), print variant[7], if PRINTGENOTYPES ==1: for i in xrange(samples): if poolDX[i][0] == -1: continue; # ignore pool for case control analysis genotypes = variant[i+9].split(':'); print variant[i+9], #MLAC = int(genotypes[0]); meanAC = float(genotypes[2]); QAC = float(genotypes[1]); #if genotypes[3] == '-inf': varAF = -0.1*QAC; #else: varAF = float(genotypes[3]); #print '%2d:%0.2f:%0.2f' %(MLAC,meanAC,math.sqrt(pow(10,varAF))), print; print >>sys.stderr, "variants evaluated",variants,"triallelic or more",trivariants;