def _gt_allele_count(geno):
    '''
    Return the sum of the first two allele indices of a VCF GT string.

    Alleles may be separated by '|', '\\' or '/'.  A genotype with no
    separator (a single allele) contributes just that allele.  Raises
    ValueError if an allele is not an integer.
    '''
    alleles = geno.replace('|', '/').replace('\\', '/').split('/')
    return sum(int(a) for a in alleles[:2])


def parse1KGvcf(vcffile, plines, genotypedboutput, refdboutput, altdboutput,
                arraysnpfile='mark/arraypool/25M1.1'):
    '''
    Extract per-sample allele counts for array SNPs from a 1000 Genomes VCF.

    Example:
        parse1KGvcf('../1000GenomesData/CEU.low_coverage.2010_09.genotypes.vcf',
                    p1lines, 'testoutput', 'testoutputRef', 'testoutputAlt')

    DEFAULT: only take Illumina 2.5M snps from genotype, assume they are
    consistent enough.

    Parameters:
        vcffile          -- path of the VCF file to read.
        plines           -- passed to gl.jsonload(); the loaded object is
                            iterated to obtain the sample names to look up.
        genotypedboutput -- path of the text file to write.  One line per
                            kept SNP: 'chrom:pos' followed by one
                            comma-separated allele count per sample.  SNPs
                            whose counts sum to 0 are not written.
        refdboutput      -- destination given to gl.jsondump() for the
                            {'chrom:pos': REF} mapping.
        altdboutput      -- destination given to gl.jsondump() for the
                            {'chrom:pos': first ALT} mapping.
        arraysnpfile     -- tab-separated array SNP list; 0-based columns 9
                            and 10 are joined as 'chrom:pos' to form the set
                            of SNPs to keep.

    Records without a 'GP' INFO entry are skipped.  The SNP key is built
    from the first two ':'-separated fields of 'GP'.
    '''
    # Build the set of array SNP keys to keep.
    asnps = set()
    with open(arraysnpfile, 'r') as afile:
        for l in afile:
            cols = l.split('\t')
            asnps.add(cols[9] + ':' + cols[10])
    # Single-argument print(...) prints the same under Python 2's print
    # statement and as a Python 3 function call.
    print(len(asnps))

    ref = {}
    alt = {}
    with open(vcffile, 'r') as vfile:
        vcf_reader = vcf.Reader(vfile)
        poollines = gl.jsonload(plines)
        with open(genotypedboutput, 'w') as outputfile:
            for record in vcf_reader:
                try:
                    gp = record.INFO['GP'].split(':')
                except KeyError:
                    continue
                snp = gp[0] + ':' + gp[1]
                if snp not in asnps:
                    continue

                ref[snp] = str(record.REF)
                alt[snp] = str(record.ALT[0])

                counts = []
                total = 0
                for s in poollines:
                    call = record.genotype(s)
                    try:
                        geno = call['GT']
                    except KeyError:
                        # NOTE(review): no column is emitted for this sample,
                        # so the row ends up shorter than the others --
                        # confirm this is intended.
                        print("no genotype")
                        continue
                    if geno:
                        an = _gt_allele_count(geno)
                        total += an
                        counts.append(str(an))
                    else:
                        # Missing/empty genotype is recorded as 0.
                        counts.append('0')

                # Only SNPs with a non-zero total allele count are written.
                if total > 0:
                    outputfile.write(','.join([snp] + counts) + '\n')

    gl.jsondump(ref, refdboutput)
    gl.jsondump(alt, altdboutput)
def _array_report_columns(kwargs):
    '''
    Resolve the (snp, chr, pos, theta) column indices described by kwargs.

    If kwargs has a 'header' entry, the indices are the positions of "SNP",
    "Chr", "Position" and "Theta" in it (ValueError if one is absent).
    Otherwise the 'snp', 'chr', 'pos' and 'theta' entries are converted with
    int().  Returns None, after printing a message, when neither is present.
    '''
    try:
        header = kwargs['header']
    except KeyError:
        try:
            return (int(kwargs['snp']), int(kwargs['chr']),
                    int(kwargs['pos']), int(kwargs['theta']))
        except KeyError:
            print("No header or column numbers provided provided")
            return None
    # The header is split on the two-character sequence backslash + 't',
    # not on a real tab.
    # NOTE(review): presumably the header arrives escaped -- confirm.
    h = header.split('\\t')
    print(h)
    return (h.index("SNP"), h.index("Chr"), h.index("Position"),
            h.index("Theta"))


def getarraysnps(report, fgenoref, fgenoalt, outname, kwargs):
    '''
    Write 'chrom:pos<TAB>f' lines for array SNPs whose alleles match the
    reference/alternate alleles produced by parse1KGvcf.

    Test this with MKReport1bysnps.txt and the output of parse1KGvcf:
        getarraysnps('MKReport1bysnps.txt', 'testoutputRef', 'testoutputAlt',
                     'arraysnps.out', {'header': ...})

    In this version we totally ignore complements.
    The array shouldn't have duplicates (it does).  We take the last one
    mentioned in the array report, because those are the 1kg ones rather
    than rsid; this is why the report is walked in reverse and only the
    first occurrence of each position is written.

    Parameters:
        report   -- path of the tab-separated array report.
        fgenoref -- passed to gl.jsonload(); maps 'chrom:pos' to ref allele.
        fgenoalt -- passed to gl.jsonload(); maps 'chrom:pos' to alt allele.
        outname  -- path of the output file (always created/truncated).
        kwargs   -- dict with either 'header', or the column numbers
                    'snp', 'chr', 'pos' and 'theta'.

    Only chromosomes '1'..'22' are considered.  f is Theta when the report's
    alleles equal (ref, alt), 1 - Theta when they are swapped; other SNPs,
    and SNPs with f equal to 0 or 1, are skipped.  Lines that are too short,
    positions absent from the ref/alt maps, and non-numeric Theta values are
    skipped as well.  If no column information is supplied, a message is
    printed and the output file is left empty.
    '''
    print(report)
    print(kwargs)
    with open(report) as infile:
        lines = infile.readlines()

    genoref = gl.jsonload(fgenoref)
    genoalt = gl.jsonload(fgenoalt)

    autosomes = set(str(c) for c in range(1, 23))
    snps = set()

    with open(outname, 'w') as output:
        columns = _array_report_columns(kwargs)
        if columns is None:
            # Without column indices no line can be parsed.
            return
        snpi, chri, posi, thetai = columns

        for l in reversed(lines):
            t = l.split('\t')
            try:
                chrom = t[chri]
                if chrom not in autosomes:
                    continue
                snppos = chrom + ':' + t[posi]
                # The SNP column is indexed as if it looks like '[A/G]':
                # second character before '/', first character after it.
                alleles = t[snpi].split('/')
                ref = alleles[0][1]
                alt = alleles[1][0]
                if genoref[snppos] == ref and genoalt[snppos] == alt:
                    f = float(t[thetai])
                elif genoref[snppos] == alt and genoalt[snppos] == ref:
                    f = 1 - float(t[thetai])
                else:
                    continue
            except (IndexError, KeyError, ValueError):
                # Malformed line, SNP absent from the ref/alt maps, or
                # non-numeric Theta: skip this line.
                continue

            if f == 0 or f == 1:
                continue
            if snppos not in snps:
                snps.add(snppos)
                output.write(snppos + '\t' + str(f) + '\n')