def checkRef(name):
    """Compare every SNP's ref/alt allele with the hg19 base at its position.

    The allele tables ``name + 'RefT'`` and ``name + 'AltT'`` are loaded with
    ``glob.json``. Each key of the ref table is split on ``'pos'``: the first
    part selects the hg19 sequence and the second part, minus one, is the
    index of the base that is read from it (e.g. a key shaped like
    ``'chr1pos12345'``).

    A SNP whose hg19 base equals its alt allele is recorded as a flip; a SNP
    matching neither allele is recorded as an error. Both lists are written
    with ``glob.dump`` to ``name + 'flips'`` and ``name + 'errors'``.

    Returns:
        ``[flip, errors]``, two lists of SNP keys.
    """
    ref_path = name + 'RefT'
    alt_path = name + 'AltT'

    genome = worldbase.Bio.Seq.Genome.HUMAN.hg19(download=True)
    print("Loaded hg19")
    ref_alleles = glob.json(ref_path)
    print("Loaded Ref")
    alt_alleles = glob.json(alt_path)
    print("Loaded Alt")

    flip = []
    errors = []
    for snppos in ref_alleles.keys():
        print(snppos + '\t' + name)
        parts = snppos.split('pos')
        genome_base = str(genome[parts[0]][int(parts[1]) - 1]).upper()
        ref_base = ref_alleles[snppos].upper()
        alt_base = alt_alleles[snppos].upper()

        # Case-insensitive comparison: all three bases were upper-cased above.
        if genome_base == ref_base:
            continue
        if genome_base == alt_base:
            flip.append(snppos)
            continue
        print("Error: Neither Ref nor Alt of SNP corresponds to hg19 sequence")
        errors.append(snppos)

    glob.dump(flip, name + 'flips')
    glob.dump(errors, name + 'errors')
    return [flip, errors]
def parse1KGvcf(vcffile, outputname):
    """Convert a VCF file into a genotype table plus ref/alt allele maps.

    Writes ``outputname + 'Geno'``:
      * for the ``#CHROM`` header line, a tab followed by the comma-joined
        column names from column 10 onwards;
      * for every record that has an INFO entry containing ``'GP'``, the SNP
        key, a tab, and one comma-separated integer per sample column.

    The SNP key is ``'chr' + <a> + 'pos' + <b>`` where ``<a>:<b>`` is the
    value of the first ``GP`` INFO entry (expected shape ``GP=<a>:<b>``).
    Each sample's integer is ``int(field[0]) + int(field[2])``, i.e. the sum
    of the characters at index 0 and 2 of the sample field (presumably the
    two allele codes of a call such as ``0|1``; a non-numeric call like
    ``./.`` raises ``ValueError``, as before).

    The ref (column 4) and alt (column 5) alleles of the kept records are
    saved with ``glob.dump`` to ``outputname + 'Ref'`` / ``outputname + 'Alt'``.

    Args:
        vcffile: path of the VCF file to read.
        outputname: prefix for the three output files.
    """
    ref = {}
    alt = {}
    # ``with`` guarantees both handles are closed (and the output flushed)
    # even if a malformed record raises part-way through the file.
    with open(vcffile) as vcf, open(outputname + 'Geno', 'w') as genofile:
        # Iterating the handle streams the file line by line.
        for line in vcf:
            if line.startswith('#CHROM'):
                samples = line.strip('\n').split('\t')[9:]
                genofile.write('\t' + ','.join(samples) + '\n')
                continue
            if line.startswith('#'):
                continue

            tokens = line.strip('\n').split('\t')
            gp_entries = [entry for entry in tokens[7].split(';') if 'GP' in entry]
            if not gp_entries:
                # Records without a GP annotation are skipped entirely.
                continue

            location = gp_entries[0].split('=')[1].split(':')
            pos = 'chr' + location[0] + 'pos' + location[1]
            ref[pos] = tokens[3]
            alt[pos] = tokens[4]

            calls = ','.join(
                str(int(sample[0]) + int(sample[2])) for sample in tokens[9:]
            )
            genofile.write(pos + '\t' + calls + '\n')

    glob.dump(ref, outputname + 'Ref')
    glob.dump(alt, outputname + 'Alt')
def getsnpgenos(genos, filestruc, chosenSNPs, incarray = 0):
    """Merge the genotypes of the chosen SNPs from one genotype file into ``genos``.

    The file is read from ``filestruc.genofile``. Its first row supplies the
    comma-separated names stored under ``genos['lines']`` (appended to any
    names already there); every later row is ``<snp>\\t<comma-separated calls>``.

    For each chosen SNP present in the file, its calls are appended to
    ``genos[snp]`` (comma separated). When ``incarray`` is 0, each chosen SNP
    absent from the file is padded with ``'0,' * filestruc.ln`` instead
    (``ln`` is presumably the number of samples in the file -- confirm
    against the ``Genotypes`` class).

    The updated mapping is also written with ``glob.dump`` to ``'tempgenos'``.

    Returns:
        The same ``genos`` mapping, updated in place.
    """
    rows = filestruc.genofile.readlines()

    wanted = set(chosenSNPs)
    shared = wanted.intersection(row.split('\t')[0] for row in rows[1:])
    missing = wanted - shared

    sample_names = rows[0].split('\t')[1].split(',')
    if 'lines' in genos:
        genos['lines'] = genos['lines'] + sample_names
    else:
        genos['lines'] = sample_names

    print("Number of Lines in Genotype:" + str(len(rows)))

    for row in rows[1:]:
        fields = row.split('\t')
        snp = fields[0]
        if snp not in shared:
            continue
        calls = fields[1].strip('\n')
        if snp in genos:
            # Drop any trailing comma left by earlier padding before appending.
            genos[snp] = genos[snp].strip(',') + ',' + calls
        else:
            genos[snp] = calls

    if incarray == 0:
        for snp in missing:
            if snp in genos:
                genos[snp] = genos[snp].strip(',') + ',' + ('0,' * filestruc.ln)
            else:
                genos[snp] = '0,' * filestruc.ln

    glob.dump(genos, 'tempgenos')
    return genos
def corrRef(flip, name):
    """Swap the ref and alt alleles of every SNP in ``flip`` and save the result.

    Loads the allele tables ``name + 'RefT'`` and ``name + 'AltT'`` with
    ``glob.json``, exchanges the two alleles for each listed SNP, and writes
    the tables with ``glob.dump`` to ``name + 'RefTflipped'`` and
    ``name + 'AltTflipped'``.

    Args:
        flip: iterable of SNP keys whose alleles must be exchanged.
        name: prefix of the allele table files.

    Raises:
        KeyError: if a SNP in ``flip`` is missing from either table.
    """
    reffile = name + 'RefT'
    altfile = name + 'AltT'
    # The previous body read ``ref``/``alt`` without ever defining them, so it
    # raised NameError unless same-named globals happened to exist. Load the
    # tables the file names above refer to, the same way checkRef does.
    ref = glob.json(reffile)
    alt = glob.json(altfile)

    for snp in flip:
        ref[snp], alt[snp] = alt[snp], ref[snp]

    glob.dump(ref, reffile + 'flipped')
    glob.dump(alt, altfile + 'flipped')
def filterzeros(arrayname):
    """Remove SNPs whose frequency is 0 or NaN from ``arrayname + 'freq'``.

    The frequency table is loaded with ``glob.json``, filtered, and written
    back to the same name with ``glob.dump``.
    """
    freqfile = arrayname + 'freq'
    freq = glob.json(freqfile)
    # Iterate over a snapshot of the keys: deleting entries while walking a
    # live ``keys()`` view raises RuntimeError ("dictionary changed size
    # during iteration") on interpreters where keys() is not a list copy.
    for snp in list(freq.keys()):
        value = freq[snp]
        if value == 0 or math.isnan(value):
            del freq[snp]
    glob.dump(freq, freqfile)
def makerhash():
    """Build an rsID lookup table from ``./hapmap_rsid_hash_lines``.

    Every line of the input file is split on tabs; the first field becomes
    the key and the second field (with newlines stripped) the value. The
    resulting dict is written with ``glob.dump`` to ``'rsid2poshash'``.

    Raises:
        AssertionError: if the input file is not in the current directory.
        IndexError: if a line contains no tab.
    """
    hapmaprsidfile = 'hapmap_rsid_hash_lines'
    # Kept as an assert so callers see the same exception type and message.
    assert hapmaprsidfile in os.listdir('./'), "Need this file {0}".format(hapmaprsidfile)

    rsid2pos = {}
    # A second, unused ``open`` of this file used to leak a handle here; the
    # single context-managed handle below is the only one needed.
    with open('./hapmap_rsid_hash_lines') as handle:
        for line in handle:
            fields = line.split('\t')
            rsid2pos[fields[0]] = fields[1].strip('\n')
    glob.dump(rsid2pos, 'rsid2poshash')
def flipArray(arrayname, flip, error):
    """Flip array SNP frequencies and drop erroneous SNPs.

    Loads the frequency table ``arrayname + 'freq'`` with ``glob.json``,
    replaces the frequency of every SNP in ``flip`` by ``1 - freq``, deletes
    every SNP in ``error``, and writes the table back to the same name with
    ``glob.dump``.

    Args:
        arrayname: prefix of the frequency table.
        flip: iterable of SNP keys whose frequency must be inverted.
        error: iterable of SNP keys to remove from the table.

    Raises:
        KeyError: if a SNP in ``flip`` or ``error`` is not in the table.
        Exception: whatever ``glob.json`` raises when the table cannot be
            loaded (re-raised after printing a message).
    """
    freqfile = arrayname + 'freq'
    try:
        arrayfreq = glob.json(freqfile)
    except Exception:
        # Previously the handler was a bare string expression (a no-op), so a
        # failed load was swallowed and surfaced later as an unrelated
        # NameError on ``arrayfreq``. Report it and propagate the real cause.
        print("No array snp frequency file")
        raise

    for snp in flip:
        arrayfreq[snp] = 1 - arrayfreq[snp]
    for snp in error:
        del arrayfreq[snp]
    glob.dump(arrayfreq, freqfile)
def parsehapmap():
    """Parse HapMap chromosomes 1-22 into one genotype file and ref/alt maps.

    For each chromosome number ``c`` in 1..22:
      * ``parsehapmapgenotypes.parsehapmapchrom(c)`` is called and its two
        returned mappings are merged into the combined ref and alt maps;
      * the contents of ``../genotypes/hapmapchr<c>genotype`` are appended
        to ``hapmapGeno`` in the current directory (that per-chromosome file
        is presumably produced by ``parsehapmapchrom`` -- confirm).

    The combined maps are written with ``glob.dump`` to
    ``../genotypes/hapmapRef`` and ``../genotypes/hapmapAlt``.
    """
    import parsehapmapgenotypes

    ref = {}
    alt = {}
    # Context managers close both handles even if a chromosome fails to parse.
    with open('hapmapGeno', 'w') as genotype:
        for c in range(1, 23):
            r, a = parsehapmapgenotypes.parsehapmapchrom(c)
            ref.update(r)
            alt.update(a)
            with open('../genotypes/hapmapchr' + str(c) + 'genotype') as g:
                # writelines() copies eagerly; the previous ``map(write, ...)``
                # is lazy on Python 3 and would silently write nothing.
                genotype.writelines(g)

    glob.dump(ref, '../genotypes/hapmapRef')
    glob.dump(alt, '../genotypes/hapmapAlt')
def filterSNPs(name):
    """Remove SNPs whose alt allele equals the ref allele or its complement.

    Loads ``name + 'Ref'`` and ``name + 'Alt'`` with ``glob.json``. A SNP is
    removed from both tables when, ignoring case, its alt allele equals
    either ``glob.compl[ref allele]`` or the ref allele itself. The filtered
    tables are written with ``glob.dump`` to ``name + 'RefT'`` and
    ``name + 'AltT'``.

    Returns:
        List of the SNP keys that were removed.

    Raises:
        KeyError: if a ref key is missing from the alt table, or a ref allele
            is not a key of ``glob.compl``.
    """
    reffile = name + 'Ref'
    altfile = name + 'Alt'
    ref = glob.json(reffile, '')
    alt = glob.json(altfile, '')
    print("Loaded Ref {0}, Alt {1}".format(len(ref), len(alt)))

    complsnps = []
    # Walk a snapshot of the keys: entries are deleted from ``ref`` inside the
    # loop, which is an error when iterating a live ``keys()`` view.
    for snppos in list(ref.keys()):
        refallele = ref[snppos].upper()
        altallele = alt[snppos].upper()
        if glob.compl[refallele] == altallele or refallele == altallele:
            complsnps.append(snppos)
            del ref[snppos]
            del alt[snppos]

    print(len(ref))
    print(len(alt))
    glob.dump(ref, reffile + 'T')
    glob.dump(alt, altfile + 'T')
    return complsnps
def combinegenos(names, chosenSNPs, out = 'combGenosfile', incarray = 0):
    """Combine the chosen SNPs' genotypes from one or more genotype sources.

    Each name is wrapped in ``Genotypes`` and folded into a single mapping
    with ``getsnpgenos``. The mapping is saved with ``glob.dump`` to
    ``out + '.json'`` and also written as text to ``out``: a header row (tab
    followed by the comma-joined entries of ``genos['lines']``) and then one
    ``<snp>\\t<genotypes>`` row per SNP.

    Args:
        names: a single name (``str``) or a ``list`` of names passed to
            ``Genotypes``.
        chosenSNPs: SNP keys to extract.
        out: path of the text output; ``out + '.json'`` receives the dump.
        incarray: forwarded unchanged to ``getsnpgenos``.

    Raises:
        KeyError: if ``names`` is neither a ``str`` nor a ``list`` (nothing
            is loaded, so ``genos['lines']`` does not exist).
    """
    genos = {}
    if isinstance(names, str):
        genos = getsnpgenos(genos, Genotypes(names), chosenSNPs, incarray)
    if isinstance(names, list):
        # Build every Genotypes object first, then merge them in order.
        sources = [Genotypes(source_name) for source_name in names]
        for source in sources:
            genos = getsnpgenos(genos, source, chosenSNPs, incarray)

    glob.dump(genos, out + '.json')

    # The last name of each source's header still carries its newline.
    genos['lines'] = [line_name.strip('\n') for line_name in genos['lines']]

    # ``with`` closes (and therefore flushes) the output file; it was
    # previously left open for the garbage collector to deal with.
    with open(out, 'w') as output:
        output.write('\t' + ','.join(genos['lines']) + '\n')
        for snp in genos:
            if snp != 'lines':
                output.write(snp + '\t' + genos[snp].strip(',') + '\n')
def getarraysnps(report):
    """Extract autosomal SNP alleles and Y/(X+Y) frequencies from an array report.

    ``report`` is read as a tab-separated file whose columns follow the
    hard-coded header below. For every row whose ``Chr`` value is one of
    '1'..'22', the SNP key ``'chr' + Chr + 'pos' + Position`` is recorded
    together with:
      * ref allele: second character of the part of ``SNP`` before ``'/'``;
      * alt allele: first character of the part after ``'/'``
        (so a value like ``[A/G]`` yields ``A`` and ``G``);
      * frequency: ``Y / (Y + X)``.

    Rows that cannot be parsed (too few columns, malformed ``SNP`` value,
    non-numeric X/Y, or X + Y == 0) are skipped.

    Outputs, written with ``glob.dump``: ``report + 'snps'`` (list of keys),
    ``report + 'RefT'``, ``report + 'AltT'`` and ``report + 'freq'``.
    """
    print(report)

    # Column layout of the report. Other report exports use different column
    # sets, so this header must be updated if the export format changes.
    h = ('SNP Name\tSample ID\tAllele1 - Top\tAllele2 - Top\tGC Score\tSNP Index\t'
         'Allele1 - Forward\tAllele2 - Forward\tAllele1 - AB\tAllele2 - AB\t'
         'Allele1 - Plus\tAllele2 - Plus\tChr\tPosition\tSNP\tILMN Strand\t'
         'Top Genomic Sequence\tPlus/Minus Strand\tTheta\tR\tX\tY\tX Raw\tY Raw\t'
         'B Allele Freq\r\n').split('\t')
    snpi = h.index("SNP")
    chri = h.index("Chr")
    posi = h.index("Position")
    Yi = h.index("Y")
    Xi = h.index("X")

    # Built once instead of on every row.
    autosomes = set(str(c) for c in range(1, 23))

    snplist = []
    ref = {}
    alt = {}
    freq = {}
    with open(report) as handle:
        for line in handle:
            t = line.split('\t')
            try:
                chrom = t[chri]
                if chrom not in autosomes:
                    continue
                snppos = 'chr' + chrom + 'pos' + t[posi]
                alleles = t[snpi].split('/')
                refallele = alleles[0][1]
                altallele = alleles[1][0]
                frequency = float(t[Yi]) / (float(t[Yi]) + float(t[Xi]))
            except (IndexError, ValueError, ZeroDivisionError):
                # Only the parse failures listed in the docstring are skipped;
                # anything else (e.g. KeyboardInterrupt) now propagates.
                continue

            # Commit the row only once every field parsed. Previously a row
            # failing at the frequency step was already in snplist/ref/alt
            # but missing from freq, leaving the four outputs inconsistent.
            snplist.append(snppos)
            ref[snppos] = refallele
            alt[snppos] = altallele
            freq[snppos] = frequency

    glob.dump(snplist, report + 'snps')
    glob.dump(ref, report + 'RefT')
    glob.dump(alt, report + 'AltT')
    glob.dump(freq, report + 'freq')