def checkRef(name):
	"""Compare each SNP's stored ref/alt alleles with the hg19 base.

	Loads the ``name + 'RefT'`` and ``name + 'AltT'`` tables through
	``glob.json``.  Every key of the ref table is split on ``'pos'`` into a
	sequence name and a position; the position is decremented by one before
	indexing into hg19.  SNPs whose alt allele (but not ref allele) equals the
	hg19 base are collected as flips; SNPs where neither allele matches are
	collected as errors.  Both lists are written with ``glob.dump`` to
	``name + 'flips'`` and ``name + 'errors'``.

	Returns ``[flip, errors]``.
	"""
	ref_path = name +'RefT'
	alt_path = name + 'AltT'
	hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19(download = True)
	print("Loaded hg19")
	ref = glob.json(ref_path)
	print("Loaded Ref")
	alt = glob.json(alt_path)
	print("Loaded Alt")
	flip, errors = [], []
	for snppos in ref.keys():
		print(snppos + '\t' + name)
		parts = snppos.split('pos')
		genome_base = str(hg19[parts[0]][int(parts[1])-1]).upper()
		ref_base = ref[snppos].upper()
		alt_base = alt[snppos].upper()
		if genome_base == ref_base:
			# Already consistent with the reference genome.
			continue
		if genome_base == alt_base:
			flip.append(snppos)
			continue
		print("Error: Neither Ref nor Alt of SNP corresponds to hg19 sequence")
		errors.append(snppos)
	glob.dump(flip, name+'flips')
	glob.dump(errors, name+'errors')
	return [flip, errors]
def parse1KGvcf(vcffile, outputname):
	"""Convert a VCF file into a genotype table plus ref/alt allele maps.

	Writes ``outputname + 'Geno'``: a header row (a tab followed by the
	comma-joined sample names, i.e. columns 10+ of the ``#CHROM`` line) and
	then one row per kept record of the form ``chr<C>pos<P><TAB>g1,g2,...``.
	Each ``g`` is the sum of characters 0 and 2 of that sample's column read
	as integers (e.g. ``0|1`` gives 1).

	Only records whose INFO column (column 8) has an entry containing ``GP``
	are kept; chromosome and position are taken from the first such entry,
	parsed as ``<key>=<C>:<P>``, not from the CHROM/POS columns.  Columns 4
	and 5 (REF, ALT) are stored per position key and written with
	``glob.dump`` to ``outputname + 'Ref'`` and ``outputname + 'Alt'``.
	"""
	ref = {}
	alt = {}
	# Context managers guarantee both files are closed (and the output is
	# flushed) even when a malformed record raises part-way through.
	with open(vcffile) as vcf:
		with open(outputname+'Geno', 'w') as outputfile:
			# Iterating the file object streams it line by line.
			for l in vcf:
				if l.startswith('#CHROM'):
					samples = l.strip('\n').split('\t')[9:]
					# join() also copes with a header that has no sample columns.
					outputfile.write('\t' + ','.join(samples) + '\n')
					continue
				if l.startswith('#'):
					continue
				tokens = l.strip('\n').split('\t')
				gp_entries = [x for x in tokens[7].split(';') if 'GP' in x]
				if not gp_entries:
					continue
				chrom_pos = gp_entries[0].split('=')[1].split(':')
				pos = 'chr' + chrom_pos[0] + 'pos' + chrom_pos[1]
				ref[pos] = tokens[3]
				alt[pos] = tokens[4]
				counts = [str(int(t[0]) + int(t[2])) for t in tokens[9:]]
				outputfile.write(pos + '\t' + ','.join(counts) + '\n')
	glob.dump(ref, outputname+'Ref')
	glob.dump(alt, outputname+'Alt')
def getsnpgenos(genos, filestruc, chosenSNPs, incarray = 0):
	"""Merge one genotype file's calls for the chosen SNPs into ``genos``.

	``filestruc`` must provide ``genofile`` (an object with ``readlines()``)
	and ``ln`` (the number of ``'0,'`` fillers to emit for a missing SNP;
	presumably the number of samples in that file -- confirm against the
	Genotypes class).  The first line of the file is a header whose second
	tab-separated field holds comma-separated names; these are appended to
	``genos['lines']``.  Every following line is ``<snp><TAB><calls>``.

	For SNPs present both in the file and in ``chosenSNPs`` the calls are
	appended (comma-separated) to ``genos[snp]``.  When ``incarray`` is 0,
	chosen SNPs absent from the file are padded with ``'0,' * filestruc.ln``.

	The updated dict is written with ``glob.dump`` to ``'tempgenos'`` and
	returned (it is also modified in place).
	"""
	rows = filestruc.genofile.readlines()
	file_snps = [row.split('\t')[0] for row in rows[1:]]
	inboth = set(file_snps) & set(chosenSNPs)
	notingeno = set(s for s in chosenSNPs if s not in inboth)

	sample_names = rows[0].split('\t')[1].split(',')
	if 'lines' in genos:
		genos['lines'] = genos['lines'] + sample_names
	else:
		genos['lines'] = sample_names
	print("Number of Lines in Genotype:" + str(len(rows)))

	for row in rows[1:]:
		fields = row.split('\t')
		snp = fields[0]
		if snp not in inboth:
			continue
		calls = fields[1].strip('\n')
		if snp in genos:
			# Drop any dangling comma before appending this file's calls.
			genos[snp] = genos[snp].strip(',') + ',' + calls
		else:
			genos[snp] = calls

	if incarray == 0:
		for missing in notingeno:
			if missing in genos:
				genos[missing] = genos[missing].strip(',') + ',' + ('0,' * filestruc.ln)
			else:
				genos[missing] = '0,' * filestruc.ln
	glob.dump(genos, 'tempgenos')
	return genos
def corrRef(flip, name):
	"""Swap the ref and alt alleles of every SNP listed in ``flip``.

	Loads the ``name + 'RefT'`` and ``name + 'AltT'`` tables with
	``glob.json``, exchanges ``ref[snp]`` and ``alt[snp]`` for each ``snp`` in
	``flip``, and writes the results with ``glob.dump`` to
	``name + 'RefTflipped'`` and ``name + 'AltTflipped'``.

	Raises KeyError if a SNP in ``flip`` is missing from either table.
	"""
	reffile = name +'RefT'
	altfile = name + 'AltT'
	# Load the tables explicitly so the function does not depend on
	# module-level ``ref``/``alt`` names happening to exist.
	ref = glob.json(reffile)
	alt = glob.json(altfile)
	for snp in flip:
		ref[snp], alt[snp] = alt[snp], ref[snp]
	glob.dump(ref, reffile+'flipped')
	glob.dump(alt, altfile+'flipped')
def filterzeros(arrayname):
	"""Remove SNPs whose frequency is 0 or NaN from the frequency table.

	The ``arrayname + 'freq'`` table is loaded with ``glob.json``, filtered,
	and written back under the same name with ``glob.dump``.
	"""
	freq = glob.json(arrayname+'freq')
	# Iterate over a snapshot of the keys: deleting from a dict while
	# iterating its live key view raises RuntimeError on Python 3.
	for snp in list(freq.keys()):
		value = freq[snp]
		if value == 0 or math.isnan(value):
			del freq[snp]
	glob.dump(freq, arrayname+'freq')
def makerhash():
	"""Build the rsid lookup table from ``./hapmap_rsid_hash_lines``.

	Each input line is split on tabs; the first field becomes the key and the
	second field (with newlines stripped) the value.  The resulting dict is
	written with ``glob.dump`` to ``'rsid2poshash'``.

	Raises AssertionError when the input file is not in the current
	directory (unless assertions are disabled), and IndexError on a line
	without a tab.
	"""
	rsid2pos = {}
	hapmaprsidfile = 'hapmap_rsid_hash_lines'
	assert hapmaprsidfile in os.listdir('./'), "Need this file {0}".format(hapmaprsidfile)
	# Open the file exactly once, inside a context manager, so no handle is
	# left open.
	with open('./' + hapmaprsidfile) as f:
		for r in f:
			t = r.split('\t')
			rsid2pos[t[0]] = t[1].strip('\n')
	glob.dump(rsid2pos, 'rsid2poshash')
def flipArray(arrayname, flip, error):
	"""Flip and prune the array SNP frequency table (a dict).

	Loads ``arrayname + 'freq'`` with ``glob.json`` (the table written by
	getarraysnps()), replaces the frequency of every SNP in ``flip`` with
	``1 - freq``, deletes every SNP in ``error``, and writes the table back
	under the same name with ``glob.dump``.

	If the table cannot be loaded, a message is printed and the loader's
	exception is re-raised.  Raises KeyError if a SNP in ``flip`` or
	``error`` is not in the table.
	"""
	try:
		arrayfreq = glob.json(arrayname+'freq')
	except Exception:
		# Report and propagate: carrying on would only fail further down
		# with a NameError on the unbound ``arrayfreq``.
		print("No array snp frequency file")
		raise
	for snp in flip:
		arrayfreq[snp] = 1 - arrayfreq[snp]
	for snp in error:
		del arrayfreq[snp]
	glob.dump(arrayfreq, arrayname+'freq')
def parsehapmap():
	"""Combine per-chromosome HapMap data for chromosomes 1 through 22.

	For each chromosome ``c``, ``parsehapmapgenotypes.parsehapmapchrom(c)`` is
	called and the two dicts it returns are merged into overall ref and alt
	tables.  The lines of ``../genotypes/hapmapchr<c>genotype`` (presumably
	produced by that call -- confirm in parsehapmapgenotypes) are then
	appended to ``hapmapGeno`` in the current directory.

	The merged tables are written with ``glob.dump`` to
	``../genotypes/hapmapRef`` and ``../genotypes/hapmapAlt``.
	"""
	import parsehapmapgenotypes
	ref = {}
	alt = {}
	# The context manager closes the combined file even if a chromosome fails.
	with open('hapmapGeno','w') as genotype:
		for c in range(1,23):
			[r,a] = parsehapmapgenotypes.parsehapmapchrom(c)
			ref.update(r)
			alt.update(a)
			with open('../genotypes/hapmapchr'+str(c)+'genotype') as g:
				# writelines copies eagerly; map() used for its side effect
				# is lazy on Python 3 and would write nothing.
				genotype.writelines(g.readlines())
	glob.dump(ref, '../genotypes/hapmapRef')
	glob.dump(alt, '../genotypes/hapmapAlt')
def filterSNPs(name):
	"""Drop SNPs whose ref and alt alleles are complementary or identical.

	Loads ``name + 'Ref'`` and ``name + 'Alt'`` with ``glob.json``.  A SNP is
	removed from both tables when ``glob.compl[ref]`` equals its alt allele
	(``glob.compl`` is presumably the base-complement map -- confirm in the
	glob module) or when ref and alt are the same; alleles are compared
	upper-cased.  The filtered tables are written with ``glob.dump`` to
	``name + 'RefT'`` and ``name + 'AltT'``.

	Returns the list of removed SNP keys.
	"""
	reffile = name +'Ref'
	altfile = name + 'Alt'
	ref = glob.json(reffile, '')
	alt = glob.json(altfile, '')
	print("Loaded Ref {0}, Alt {1}".format(len(ref),len(alt)))
	complsnps = []
	# Iterate over a snapshot of the keys: entries are deleted inside the
	# loop, which a live key view does not allow on Python 3.
	for snppos in list(ref.keys()):
		ref_allele = ref[snppos].upper()
		alt_allele = alt[snppos].upper()
		if glob.compl[ref_allele] == alt_allele or ref_allele == alt_allele:
			complsnps.append(snppos)
			del ref[snppos]
			del alt[snppos]

	print(len(ref))
	print(len(alt))
	glob.dump(ref, reffile+'T')
	glob.dump(alt, altfile+'T')
	return complsnps
# Example #10
# 0
def combinegenos(names, chosenSNPs, out = 'combGenosfile', incarray = 0):
	"""Merge the chosen SNPs from one or more genotype files into one table.

	names      -- one genotype name (str) or a list/tuple of names; each is
	              wrapped in ``Genotypes`` and merged with getsnpgenos().
	chosenSNPs -- SNP keys to keep, passed through to getsnpgenos().
	out        -- path of the combined text table; the merged dict is also
	              written with ``glob.dump`` to ``out + '.json'``.
	incarray   -- passed through to getsnpgenos().

	The text table starts with a header row (a tab followed by the
	comma-joined names from ``genos['lines']``, newlines stripped), then one
	``<snp><TAB><calls>`` row per SNP.

	Raises KeyError if nothing was merged (e.g. ``names`` is empty), because
	``genos['lines']`` is then missing.
	"""
	genos = {}

	# Treat a single name as a one-element list so both cases share a path.
	if isinstance(names, str):
		names = [names]
	if isinstance(names, (list, tuple)):
		# Build every Genotypes object before merging.
		files = [Genotypes(x) for x in names]
		for f in files:
			genos = getsnpgenos(genos, f, chosenSNPs, incarray)

	glob.dump(genos, out+'.json')

	genos['lines'] = [x.strip('\n') for x in genos['lines']]
	# The context manager ensures the table is flushed and closed on return.
	with open(out, 'w') as output:
		output.write('\t' + ','.join(genos['lines']) + '\n')
		for g in genos.keys():
			if g != 'lines':
				output.write(g + '\t' + genos[g].strip(',') + '\n')
# Example #11
# 0
def getarraysnps(report):
	"""Extract SNP alleles and allele frequencies from an array report file.

	``report`` is the path of a tab-separated file whose columns follow the
	fixed layout hard-coded below.  For every row whose Chr column is one of
	'1'..'22' the key ``chr<Chr>pos<Position>`` is formed and:

	  * ref is the second character of the SNP field's part before '/',
	    alt the first character of the part after it (this fits a value
	    shaped like ``[A/G]`` -- assumed format, confirm against a report);
	  * freq is ``Y / (Y + X)`` from the Y and X columns.

	A row that is too short, has a malformed SNP field, has non-numeric X/Y,
	or has ``X + Y == 0`` is skipped entirely, so the four outputs always
	describe the same set of SNPs.

	Writes, with ``glob.dump``: ``report + 'snps'`` (list of keys in file
	order), ``report + 'RefT'``, ``report + 'AltT'`` and ``report + 'freq'``.
	"""
	print(report)
	with open(report) as f:
		lines = f.readlines()
	# Column layout of the supported report format; reports exported with a
	# different set of columns would need a different header here.
	h = 'SNP Name\tSample ID\tAllele1 - Top\tAllele2 - Top\tGC Score\tSNP Index\tAllele1 - Forward\tAllele2 - Forward\tAllele1 - AB\tAllele2 - AB\tAllele1 - Plus\tAllele2 - Plus\tChr\tPosition\tSNP\tILMN Strand\tTop Genomic Sequence\tPlus/Minus Strand\tTheta\tR\tX\tY\tX Raw\tY Raw\tB Allele Freq\r\n'.split('\t')
	snpi = h.index("SNP")
	chri = h.index("Chr")
	posi = h.index("Position")
	Yi = h.index("Y")
	Xi = h.index("X")
	# Built once instead of once per row.
	autosomes = set(str(c) for c in range(1,23))
	snplist = []
	ref = {}
	alt = {}
	freq = {}
	for l in lines:
		t = l.split('\t')
		# Compute every value first and commit only if all of them succeed;
		# otherwise a row failing half-way would leave a key in some tables
		# but not in others.
		try:
			chrom = t[chri]
			if chrom not in autosomes:
				continue
			snppos = 'chr'+chrom+'pos'+t[posi]
			alleles = t[snpi].split('/')
			refallele = alleles[0][1]
			altallele = alleles[1][0]
			y = float(t[Yi])
			frequency = y/(y+float(t[Xi]))
		except (IndexError, ValueError, ZeroDivisionError):
			# Short or header-like row, malformed field, or X + Y == 0.
			continue
		snplist.append(snppos)
		ref[snppos] = refallele
		alt[snppos] = altallele
		freq[snppos] = frequency

	glob.dump(snplist, report+'snps')
	glob.dump(ref, report+'RefT')
	glob.dump(alt, report+'AltT')
	glob.dump(freq, report+'freq')