Ejemplo n.º 1
0
	def _stream2SSfile(self, RefSNPdict):
		"""
		Makes simple SNP file for analysis by SNPhylo tree maker

		** 
		- SNPhylo is case-sensitive
		- Also requires single genotype mutation, represent single alleles different from the ref only

		RefSNPdict: reference SNPs, indexed as RefSNPdict[chromosome][position][GENOTYPE].
		Only reference positions that also exist in self.genome[chromosome] are written.

		For a sample genotype longer than one character, the allele written is
		the second character when the first one equals the reference
		(case-insensitive), otherwise the first character. A single-character
		genotype is written unchanged.

		### BASIC FORMAT:
		#chrNum 	position 	Ref 	SampleID1	... 
		1		1000		A 		A 		T 

		RETURNS: none, streams to self.outputPath
		"""
		# 'with' guarantees the handle is closed even if a lookup below raises.
		with open(self.outputPath, 'w') as F:
			F.write('#Chrom\tPos\tRef\tSampleID1\n')
			for chr in self.chrList:
				# Rows are emitted in ascending position order.
				for pos in sorted(RefSNPdict[chr]):
					if pos not in self.genome[chr]:
						continue

					refGenotype = RefSNPdict[chr][pos][GENOTYPE].upper()
					# The sample is looked up at the same position as the reference
					# (the original referenced an undefined name here).
					genotype = self.genome[chr][pos][GENOTYPE]

					if len(genotype) > 1:
						if refGenotype.lower() == genotype[0].lower():
							sample = genotype[1]
						else:
							sample = genotype[0]
					else:
						sample = genotype

					F.write('%s\t%s\t%s\t%s\n' % (chr, pos, refGenotype, sample))

		parseNstream.printMsg("Completed streaming to Simple SNP file format (path=%s)" % self.outputPath)
Ejemplo n.º 2
0
	def _stream2SRSIDfile(self):
		"""
		Streams the RSID of every SNP in self.genome to the file at
		self.outputPath, one RSID per line.

		self.genome is read as self.genome[chromosome][position][RSID];
		chromosomes and positions are visited in the dictionaries' own
		iteration order (no sorting is applied).

		RETURNS: none, streams to file
		"""
		# 'with' closes the file even when a lookup or write fails part-way.
		with open(self.outputPath, 'w') as F:
			for chr in self.genome:
				for pos in self.genome[chr]:
					F.write('%s\n' % self.genome[chr][pos][RSID])

		parseNstream.printMsg("Completed streaming to simple RSID file format (path=%s)" % self.outputPath)
Ejemplo n.º 3
0
	def __init__(self, chrom=""):
		"""
		Sets up an empty record for one chromosome.

		chrom: the chromosome number, as anything int() accepts. When left as
		the empty string the number is requested on the console instead.

		Attributes set: num, chrLengths, positions, minSNP, maxSNP.
		"""
		# Nothing supplied: fall back to an interactive prompt.
		if chrom == "":
			printMsg("Enter chromosome (1, 2, ..., or 23 (X), 24 (Y), 25 (MT))")
			chrom = raw_input('---> ')
		self.num = int(chrom)

		# 25 entries, one per chromosome number listed in the prompt above.
		# NOTE(review): presumably lengths in base pairs for one genome build -- confirm which.
		self.chrLengths = [
			248956422, 242193529, 198295559, 190214555, 181538259,
			170805979, 159345973, 145138636, 138394717, 133797422,
			135086622, 133275309, 114364328, 107043718, 101991189,
			90338345, 83257441, 80373285, 58617616, 64444167,
			46709983, 50818468, 156040895, 57227415, 16569,
		]

		# SNP bookkeeping starts out empty.
		self.positions = []
		self.minSNP = self.maxSNP = 0
Ejemplo n.º 4
0
def addUserScoredData_toDB(file, userObjID='', streamStatus=''):
	"""
	Adds the SNPs parsed from a scored-alleles file to the per-chromosome DB collections.

	The file is parsed by parseNstream._scoredAllelesObject (per the original
	author's note, it queries the DB and outputs a dictionary containing all
	SNPs that don't already exist in the DB). Chromosomes are processed in the
	order of parseNstream.CHROMOSOME_LIST and positions in ascending order.

	file: path to a file output from scoreAlleles().
	userObjID: a user id from the users_collection.users, e.g. the value of
		"_id": {"$oid": "555a83687a349b6910bdff6c"}.
		When left empty it is requested on the console.
	streamStatus: 'empty' inserts every SNP without first querying for its
		RSID. Any other value looks each SNP up by '_id' and inserts it only
		when no document with that RSID exists; the rest are tallied as
		already present.

	A progress line is printed each time the number of inserted SNPs reaches
	the next value in db_model.CHECKLIST.

	RETURNS: none, progress and a summary are printed
	"""
	if userObjID == '':
		userObjID = raw_input('Please enter a user ID: ')
		assert(userObjID != '')

	# Date/time stamp for start of run
	parseNstream.printMsg(datetime.datetime.now().ctime())

	if not os.path.exists(file):
		print("Path <%s> not found" % file)
		return

	# (label, member chromosomes, target collection) for each chromosome interval
	intervals = [
		('1-5', ['1', '2', '3', '4', '5'], db_model.CollectbyChromosome_1to5),
		('6-11', ['6', '7', '8', '9', '10', '11'], db_model.CollectbyChromosome_6to11),
		('12-17', ['12', '13', '14', '15', '16', '17'], db_model.CollectbyChromosome_12to17),
		('18-25', ['18', '19', '20', '21', '22', '23', '24', '25'], db_model.CollectbyChromosome_18to25),
	]
	checklist = db_model.CHECKLIST

	# TODO: implement assertion for file type, ensure all column headers are accurate
	ScoredSNPdict = parseNstream._scoredAllelesObject(file, userObjID)

	alreadyinDB = {'total': 0, 'ids': []}
	parsedCount = 0		# every SNP looked at, inserted or not
	newNumSNPs = 0		# SNPs actually inserted
	k = 0				# index of the next checkpoint in db_model.CHECKLIST
	processed = set()	# chromosomes finished so far

	for chr in parseNstream.CHROMOSOME_LIST:
		alreadyinDB[chr] = 0

		# Find the interval (and therefore the collection) this chromosome belongs to.
		for intervalNum, (label, members, collection) in enumerate(intervals):
			if chr in members:
				break
		else:
			# Never fall through with an unset or stale collection.
			parseNstream.printMsg("Skipping chromosome %s: no DB collection is mapped to it" % chr)
			continue

		# Iterates through each chromosomal position adding new RSIDS to DB in ascending order
		for position in sorted(ScoredSNPdict[chr]):
			data = ScoredSNPdict[chr][position]

			# Show only the very first object as a sample.
			if parsedCount == 0:
				parseNstream.printMsg("Sample data object:")
				print(data)
			parsedCount += 1

			# Insert into DB; the existence query is skipped for an 'empty' DB.
			if streamStatus == 'empty' or not collection.find({'_id': data[RSID]}).count():
				collection.insert(data)
				newNumSNPs += 1
			else:
				alreadyinDB['total'] += 1
				alreadyinDB[chr] += 1
				alreadyinDB['ids'].append(data[RSID])

			# Guard the index so running past the last checkpoint cannot raise.
			if k < len(checklist) and newNumSNPs == checklist[k]:
				print("[x] Check %s-th complete. %s SNPs parsed so far. %s new SNPs added in total. %s parsed SNPs were already in the DB." % (k+1, parsedCount, newNumSNPs, parsedCount-newNumSNPs))
				k += 1

		# Announce an interval once every one of its chromosomes has been processed.
		processed.add(chr)
		if all(member in processed for member in members):
			parseNstream.printMsg("Interval %s complete: (%s)" % (intervalNum + 1, label))

		# Running dump of the tallies (separate loop name so 'chr' is not shadowed).
		for key in alreadyinDB:
			print(alreadyinDB[key])

	print("[x] Last check complete. %s total SNPs added to the UsersSNPCollection." % newNumSNPs)

	# Date/time stamp for end of run
	parseNstream.printMsg(datetime.datetime.now().ctime())
	print(alreadyinDB['ids'])
	parseNstream.printMsg('SNPS ALREADY IN DB: %s' % alreadyinDB['total'])
	for key in alreadyinDB:
		print(alreadyinDB[key])
Ejemplo n.º 5
0
... etc.
"""

# Index every chromosome: its sorted SNP positions (kept as a list on the
# Chromosome record, and as a float64 array in ChromosomePositions).
for chrom in ChromosomeDict:
	ChromosomePositions[chrom] = sorted(ChromosomeDict[chrom].keys())
	Chromosomes[chrom] = Chromosome(chrom=chrom)
	Chromosomes[chrom].positions = ChromosomePositions[chrom]
	ChromosomePositions[chrom] = np.array(ChromosomePositions[chrom], dtype=np.float64)

# Record and report the lowest and highest SNP position on each chromosome.
for chrom in ChromosomePositions:
	Chromosomes[chrom].minSNP = ChromosomePositions[chrom].min()
	Chromosomes[chrom].maxSNP = ChromosomePositions[chrom].max()
	printMsg(
		"Chr%s: lowest chromosome position, highest chromosome position = (%s, %s)"
		% (chrom, Chromosomes[chrom].minSNP, Chromosomes[chrom].maxSNP)
	)

# Gather each chromosome's size, echoing it as it is collected.
ChromosomeSizes = []
for chrom in Chromosomes:
	ChromosomeSizes.append(Chromosomes[chrom].size())
	print(Chromosomes[chrom].size())

# Largest first: ascending sort, then reversed.
ChromosomeSizes = np.array(sorted(ChromosomeSizes)[::-1])
largest_size, smallest_size = ChromosomeSizes.max(), ChromosomeSizes.min()
largest_chrom = smallest_chrom = ""
printMsg("Chromosomes By Size")
index = 1
Ejemplo n.º 6
0
def _alleleToConcatenate(refAllele, sampleAllele):
	"""
	Picks the text concatenateSNPs() emits for one genotype.

	refAllele: the single-letter reference allele.
	sampleAllele: the user's genotype string (comparisons are case-insensitive).

	RETURNS: the string to write, or None when nothing should be written:
	a no-call ('-' or '--'), or a genotype that is not 1 or 2 characters long.
	"""
	ref = refAllele.lower()

	# Chromosomes 1-22 will have an allele pair, hence str len of 2
	if len(sampleAllele) == 2:
		if sampleAllele == '--':
			return None
		first = sampleAllele[0].lower()
		second = sampleAllele[1].lower()
		if first == ref and second == ref:
			variant = '-'
		elif first == ref:
			variant = sampleAllele[1]
		elif second == ref:
			variant = sampleAllele[0]
		else:
			variant = sampleAllele

	# Mitochondria, X, and Y chromosome alleles are of length 1
	elif len(sampleAllele) == 1:
		if sampleAllele == '-':
			return None
		if sampleAllele.lower() == ref:
			variant = '-'
		else:
			variant = sampleAllele

	# Any other length cannot be scored; emit nothing rather than reuse the
	# previous SNP's result.
	else:
		return None

	# With no variant, the first letter of the genotype stands in for the SNP.
	if variant == '-':
		return sampleAllele[0]
	return variant


def concatenateSNPs(RefSNPFile, UserSNPFile):
	"""
	Concatenates the user's alleles at every reference SNP position into one
	unbroken string written to DEFAULT_OUTPUT_FILEPATH (no header, no
	separators, no newlines).

	Structure of RefSNPdict and UserSNPdict:
		key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25', (MT/M)

	(*) Reference allele is only ONE letter. 

	Chromosomes are visited in the order of parseNstream.CHROMOSOME_LIST and
	positions in ascending order. Only positions present in both the reference
	and the user's data are considered. For each one:
		- indels (per isIndel) are skipped
		- no-calls ('-' / '--') are skipped
		- if the genotype holds letters different from the reference, those
		  letter(s) are written (both letters when neither matches)
		- otherwise the first letter of the genotype is written

	RefSNPFile: path handed to parseNstream._referenceObject
	UserSNPFile: path handed to parseNstream._23andmeObject

	RETURNS: none, streams to file... 
	"""
	RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
	UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

	# (ref rsid, user rsid) pairs that share a position but differ in rsid
	nonMatchedRSIDs = []
	# Non-indel positions found in both the reference and the user's data
	numMatchedRSIDs = 0
	snpsConcatenated = 0

	# 'with' closes the output even if a lookup fails part-way through.
	with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
		for chr in parseNstream.CHROMOSOME_LIST:
			for pos in sorted(RefSNPdict[chr]):
				if pos not in UserSNPdict[chr]:
					continue

				refSNP = RefSNPdict[chr][pos]
				userSNP = UserSNPdict[chr][pos]

				if refSNP[RSID] != userSNP[RSID]:
					nonMatchedRSIDs.append((refSNP[RSID], userSNP[RSID]))

				# INDELs are not concatenated
				if isIndel(userSNP[GENOTYPE]):
					continue

				numMatchedRSIDs += 1
				allele = _alleleToConcatenate(refSNP[GENOTYPE].upper(), userSNP[GENOTYPE])
				if allele is not None:
					snpsConcatenated += 1
					F.write('%s' % allele)

	parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .FASTA file (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
	parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
	print("Only the RSID's of the user that existed in the reference data base were used. Also, INDELs were not concatenated.")
	parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
	parseNstream.printMsg('Number of SNPs concatenated: %s' % snpsConcatenated)
Ejemplo n.º 7
0
def _scoreGenotype(refAllele, sampleAllele):
	"""
	Scores one user genotype against the single-letter reference allele.

	Comparisons are case-insensitive.

	RETURNS: (variant, score) as described in scoreAlleles(). A no-call
	('-' or '--') gives ('-', '-'); a genotype that is not 1 or 2 characters
	long cannot be scored and is reported the same way.
	"""
	ref = refAllele.lower()

	# Chromosomes 1-22 will have an allele pair, hence str len of 2
	if len(sampleAllele) == 2:
		first = sampleAllele[0].lower()
		second = sampleAllele[1].lower()

		# No base call at the current rsid. [BLANK]
		if sampleAllele == '--':
			return '-', '-'
		# Score 2, no variants. Both letters are homologous to the reference. [GRAY - default]
		if first == ref and second == ref:
			return '-', 2
		# Score 1, one variant. First letter is homologous to the reference. [GREEN - success]
		if first == ref:
			return sampleAllele[1], 1
		# Score 1, one variant. Second letter is homologous to the reference. [GREEN - success]
		if second == ref:
			return sampleAllele[0], 1
		# Score 0, two variants. No homology to the reference. [ORANGE - primary]
		return sampleAllele, 0

	# Mitochondria, X, and Y chromosome alleles are of length 1
	if len(sampleAllele) == 1:
		# No base call at the current rsid. [BLANK]
		if sampleAllele == '-':
			return '-', '-'
		# Score 1, no variants. Only letter is homologous to the reference. [GRAY - default]
		if sampleAllele.lower() == ref:
			return '-', 1
		# Score 0, the only letter differs from the reference.
		return sampleAllele, 0

	# Unexpected length: report as unscored instead of reusing the previous
	# SNP's variant/score.
	return '-', '-'


def scoreAlleles(RefSNPFile, UserSNPFile):
	"""
	Scores the user's genotype against the reference allele at every shared
	position and streams one row per scored SNP to DEFAULT_OUTPUT_FILEPATH.

	Structure of RefSNPdict and UserSNPdict:
		key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25', (MT/M)

	(*) Reference allele is only ONE letter. 

	- Genotype is a string of len=2, the original allele representations from the 23andme file
	  (len=1 for mitochondria, X and Y)

	- Variant(s) represents any letters different from the Reference, if none then given '-'

	- Match Score (0,1,2): the SNP is given a score of ...
			- 2 (homozygous) if both letters are the same as the Reference
			- 1 (heterozygous) if one letter is the same as the Reference
			  (also a single-letter genotype that matches)
			- 0 (recessive) if neither match the reference 
			- '-' if there was no base call

	(*) Variants given a match score of 0 may need to be switched to the 
	opposite letters (A to T, C to G and vice versa); must check to 
	confirm which are minus vs. plus strands)

	### BASIC FORMAT:
	#Chrom	RSID	Pos	Ref	Genotype	Variant(s)	Match Score
	1		rs...	1000	A	AA		-		2

	Indels (per isIndel) are not written to the file; they are collected and returned.

	RefSNPFile: path handed to parseNstream._referenceObject
	UserSNPFile: path handed to parseNstream._23andmeObject

	RETURNS: dict of the user's indels, indel[chromosome][position] = {RSID: ..., GENOTYPE: ...}
	"""
	RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
	UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

	# Keep track of RSIDs that share a position but differ in rsid value, as tuples (ref rsid, user rsid)
	nonMatchedRSIDs = []
	# Count number of RSIDs whose positions in a chr match for both ref and user, and report it at the end. 
	numMatchedRSIDs = 0
	# Keep track of indels
	indel = {}

	# 'with' closes the output even if a lookup fails part-way through.
	with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
		F.write('#Chrom\tRSID\tPos\tRef\tGenotype\tVariant(s)\tMatch Score\n')
		for chr in parseNstream.CHROMOSOME_LIST:
			indel[chr] = {}
			for pos in sorted(RefSNPdict[chr]):
				# Only positions the user also has can be scored.
				if pos not in UserSNPdict[chr]:
					continue

				refAllele = RefSNPdict[chr][pos][GENOTYPE].upper()
				rsid = RefSNPdict[chr][pos][RSID]
				userSNP = UserSNPdict[chr][pos]

				if rsid != userSNP[RSID]:
					nonMatchedRSIDs.append((rsid, userSNP[RSID]))

				if isIndel(userSNP[GENOTYPE]):
					indel[chr][pos] = {
						RSID: userSNP[RSID],
						GENOTYPE: userSNP[GENOTYPE]
						}
					continue

				numMatchedRSIDs += 1
				sampleAllele = userSNP[GENOTYPE]
				variant, score = _scoreGenotype(refAllele, sampleAllele)
				F.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chr, rsid, pos, refAllele, sampleAllele, variant, score))

	parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .teyden (LOL) file format (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
	parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
	parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
	return indel