def convertTo(self, format=''):
		"""
		Convert the original file to the file format named by `format`.

		`format` is matched case-insensitively against 'VCF', 'BED', 'SS' and
		'RSID', and the matching _stream2...file method is called. For any other
		value only self.outputPath is set and no stream method is called.

		self.outputPath is always set to
		'output/conversionTo_' + format + self._getEndpoint(format).

		Returns none
		"""
		suffix = self._getEndpoint(format)
		self.outputPath = 'output/conversionTo_' + format + suffix

		target = format.upper()

		# Formats whose stream method takes no arguments, keyed by upper-cased name.
		# Names are resolved lazily so only the selected method has to exist.
		noArgStreamers = {
			'VCF': '_stream2VCFfile',
			'BED': '_stream2BEDfile',
			'RSID': '_stream2SRSIDfile',
		}

		if target == 'SS':
			# Input and output of files should also be via the DB
			## parseNstream functions to be added as methods to a Genome object
			ref_snps = parseNstream._referenceObject(self.referencePath, 'CHROMOSOME')
			self._stream2SSfile(ref_snps)
		elif target in noArgStreamers:
			getattr(self, noArgStreamers[target])()
def concatenateSNPs(RefSNPFile, UserSNPFile):
	"""
	Concatenate the user's base calls at the reference SNP positions into
	DEFAULT_OUTPUT_FILEPATH (no separators or newlines are written).

	Structure of RefSNPdict and UserSNPdict (indexed as dict[chromosome][position][field],
	with field being GENOTYPE or RSID):
		key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25', (MT/M)

	Chromosomes are visited in parseNstream.CHROMOSOME_LIST order and the reference
	positions of each chromosome in sorted order. A position contributes only when
	the user's data also has it and the user's genotype is not an indel (isIndel).

	(*) Reference allele is only ONE letter.

	- Genotype is a string of len=2 (or len=1), the original allele representations
	  from the 23andme file

	- Variant(s) represents any letters different from the Reference, if none then given '-'

	- Match Score: the SNP is given a score of ...
			- 2 if both letters are the same as the Reference
			- 1 if exactly one letter of a pair, or the only letter, is the same as the Reference
			- 0 if neither match the reference
			- '-' if there is no base call ('--' or '-') or the genotype has an unexpected length

	What gets written for each SNP:
			- nothing when the score is '-'
			- the first genotype letter when the variant is '-'
			- the variant letter(s) otherwise

	(*) Variants given a match score of 0 may need to be switched to the
	opposite letters (A to T, C to G and vice versa); must check to
	confirm which are minus vs. plus strands)

	RETURNS: none, streams to file...
	"""
	RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
	UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

	# Tuples (ref rsid, user rsid) for shared positions whose rsid values differ
	nonMatchedRSIDs = []
	# Number of non-indel positions present in both the reference and the user data
	numMatchedRSIDs = 0
	# Number of SNPs actually written to the output file
	snpsConcatenated = 0

	# The with-block closes the file even if a lookup below raises.
	with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
		for chrom in parseNstream.CHROMOSOME_LIST:
			for pos in sorted(RefSNPdict[chrom]):
				refRecord = RefSNPdict[chrom][pos]
				refAllele = refRecord[GENOTYPE].upper()
				rsid = refRecord[RSID]

				# Only positions that also exist in the user's snps are considered
				userByPos = UserSNPdict[chrom]
				if pos not in userByPos:
					continue
				userRecord = userByPos[pos]

				if rsid != userRecord[RSID]:
					nonMatchedRSIDs.append((rsid, userRecord[RSID]))

				sampleAllele = userRecord[GENOTYPE]

				# INDELs are not concatenated
				if isIndel(sampleAllele):
					continue

				numMatchedRSIDs += 1
				ref = refAllele.lower()

				# Chromosomes 1-22 will have an allele pair, hence str len of 2
				if len(sampleAllele) == 2:
					firstMatches = sampleAllele[0].lower() == ref
					secondMatches = sampleAllele[1].lower() == ref

					if sampleAllele[0] == '-' and sampleAllele[1] == '-':
						variant, score = '-', '-'
					elif firstMatches and secondMatches:
						variant, score = '-', 2
					elif firstMatches:
						variant, score = sampleAllele[1], 1
					elif secondMatches:
						variant, score = sampleAllele[0], 1
					else:
						variant, score = sampleAllele, 0

				# Mitochondria, X, and Y chromosome alleles are of length 1
				elif len(sampleAllele) == 1:
					if sampleAllele == '-':
						variant, score = '-', '-'
					elif sampleAllele.lower() == ref:
						variant, score = '-', 1
					else:
						variant, score = sampleAllele, 0

				# Any other length cannot be scored. Treat it as "no call" so the
				# values of the previous SNP are never reused for this one.
				else:
					variant, score = '-', '-'

				if score != '-':
					snpsConcatenated += 1
					F.write(sampleAllele[0] if variant == '-' else variant)

	parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .FASTA file (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
	parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
	# Parenthesised single-argument form prints the same text under Python 2 and 3.
	print("Only the RSID's of the user that existed in the reference data base were used. Also, INDELs were not concatenated.")
	parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
	parseNstream.printMsg('Number of SNPs concatenated: %s' % snpsConcatenated)
# Example #3
# 0
def scoreAlleles(RefSNPFile, UserSNPFile):
	"""
	Score the user's genotype against the reference allele at every reference SNP
	position the user also has, and write one tab-separated row per scored SNP to
	DEFAULT_OUTPUT_FILEPATH.

	Structure of RefSNPdict and UserSNPdict (indexed as dict[chromosome][position][field],
	with field being GENOTYPE or RSID):
		key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25', (MT/M)

	Chromosomes are visited in parseNstream.CHROMOSOME_LIST order and the reference
	positions of each chromosome in sorted order. User genotypes that are indels
	(isIndel) are not scored; they are collected and returned instead.

	(*) Reference allele is only ONE letter.

	- Genotype is a string of len=2 (or len=1), the original allele representations
	  from the 23andme file

	- Variant(s) represents any letters different from the Reference, if none then given '-'

	- Match Score: the SNP is given a score of ...
			- 2 if both letters are the same as the Reference
			- 1 if exactly one letter of a pair, or the only letter, is the same as the Reference
			- 0 if neither match the reference
			- '-' if there is no base call ('--' or '-') or the genotype has an unexpected length

	(*) Variants given a match score of 0 may need to be switched to the
	opposite letters (A to T, C to G and vice versa); must check to
	confirm which are minus vs. plus strands)

	### BASIC FORMAT:
	#Chrom	RSID	Pos		Ref		Genotype	Variant(s)	Match Score
	1		rs...	1000	A		AA			-			2

	RETURNS: dict of the user's indels, indel[chromosome][position] = {RSID: ..., GENOTYPE: ...},
	with an entry (possibly empty) for every chromosome in parseNstream.CHROMOSOME_LIST
	"""
	RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
	UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

	# Keep track of shared positions whose rsid values differ, as tuples (ref rsid, user rsid)
	nonMatchedRSIDs = []
	# Count non-indel positions in a chr present for both ref and user, and report it at the end.
	numMatchedRSIDs = 0
	# Keep track of indels
	indel = {}

	# The with-block closes the file even if a lookup below raises.
	with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
		F.write('#Chrom\tRSID\tPos\tRef\tGenotype\tVariant(s)\tMatch Score\n')
		for chrom in parseNstream.CHROMOSOME_LIST:
			indel[chrom] = {}
			for pos in sorted(RefSNPdict[chrom]):
				refRecord = RefSNPdict[chrom][pos]
				refAllele = refRecord[GENOTYPE].upper()
				rsid = refRecord[RSID]

				# Only positions that also exist in the user's snps are considered
				userByPos = UserSNPdict[chrom]
				if pos not in userByPos:
					continue
				userRecord = userByPos[pos]

				if rsid != userRecord[RSID]:
					nonMatchedRSIDs.append((rsid, userRecord[RSID]))

				sampleAllele = userRecord[GENOTYPE]

				# Indels are recorded for the caller and not scored
				if isIndel(sampleAllele):
					indel[chrom][pos] = {
						RSID: userRecord[RSID],
						GENOTYPE: sampleAllele
						}
					continue

				numMatchedRSIDs += 1
				ref = refAllele.lower()

				# Chromosomes 1-22 will have an allele pair, hence str len of 2
				if len(sampleAllele) == 2:
					firstMatches = sampleAllele[0].lower() == ref
					secondMatches = sampleAllele[1].lower() == ref

					# No base call at the current rsid. [BLANK]
					if sampleAllele[0] == '-' and sampleAllele[1] == '-':
						variant, score = '-', '-'

					# Score 2, no variants. Both letters are homologous to the reference. [GRAY - default]
					elif firstMatches and secondMatches:
						variant, score = '-', 2

					# Score 1, one variant. First letter is homologous to the reference,
					# the second letter is the variant. [GREEN - success]
					elif firstMatches:
						variant, score = sampleAllele[1], 1

					# Score 1, one variant. Second letter is homologous to the reference,
					# the first letter is the variant. [GREEN - success]
					elif secondMatches:
						variant, score = sampleAllele[0], 1

					# Score 0, two variants. No homology to the reference. [ORANGE - primary]
					else:
						variant, score = sampleAllele, 0

				# Mitochondria, X, and Y chromosome alleles are of length 1
				elif len(sampleAllele) == 1:
					# No base call at the current rsid. [BLANK]
					if sampleAllele == '-':
						variant, score = '-', '-'

					# Score 1, no variants. Only letter is homologous to the reference. [GRAY - default]
					elif sampleAllele.lower() == ref:
						variant, score = '-', 1

					# Score 0, the only letter differs from the reference.
					else:
						variant, score = sampleAllele, 0

				# Any other length cannot be scored. Report it as "no call" so the
				# values of the previous SNP are never reused for this row.
				else:
					variant, score = '-', '-'

				F.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, rsid, pos, refAllele, sampleAllele, variant, score))

	parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .teyden (LOL) file format (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
	parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
	parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
	return indel