Esempi in Python per getColName2IndexFromHeader, esempi in Python per pymodule.utils.getColName2IndexFromHeader

Esempio n. 1

0

Mostra file

File: MatrixFile.py Progetto: bopopescu/gwasmodules

	def constructColName2IndexFromHeader(self):
		"""
		Read the next row as the header and build the column-name lookup from it.

		The row returned by self.next() is stored in self.header; the mapping
		produced by utils.getColName2IndexFromHeader() is stored in
		self.col_name2index and also returned.

		2012.8.23
		"""
		# consume one row from the underlying input; it is taken to be the header
		self.header = self.next()
		name2index = utils.getColName2IndexFromHeader(self.header)
		self.col_name2index = name2index
		return self.col_name2index

Esempio n. 2

0

Mostra file

File: PutReadBaseCountIntoDB.py Progetto: mjmontague/vervet-web

	def run(self):
		"""
		Accumulate read/base counts from all input files and push them into the db.

		Every file in self.inputFnameLs is parsed as a delimited table whose
		header provides the columns isq_id, isqf_id, read_count and base_count.
		For each data row:
			- read_count and base_count are added to the running totals of its isq_id;
			- if isqf_id is non-empty and not '0', the row's counts are handed to
				self.updateIndividualSequenceFileReadBaseCount(), whose return value
				is added to the number of isqf entries put into the db.
		After all files are read, self.updateIndividualSequenceReadBaseCount() is
		called once per isq_id with the summed counts and self.genomeSize.

		The two summary messages go to stderr and, if self.logFilename is set,
		to that file as well. The session is flushed and committed only when
		self.commit is true.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		session.begin()
		
		isq_id2data = {}	#isq_id -> PassingData holding the summed read_count/base_count
		no_of_total_lines = 0
		no_of_isqf_lines = 0
		no_of_isqf_in_db = 0
		for inputFname in self.inputFnameLs:
			# the with-block closes the input handle even when a row fails to parse
			with open(inputFname) as inf:
				reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
				header = next(reader)
				colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
				isq_id_index = colName2Index.get('isq_id')
				isqf_id_index = colName2Index.get('isqf_id')
				read_count_index = colName2Index.get("read_count")
				base_count_index = colName2Index.get("base_count")
				for row in reader:
					isq_id = int(row[isq_id_index])
					isqf_id = row[isqf_id_index]
					read_count = int(row[read_count_index])
					base_count = int(row[base_count_index])
					if isq_id not in isq_id2data:
						isq_id2data[isq_id] = PassingData(read_count=0, base_count=0)
					isq_id2data[isq_id].read_count += read_count
					isq_id2data[isq_id].base_count += base_count
					if isqf_id and isqf_id!='0':
						# row carries a per-file id, so the file-level record is updated as well
						isqf_id = int(isqf_id)
						no_of_isqf_lines += 1
						no_of_isqf_in_db += self.updateIndividualSequenceFileReadBaseCount(self.db_vervet, \
												isqf_id=isqf_id, read_count=read_count, base_count=base_count)
					no_of_total_lines += 1
		logMsg1="%s isqf out of %s were put into db. %s lines in total.\n"%(no_of_isqf_in_db, no_of_isqf_lines, no_of_total_lines)
		sys.stderr.write(logMsg1)
		
		counter = 0
		real_counter = 0
		for isq_id, data in isq_id2data.items():
			real_counter += self.updateIndividualSequenceReadBaseCount(self.db_vervet, isq_id=isq_id, \
										read_count=data.read_count, base_count=data.base_count, genomeSize=self.genomeSize)
			counter += 1
		logMsg2="%s isq out of %s were put into db.\n"%(real_counter, counter)
		sys.stderr.write(logMsg2)
		
		if self.logFilename:
			# close the log explicitly so that its content is flushed to disk
			with open(self.logFilename, 'w') as logF:
				logF.write(logMsg1)
				logF.write(logMsg2)
		
		if self.commit:
			self.db_vervet.session.flush()
			self.db_vervet.session.commit()

Esempio n. 3

0

Mostra file

File: PutFlagstatOutput2DB.py Progetto: mjmontague/vervet-web

	def run(self):
		"""
		Copy flagstat statistics from the input files onto IndividualAlignment db entries.

		Every file in self.inputFnameLs is parsed as a delimited table. For each
		data row the alignment identified by column alignmentID is fetched through
		VervetDB.IndividualAlignment.get() and its perc_* attributes and
		total_no_of_reads are overwritten with the row's values, then the object
		is added to the session. The number of processed rows is reported on
		stderr and, if self.logFilename is set, written to that file. The session
		is flushed and committed only when self.commit is true.

		Column perc_mapq5_mapped_to_diff_chrs is treated as optional: it is not
		part of the header documented below, so it is only copied when present.

		2012.4.3
			each input has this as its header:
			
			['alignmentID', 'total_no_of_reads', 'perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired',
				'perc_both_mates_mapped', 'perc_singletons',
				'perc_mapped_to_diff_chrs']
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		session.begin()
		
		# float-valued columns; each one is stored in the alignment attribute of the same name
		requiredPercColumnLs = ['perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired', \
							'perc_both_mates_mapped', 'perc_singletons', 'perc_mapped_to_diff_chrs']
		optionalPercColumnLs = ['perc_mapq5_mapped_to_diff_chrs']
		
		no_of_total_lines = 0
		for inputFname in self.inputFnameLs:
			# the with-block closes the input handle even when a row fails to parse
			with open(inputFname) as inf:
				reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
				header = next(reader)
				colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
				alignment_id_index = colName2Index.get('alignmentID')
				total_no_of_reads_index = colName2Index.get('total_no_of_reads')
				percColumnIndexLs = [(columnName, colName2Index.get(columnName)) for columnName in requiredPercColumnLs]
				for columnName in optionalPercColumnLs:
					columnIndex = colName2Index.get(columnName)
					if columnIndex is not None:	#row[None] would raise TypeError, so skip absent optional columns
						percColumnIndexLs.append((columnName, columnIndex))
				for row in reader:
					alignmentID = int(row[alignment_id_index])
					alignment = VervetDB.IndividualAlignment.get(alignmentID)
					for columnName, columnIndex in percColumnIndexLs:
						setattr(alignment, columnName, float(row[columnIndex]))
					# int(float(..)) accepts counts written as "12345.0"
					alignment.total_no_of_reads = int(float(row[total_no_of_reads_index]))
					session.add(alignment)
					no_of_total_lines += 1
		logMsg = "%s alignments in total.\n"%(no_of_total_lines)
		sys.stderr.write(logMsg)
		
		if self.logFilename:
			# close the log explicitly so that its content is flushed to disk
			with open(self.logFilename, 'w') as logF:
				logF.write(logMsg)
		
		if self.commit:
			self.db_vervet.session.flush()
			self.db_vervet.session.commit()

Esempio n. 4

0

Mostra file

	def _parseHeader(self):
		"""
		Consume the leading header lines of self.inf.

		Lines are read from self.inf until the "#CHROM" line (or the first line
		that does not start with '#') is reached:
			- every '#'-prefixed line before "#CHROM" is appended, untouched, to self.metaInfoLs;
			- the "#CHROM" line is split on tabs and populates self.header,
				self.headerWithoutHash (first field renamed to 'CHROM'),
				self.sampleIDHeader (fields from self.sampleStartingColumn onwards),
				self.col_name2index and self.col_index_individual_name_ls. Each
				individual name not yet in self.sample_id2index is registered
				there and appended to self.sample_id_ls.
		'ref' is registered as sample index 0 before any line is read.

		NOTE(review): when a non-'#' line ends the loop, that line has already
			been taken from self.inf by the iteration; confirm the callers
			(parseFile/parseIter) do not need it.

		2013.07.17 bugfix, do not reset self.sample_id_ls in the beginning
		2012.3.28
			add all header content into self.metaInfoLs
				except the last header line, which goes into self.sampleIDHeader
		2011-11-2
			this function is run inside __init__()
		"""
		self.metaInfoLs = []	#2012.3.28 anything before the "#CHROM" line. each entry is a raw line content, including '\n'
		self.sampleIDHeader = []	#2012.3.20 a list of column headers (#CHROM)
		
		self.sample_id2index['ref'] = 0	#ref is at column 0. "ref" must not be equal to any read_group.
		self.sample_id_ls.append('ref')
		
		for line in self.inf:
			if line.startswith('#CHROM'):
				row = line.strip().split('\t')	#strip() gets rid of the trailing \n
				self.sampleIDHeader = row[self.sampleStartingColumn:]
				self.header = row[:]
				self.headerWithoutHash = row[:]
				self.headerWithoutHash[0] = 'CHROM'	#discard the #
				self.col_name2index = getColName2IndexFromHeader(self.headerWithoutHash, skipEmptyColumn=True)
				self.col_index_individual_name_ls = self._getIndividual2ColIndex(self.headerWithoutHash, self.col_name2index)
				for _col_index, individual_name in self.col_index_individual_name_ls:
					read_group = individual_name.strip()
					if read_group not in self.sample_id2index:
						self.sample_id2index[read_group] = len(self.sample_id2index)
						self.sample_id_ls.append(read_group)
				break	# "#CHROM" is the last line of the header
			elif line.startswith('#'):	#2011-3-4 meta line. startswith() is also safe on an empty string
				self.metaInfoLs.append(line)
			else:	#leave everything for parseFile or parseIter
				break

Esempio n. 5

0

Mostra file

File: AddChromosomeLengthToTSVFile.py Progetto: mjmontague/vervet-web

	def processHeader(self, reader=None, extendHeader=None, chrLengthHeader = 'chrLength'):
		"""
		Read the header row from reader and return it with the extra output columns appended.

		reader
			an iterator whose first item is the header row (a list of column names)
		extendHeader
			list of column names appended right after the original columns.
			None (the default) or an empty list adds nothing.
		chrLengthHeader
			name used in the "<column>_div_by_<chrLengthHeader>" columns, which are
			appended, when self.divideByLength is true, for every original column
			from index self.divideStartingColumn onwards.

		Side effects: sets self.originalHeader, self.col_name2index (built from the
			header before anything is appended) and self.originalHeaderLength.

		NOTE(review): self.originalHeader is the very same list object as the
			returned header, so it also receives the appended columns; confirm
			whether that aliasing is intended.

		2012.8.7
			modularize so that AddHetFractionToVCFtoolsHWE could inherit
		"""
		header = next(reader)
		self.originalHeader = header
		self.col_name2index = utils.getColName2IndexFromHeader(self.originalHeader, skipEmptyColumn=True)
		self.originalHeaderLength = len(header)
		if extendHeader:	#the default, None, used to crash in header.extend(None)
			header.extend(extendHeader)
		if self.divideByLength:
			# only the original columns are looped over; the range is fixed before header grows
			for i in range(self.divideStartingColumn, self.originalHeaderLength):
				header.append("%s_div_by_%s"%(header[i], chrLengthHeader))
		return header

Esempio n. 6

0

Mostra file

File: UnpackAndAddIndividualSequence2DB.py Progetto: mjmontague/vervet-web

	def getBamBaseFname2MonkeyID_WUSTLDNAData(self, inputFname, ):
		"""
		Return a dictionary mapping the base filename of each bam file to its monkey ID.

		inputFname is a tab-delimited file with a header. The library column
		("Library" or "library") is searched with the pattern \\w+-(\\w+)-\\w+-\\w+
		and the captured group is the monkey ID; rows whose library does not
		match are reported on stderr and skipped. The bam path is taken from the
		first existing column among "Bam Path", "BAM Path", "BAM", "bam pathway",
		and only its final path component is used as the key.

		2011-8-3
			from WUSTL
			the input looks like this:
			#	FlowCell	Lane	Index Sequence	Library	Common Name	Bam Path	MD5
			1	64J6AAAXX	1	VCAC-2007002-1-lib1	African	Green	Monkey	/gscmnt/sata755/production/csf_111215677/gerald_64J6AAAXX_1.bam	/gscmnt/sata755/production/csf_111215677/gerald_64J6AAAXX_1.bam.md5
			2	64J6AAAXX	2	VCAC-2007006-1-lib1	African	Green	Monkey	/gscmnt/sata751/production/csf_111215675/gerald_64J6AAAXX_2.bam	/gscmnt/sata751/production/csf_111215675/gerald_64J6AAAXX_2.bam.md5
		"""
		sys.stderr.write("Getting bamBaseFname2MonkeyID dictionary ...")
		bamBaseFname2MonkeyID = {}
		#monkeyIDPattern = re.compile(r'\w+-(\w+)-\d+-\w+')	# i.e. VCAC-2007002-1-lib1
		monkeyIDPattern = re.compile(r'\w+-(\w+)-\w+-\w+')	# 2012.5.29 i.e. VCAC-VGA00006-AGM0075-lib1 ,
		# VCAC-VZC1014-AGM0055-lib1, VCAC-1996031-VRV0265-lib2a, VCAC-VKD7-361-VKD7-361-lib1 (VKD7 is to be taken),
		
		# the with-block closes the input handle once all rows are read (or on error)
		with open(inputFname) as inf:
			reader = csv.reader(inf, delimiter='\t')
			header = next(reader)
			col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
			
			# the column names vary between deliveries; take the first spelling that exists
			monkeyIDIndex = None
			for columnName in ("Library", "library"):	#2012.6.7 lower-case variant
				monkeyIDIndex = col_name2index.get(columnName)
				if monkeyIDIndex is not None:
					break
			bamFnameIndex = None
			for columnName in ("Bam Path", "BAM Path", "BAM", "bam pathway"):	#2012.2.9, 2012.6.7 variants
				bamFnameIndex = col_name2index.get(columnName)
				if bamFnameIndex is not None:
					break
			
			for row in reader:
				monkeyID = row[monkeyIDIndex]
				pa_search = monkeyIDPattern.search(monkeyID)
				if not pa_search:
					sys.stderr.write("Warning: could not parse monkey ID from %s. Ignore.\n"%(monkeyID))
					continue
				monkeyID = pa_search.group(1)
				bamBaseFname = os.path.basename(row[bamFnameIndex])
				bamBaseFname2MonkeyID[bamBaseFname] = monkeyID
		sys.stderr.write("%s entries.\n"%(len(bamBaseFname2MonkeyID)))
		return bamBaseFname2MonkeyID

Esempio n. 7

0

Mostra file

File: MatrixFile.py Progetto: bopopescu/gwasmodules

	def smartReadHeader(self, headerPattern=None, commentPattern=None):
		"""
		Skip leading comment rows, then try to recognise the header row.

		headerPattern, commentPattern
			compiled regular expressions applied to the first field of a row.
			When None, self.headerPattern / self.commentPattern are used.

		Rows whose first field matches commentPattern are appended to
		self.comment_row_list. The first non-comment row becomes self.header if
		its first field matches headerPattern, in which case self.col_name2index
		is built from it; otherwise self.col_name2index is set to None.
		Returns self.col_name2index.

		Note:
			If an input file does not have a header, this function over-reads by one line (stored in self._row)
			so need to process the last self._row before further reading
		2013.08.30 read the header, while ignoring lines fitting the comment pattern
			and construct col_name2index when a line matching headerPattern is encountered
		"""
		header_re = self.headerPattern if headerPattern is None else headerPattern
		comment_re = self.commentPattern if commentPattern is None else commentPattern
		
		current_row = self.next()
		while True:
			if not comment_re.search(current_row[0]):
				break	#first row that is not a comment
			self.comment_row_list.append(current_row)
			current_row = self.next()
		
		if not header_re.search(current_row[0]):
			# no header in this input; the row just read is a data row
			self.col_name2index = None
		else:
			self.header = current_row
			self.col_name2index = utils.getColName2IndexFromHeader(self.header)
		return self.col_name2index

Esempio n. 8

0

Mostra file

File: GenotypeCallByCoverage.py Progetto: mjmontague/vervet-web

	def discoverFromVCF(cls, inputFname, outputFname, refFastaFname=None, VCFOutputType=2, \
					minMinorAlleleCoverage=1/4., maxMinorAlleleCoverage=3/4.,\
					maxNoOfReads=2., minNoOfReads=1/4., \
					maxNoOfReadsForGenotypingError=1, maxMajorAlleleCoverage=7/8., maxNoOfReadsForAllSamples=1000,\
					nt_set = set(['a','c','g','t','A','C','G','T']), isqID2coverage=None, defaultCoverage=10, \
					outputDelimiter='\t',\
					report=0, site_type=1):
		"""
		Call genotypes from a VCF file by coverage thresholds and output the call matrix.

		inputFname
			tab-delimited VCF file. Its "#CHROM" row defines the columns; other
			rows starting with '#' and blank rows are ignored.
		outputFname, refFastaFname, outputDelimiter
			passed on to cls.outputCallMatrix() together with the call matrix.
		VCFOutputType
			1: output by samtools's vcfutils.pl (outdated, such rows are only counted)
			2: output by GATK (the only type for which genotypes are called)
		minNoOfReads, maxNoOfReads
			a genotype is skipped if its DP is below minNoOfReads*coverage or
			above maxNoOfReads*coverage.
		minMinorAlleleCoverage, maxMinorAlleleCoverage, maxMajorAlleleCoverage
			a 0/1 (or 1/0) call is accepted as heterozygous only if min(AD) lies
			within [minMinorAlleleCoverage*coverage, maxMinorAlleleCoverage*coverage]
			and max(AD) is at most maxMajorAlleleCoverage*coverage; otherwise it
			is recorded as 'NA'.
		isqID2coverage, defaultCoverage
			coverage of a read group is isqID2coverage[isqID], where isqID is the
			integer in the second '_'-separated field of the read group name;
			defaultCoverage is used when isqID2coverage is empty/None, the ID is
			not in it, or the name cannot be parsed.
		site_type
			a locus is kept if the number of distinct non-NA genotypes across
			individuals is greater than site_type-1.
		report
			if true, progress is written to stderr every 2000 rows.
		maxNoOfReadsForGenotypingError, maxNoOfReadsForAllSamples, nt_set
			accepted for interface compatibility; not used by this function.

		2011-9-2
			add argument isqID2coverage, defaultCoverage
		2011-8-26
			add argument site_type
			function is also more robust against missing fields etc.
		2011-7-20
			copied from discoverHetsFromVCF() of vervet.src.misc
		2011-3-24
			add maxMinorAlleleCoverage
			Even a heterozygote's MAC is within [minMinorAlleleCoverage, maxMinorAlleleCoverage], it could still be
				a homozygous SNP.
		2011-3-4
			VCF output by GATK has a different format
			argument VCFOutputType
				1: output by samtools's vcfutils.pl
				2: output by GATK
		2011-1-6
			inputFname is VCF output by "vcfutils.pl varFilter" of samtools
		"""
		import csv
		from pymodule.utils import getColName2IndexFromHeader
		sys.stderr.write("Looking for heterozygous SNPs in %s (%s<=MAC<=%s).\n"%(os.path.basename(inputFname), \
																		minMinorAlleleCoverage, maxMinorAlleleCoverage))
		
		read_group2col_index = {'ref':0}	#ref is at column 0. "ref" must not be equal to any read_group.
		read_group2coverage = {}	#2011-9-2
		locus_id2row_index = {}
		data_matrix = []
		refNameSet = set()	#names of the references which have rows in the VCF
		
		individual_name2col_index = None
		col_name2index = None
		counter = 0
		real_counter = 0
		
		# the with-block closes the input handle even when a row fails to parse
		with open(inputFname) as inf:
			reader = csv.reader(inf, delimiter='\t')
			for row in reader:
				if not row:	#blank line, csv.reader yields an empty list
					continue
				if row[0] =='#CHROM':
					row[0] = 'CHROM'	#discard the #
					header = row
					col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
					individual_name2col_index = cls.getIndividual2ColIndex(header, col_name2index)
					continue
				elif row[0].startswith('#'):	#2011-3-4 meta line
					continue
				
				chromosome = row[0]
				refNameSet.add(chromosome)
				pos = row[1]
				current_locus = '%s_%s'%(chromosome, pos)
				refBase = row[col_name2index['REF']]
				altBase = row[col_name2index['ALT']]
				if VCFOutputType==2:	#2011-3-4 GATK
					format_column_ls = row[col_name2index['FORMAT']].split(':')
					format_column_name2index = getColName2IndexFromHeader(format_column_ls)
					# these depend on the row's FORMAT only, so look them up once per row
					genotype_call_index = format_column_name2index.get('GT')
					depth_index = format_column_name2index.get("DP")
					allele_depth_index = format_column_name2index.get('AD')
					no_of_format_fields = len(format_column_name2index)
					
					data_row = ['NA']*(len(individual_name2col_index)+1)	# extra 1 for the ref
					allele2count = {}
					for individual_name, individual_col_index in individual_name2col_index.items():
						read_group = individual_name
						if read_group not in read_group2col_index:
							# first encounter: assign an output column and figure out the coverage
							read_group2col_index[read_group] = len(read_group2col_index)
							coverage = defaultCoverage
							if isqID2coverage:	#2011-9-2
								try:
									isqID = int(read_group.split('_')[1])
									coverage = isqID2coverage.get(isqID, defaultCoverage)
								except Exception:	#best effort: report and fall back to the default
									sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
									import traceback
									traceback.print_exc()
									sys.stderr.write("Coverage for %s not available. use default=%s.\n"%(read_group, defaultCoverage))
									coverage = defaultCoverage
							read_group2coverage[read_group] = coverage
						
						coverage = read_group2coverage[read_group]
						genotype_data_ls = row[individual_col_index].split(':')
						if len(genotype_data_ls)<no_of_format_fields:	#incomplete genotype entry
							continue
						if depth_index is None or genotype_call_index is None:
							continue
						genotype_call = genotype_data_ls[genotype_call_index]
						depth = int(genotype_data_ls[depth_index])
						if depth>maxNoOfReads*coverage or depth<minNoOfReads*coverage:	#2011-3-29 skip. coverage too high or too low
							continue
						allele = 'NA'
						if genotype_call=='0/1' or genotype_call =='1/0':	#heterozygous, the latter notation is never used though.
							# a list, not map(): it is consumed by both min() and max()
							AD = [int(alleleDepth) for alleleDepth in genotype_data_ls[allele_depth_index].split(',')]
							minorAlleleCoverage = min(AD)
							majorAlleleCoverage = max(AD)
							if minorAlleleCoverage<=maxMinorAlleleCoverage*coverage and minorAlleleCoverage>=minMinorAlleleCoverage*coverage \
									and majorAlleleCoverage<=maxMajorAlleleCoverage*coverage:
								allele = '%s%s'%(refBase, altBase)
						elif genotype_call=='./.':	#missing
							continue
						elif genotype_call =='1/1':
							allele = '%s%s'%(altBase, altBase)
						elif genotype_call =='0/0':
							allele = '%s%s'%(refBase, refBase)
						col_index = read_group2col_index.get(read_group)
						data_row[col_index] = allele
						if allele!='NA':
							if allele not in allele2count:
								allele2count[allele] = 0
							allele2count[allele] += 1
					
					if len(allele2count)>site_type-1:	#whether polymorphic across samples or all sites in vcf
						real_counter += 1
						locus_id2row_index[current_locus] = len(locus_id2row_index)
						data_matrix.append(data_row)
				
				counter += 1
				if counter%2000==0 and report:
					sys.stderr.write("%s\t%s\t%s"%("\x08"*80, counter, real_counter))
		
		cls.outputCallMatrix(data_matrix, refFastaFname, outputFname=outputFname, refNameSet=refNameSet, \
					read_group2col_index=read_group2col_index, \
					locus_id2row_index=locus_id2row_index, outputDelimiter=outputDelimiter)
		
		sys.stderr.write("%s\t%s\t%s.\n"%("\x08"*80, counter, real_counter))

Esempio n. 9

0

Mostra file

File: PutDOCOutput2DB.py Progetto: mjmontague/vervet-web

	def run(self):
		"""
		Copy depth-of-coverage statistics from the input files onto IndividualAlignment db entries.

		Every file in self.inputFnameLs is parsed as a delimited table whose
		header provides the columns alignmentID, total_base_count, meanDepth,
		medianDepth and modeDepth. Rows whose alignmentID field is 'Total' are
		skipped. The alignment ID is the integer before the first '_' of that
		field. The matching alignment is fetched through
		VervetDB.IndividualAlignment.get(), its pass_qc_read_base_count,
		mean_depth, median_depth and mode_depth are overwritten and the object is
		added to the session. The number of processed rows is reported on stderr
		and, if self.logFilename is set, written to that file. The session is
		flushed and committed only when self.commit is true.

		2012.5.7 new input looks like this (tab-delimited):
			alignmentID     total_base_count        sampled_base_count      meanDepth       medianDepth     modeDepth
			100     1005506 301614  70.0441756682   9.0     8.0
			27     1005506 301614  70.0441756682   9.0     8.0

		2012.4.3
			each input looks like this:
			
			sample_id       total   mean    granular_third_quartile granular_median granular_first_quartile %_bases_above_15
			553_2_VRC_ref_GA_vs_524 2434923137      8.25    11      9       6       4.4
			Total   2434923137      8.25    N/A     N/A     N/A
			554_3_Barbados_GA_vs_524        2136011136      7.23    11      8       6       3.5
			Total   2136011136      7.23    N/A     N/A     N/A
			...

		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		session.begin()
		
		no_of_total_lines = 0
		for inputFname in self.inputFnameLs:
			# the with-block closes the input handle even when a row fails to parse
			with open(inputFname) as inf:
				reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
				header = next(reader)
				col_name2index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
				
				sample_id_index = col_name2index.get("alignmentID")
				total_base_count_index = col_name2index.get('total_base_count')
				mean_depth_index = col_name2index.get("meanDepth")
				median_depth_index = col_name2index.get("medianDepth")
				mode_depth_index = col_name2index.get("modeDepth")
				for row in reader:
					sample_id = row[sample_id_index]
					if sample_id=='Total':	#ignore rows with this as sample id
						continue
					alignment_id = int(sample_id.split("_")[0])
					individual_alignment = VervetDB.IndividualAlignment.get(alignment_id)
					#2012.9.17 no longer trustworthy because CalculateMedianMeanOfInputColumn skips data.
					individual_alignment.pass_qc_read_base_count = int(row[total_base_count_index])
					individual_alignment.mean_depth = float(row[mean_depth_index])
					individual_alignment.median_depth = float(row[median_depth_index])
					individual_alignment.mode_depth = float(row[mode_depth_index])
					session.add(individual_alignment)
					no_of_total_lines += 1
		logMsg = "%s alignments in total.\n"%(no_of_total_lines)
		sys.stderr.write(logMsg)
		
		if self.logFilename:
			# close the log explicitly so that its content is flushed to disk
			with open(self.logFilename, 'w') as logF:
				logF.write(logMsg)
		
		if self.commit:
			self.db_vervet.session.flush()
			self.db_vervet.session.commit()

Esempio n. 10

0

Mostra file

def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls, sample_id2index, minDepth=1,\
				dataEntryType=1):
	"""
	Parse one VCF data line into a PassingData object describing the locus.

	row
		list of the input columns of one VCF line. Columns 0-8 are read by
		position (chromosome, position, ID, -, -, QUAL, FILTER, INFO, FORMAT);
		REF, ALT and FORMAT are additionally located through col_name2index.
	col_name2index
		column name -> column index, must contain 'REF', 'ALT' and 'FORMAT'.
	col_index_individual_name_ls
		list of (column index, individual name) for the genotype columns.
	sample_id2index
		individual name -> index in data_row. Individuals not yet in it are
		added in place with the next free index.
	minDepth
		if >0, a genotype whose DP is below it is set to 'NA'.
	dataEntryType
		1: each cell of data_row is a base call (i.e. 'AG', or 'NA' if missing)
		2: each cell is a dictionary {'GT': base-call, 'DP': depth}; 'DP' is only
			present for non-missing genotypes whose FORMAT includes DP.

	data_row[0] stands for the reference: refBase for dataEntryType 1,
	{'GT': refBase, 'DP': -1} otherwise. INFO entries that are not of the form
	tag=value are left out of info_tag2value.

	2014.01.08 fix a bug that skips calls and shortens data_row. 
	2012.9.6 turn pos into integer
	2012.5.10
		complete representation of one locus
	2012.1.17
		common snippet split out of VCFFile & VCFRecord
	"""
	chromosome = row[0]
	pos = int(row[1])	#2012.9.6 turn pos into integer
	vcf_locus_id = row[2]
	quality = row[5]
	filter_value = row[6]	#local names avoid shadowing the builtins filter/format
	info = row[7]
	format_value = row[8]
	
	info_tag2value = {}
	for info_entry in info.split(';'):
		try:
			tag, value = info_entry.split('=')
		except ValueError:	#entry is not exactly tag=value, i.e. the flag "DS"
			continue
		info_tag2value[tag] = value
	
	locus_id = (chromosome, pos)
	refBase = row[col_name2index['REF']]
	altBase = row[col_name2index['ALT']]
	
	altBaseLs = altBase.split(',')	#altBase could be just "C" or "C,G" (multi-nucleotide)
	alleleLs = [refBase] + altBaseLs
	alleleNumber2Base = {'.':'NA'}
	for alleleNumber, base in enumerate(alleleLs):
		alleleNumber2Base[repr(alleleNumber)] = base
	
	format_column = row[col_name2index['FORMAT']]
	format_column_ls = format_column.split(':')
	format_column_name2index = getColName2IndexFromHeader(format_column_ls)
	# these depend on the FORMAT column only, so look them up once for all individuals
	genotype_call_index = format_column_name2index.get('GT')
	depth_index = format_column_name2index.get("DP")
	
	if dataEntryType==1:
		data_row = ['NA']*(len(col_index_individual_name_ls)+1)	# extra 1 for the ref
		data_row[0] = refBase
	else:
		data_row = [None]*(len(col_index_individual_name_ls)+1)	# extra 1 for the ref
		data_row[0] = {'GT':refBase, 'DP':-1}
	genotypeCall2Count = {}
	for individual_col_index, individual_name in col_index_individual_name_ls:
		if individual_name not in sample_id2index:
			sample_id2index[individual_name] = len(sample_id2index)
		
		genotype_data_ls = row[individual_col_index].split(':')
		genotypeCallInBase = 'NA'
		# a fresh dictionary for every individual. It used to be bound only inside the
		# GT branch, which left it undefined below when FORMAT has no GT field.
		callData = {}
		if genotype_call_index is not None:
			if len(genotype_data_ls)>genotype_call_index:
				genotype_call = genotype_data_ls[genotype_call_index]
			else:	#this genotype entry is truncated, treat it as missing
				genotype_call = './.'
			if genotype_call not in ('./.', '.', '.|.'):	#not missing data
				patternSearchResult = diploidGenotypePattern.search(genotype_call)
				if patternSearchResult:
					allele1 = alleleNumber2Base[patternSearchResult.group(1)]
					allele2 = alleleNumber2Base[patternSearchResult.group(2)]
					if allele1!='N' and allele2!='N':
						genotypeCallInBase = '%s%s'%(allele1, allele2)
				if depth_index is not None:
					if len(genotype_data_ls)>depth_index:
						depth = genotype_data_ls[depth_index]
					else:
						depth = '.'	#missing DP
					if depth=='.':	#this means depth=0
						depth = 0
					else:
						depth = int(depth)
					if minDepth>0 and depth<minDepth:	#no read. samtools would still assign ref/ref to this individual
						genotypeCallInBase = 'NA'	#set it to missing
					callData['DP'] = depth
		
		col_index = sample_id2index.get(individual_name)
		if dataEntryType==1:
			data_row[col_index] = genotypeCallInBase
		else:
			callData['GT'] = genotypeCallInBase
			data_row[col_index] = callData
		if genotypeCallInBase!='NA':
			if genotypeCallInBase not in genotypeCall2Count:
				genotypeCall2Count[genotypeCallInBase] = 0
			genotypeCall2Count[genotypeCallInBase] += 1
	return PassingData(chr=chromosome, chromosome=chromosome, pos=pos, position=pos, locus_id=locus_id, quality=quality, \
					info_tag2value=info_tag2value, \
					refBase=refBase, altBase=altBase, \
					alleleLs=alleleLs, alleleNumber2Base=alleleNumber2Base, genotypeCall2Count=genotypeCall2Count, data_row=data_row,\
					info=info, format=format_value, filter=filter_value, vcf_locus_id=vcf_locus_id, \
					format_column_name2index=format_column_name2index, format_column_ls=format_column_ls)