def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,\
											maxContigNumber=None):
		"""
		2012.5.10
			Copy a VCF file while replacing every sample ID in its header with the
			individual_code that VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB()
			parses out of it.
		
		Arguments:
			inputFname: path of the VCF file to read.
			outputFname: path of the VCF file to write.
			minDepth: passed to the input VCFFile; in addition any call that is None or
				whose 'DP' entry (0 if absent) is below this value is written as './.'.
			includeIndels: if false, records whose refBase or altBase is not exactly
				one base long are skipped.
			maxContigNumber: if set (and non-zero), records whose contig number
				(extracted by self.contig_number_pattern) exceeds it are skipped.
		
		Progress and the final locus counts are reported on stderr.
		"""
		sys.stderr.write("Converting alignment read groups in %s into UCLA IDs ...\n"%(inputFname))
		
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		outVCFFile = None
		counter = 0	#all records seen
		real_counter = 0	#records that passed the filters and were written
		try:
			#replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
			newSampleIDHeader = []
			for sampleID in vcfFile.sampleIDHeader:
				readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
				newSampleIDHeader.append(readGroupData.individual_code)
			#columns before the samples are kept as they are
			newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader
			
			outVCFFile = VCFFile(outputFname=outputFname)
			outVCFFile.metaInfoLs = vcfFile.metaInfoLs
			outVCFFile.header = newHeader
			outVCFFile.writeMetaAndHeader()
			for vcfRecord in vcfFile.parseIter():
				counter += 1
				if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
					#it's an indel if refBase or altBase is not just one base
					continue
				
				if maxContigNumber:
					contigNumber = int(self.contig_number_pattern.search(vcfRecord.chr).group('contigNumber'))
					if contigNumber>maxContigNumber:
						continue
				real_counter += 1
				# set genotype whose depth is below minDepth to ./. (=missing)
				for i in xrange(1, len(vcfRecord.data_row)):	#[0] is the ref base
					callData = vcfRecord.data_row[i]
					if callData is None or callData.get('DP',0)<minDepth:
						#data_row is shifted by one relative to the sample columns of row
						vcfRecord.row[i+vcfFile.sampleStartingColumn-1] = './.'
				outVCFFile.writeVCFRecord(vcfRecord)
		finally:
			#release both file handles even if parsing/writing fails half-way
			vcfFile.close()
			if outVCFFile is not None:
				outVCFFile.close()
		
		sys.stderr.write("%s (out of %s) loci.\n"%(real_counter, counter))
	def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False, maxContigNumber=1000):
		"""
		2012.5.10
			Two things in Nam's VCF file are to be modified. 
				1. extract VRC UCLAID from its sample ID
				2. replace vervet1_scaffolds_Contig137 with simply "Contig137"
			The records are split into one output file per contig, <outputDir>/<contigID>.vcf.
		
		Arguments:
			inputFname: path of the VCF file to read.
			outputDir: folder in which the per-contig VCF files are created.
			minDepth: passed to the input VCFFile; in addition any call that is None or
				whose 'DP' entry (0 if absent) is below this value is written as './.'.
			includeIndels: if false, records whose refBase or altBase is not exactly
				one base long are skipped.
			maxContigNumber: if set (and non-zero), records whose contig number
				(extracted by self.contig_number_pattern) exceeds it are skipped.
		
		Progress and the final locus/contig counts are reported on stderr.
		"""
		sys.stderr.write("Splitting %s into single-chromosome VCF files in %s ...\n"%(inputFname, outputDir))
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		chr2outVCFFile = {}	#contig ID -> its (open) output VCFFile
		counter = 0	#all records seen
		real_counter = 0	#records that passed the filters and were written
		try:
			#replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
			newSampleIDHeader = []
			for sampleID in vcfFile.sampleIDHeader:
				newSampleIDHeader.append(self.UCLAID_Pattern.search(sampleID).group('UCLAID'))
			#new header for every output contig
			newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader
			
			for vcfRecord in vcfFile.parseIter():
				counter += 1
				if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
					#it's an indel if refBase or altBase is not just one base
					continue
				
				contigID = self.contig_id_pattern.search(vcfRecord.chr).group('contigID')
				if maxContigNumber:
					contigNumber = int(self.contig_number_pattern.search(contigID).group('contigNumber'))
					if contigNumber>maxContigNumber:
						continue
				real_counter += 1
				vcfRecord.chr = contigID
				
				outVCFFile = chr2outVCFFile.get(contigID)
				if outVCFFile is None:	#first record of this contig, open its output file
					outVCFFile = VCFFile(outputFname=os.path.join(outputDir, '%s.vcf'%(contigID)))
					#register it right away so that it gets closed even if writing the header fails
					chr2outVCFFile[contigID] = outVCFFile
					outVCFFile.metaInfoLs = vcfFile.metaInfoLs
					outVCFFile.header = newHeader
					outVCFFile.writeMetaAndHeader()
				
				# set genotype whose depth is below minDepth to ./. (=missing)
				for i in xrange(1, len(vcfRecord.data_row)):	#[0] is the ref base
					callData = vcfRecord.data_row[i]
					if callData is None or callData.get('DP',0)<minDepth:
						#data_row is shifted by one relative to the sample columns of row
						vcfRecord.row[i+vcfFile.sampleStartingColumn-1] = './.'
				outVCFFile.writeVCFRecord(vcfRecord)
		finally:
			#release the input and all output files even if parsing/writing fails half-way
			vcfFile.close()
			for outVCFFile in chr2outVCFFile.values():
				outVCFFile.close()
		
		sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n"%(real_counter, counter, len(chr2outVCFFile)))
	# Example no. 3
	# 0
	def extractSamples(self, db_vervet=None, inputFname=None, outputFname=None, \
					tax_id_set=None, site_id_set=None, country_id_set=None, \
					min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
					**keywords):
		"""
		2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
		2013.04.30 added argument min_coverage, max_coverage
		2012.10.10
			added argument outputFormat. 
		2012.10.5
			Select the samples of a VCF file whose alignment passes db_vervet.filterAlignments()
			and output either a VCF restricted to them or just the list of their IDs.
		
		Arguments:
			db_vervet: database object providing parseAlignmentReadGroup() and filterAlignments().
			inputFname: path of the VCF file to read.
			outputFname: path of the file to write.
			tax_id_set, site_id_set, country_id_set, min_coverage, max_coverage, is_contaminated:
				filter criteria, handed to db_vervet.filterAlignments().
			outputFormat:
				1: a VCF file that contains only the selected sample columns.
				2: a list of selected sample IDs, one per line, preceded by a "sampleID" header line.
				3: same as 2 but without the header line.
				Any other value writes no output (a warning is printed to stderr).
		
		The program exits with status 3 if a sample has no individualAlignment.
		"""
		sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
							(inputFname,\
							getattr(site_id_set, '__len__', returnZeroFunc)(),\
							getattr(country_id_set, '__len__', returnZeroFunc)(),\
							getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
							outputFormat, is_contaminated ))
		vcfFile = VCFFile(inputFname=inputFname)
		no_of_samples = 0
		no_of_snps = 0
		try:
			oldHeader = vcfFile.header
			oldHeaderLength = len(oldHeader)
			newHeader = oldHeader[:vcfFile.sampleStartingColumn]	#anything before the samples are same
			col_index2sampleID = {}	#this structure stores the selected samples and their column index 
			for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
				individualAlignment = db_vervet.parseAlignmentReadGroup(individual_name).individualAlignment
				if individualAlignment is None:
					#fatal despite the "Warning" label: the run is aborted right here
					sys.stderr.write("Warning: no individualAlignment for sample %s.\n"%(individual_name))
					sys.exit(3)
				filteredAlignmentList = db_vervet.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
						max_coverage=max_coverage, individual_site_id=None, \
						sequence_filtered=None, individual_site_id_set=site_id_set, \
						mask_genotype_method_id=None, parent_individual_alignment_id=None,\
						country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
						is_contaminated=is_contaminated, excludeTissueIDSet=None,\
						local_realigned=None, reduce_reads=None, report=False)
				if filteredAlignmentList:	#non-empty, passed the filter
					newHeader.append(individual_name)
					no_of_samples += 1
					col_index2sampleID[col_index] = individual_name
			
			if outputFormat==1:
				#the selected columns are the same for every record, so find them only once
				selectedColIndexLs = [i for i in xrange(vcfFile.sampleStartingColumn, oldHeaderLength) \
									if i in col_index2sampleID]
				outVCFFile = VCFFile(outputFname=outputFname)
				try:
					outVCFFile.metaInfoLs = vcfFile.metaInfoLs
					outVCFFile.header = newHeader
					outVCFFile.writeMetaAndHeader()
					
					for vcfRecord in vcfFile:
						data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
						for i in selectedColIndexLs:
							data_row.append(vcfRecord.row[i])
						outVCFFile.writer.writerow(data_row)
						no_of_snps += 1
				finally:
					outVCFFile.close()
			elif outputFormat in [2,3]:
				with open(outputFname, 'w') as outf:
					if outputFormat==2:
						outf.write("sampleID\n")
					#in column order, so that the output does not depend on the dictionary's internal order
					for col_index in sorted(col_index2sampleID):
						outf.write("%s\n"%(col_index2sampleID[col_index]))
			else:
				sys.stderr.write("Warning: outputFormat %s is not supported, nothing is written to %s.\n"%\
								(outputFormat, outputFname))
		finally:
			#release the input file even if filtering/writing fails or the program exits
			vcfFile.close()
		sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))
	def replicateVCFGenotypeColumns(self, inputFname, outputFname=None, replicateIndividualTag=None, sampleID2FamilyCount=None,\
								minDepth=0):
		"""
		2012.10.5 remove argument sampleStartingColumn
		2012.5.10
			VCFFile has been changed considerably and can act as a writer now.
		2012.3.29
			Copy a VCF file while duplicating the genotype column of some samples.
			Every original sample column is renamed to <sampleID><replicateIndividualTag>1 and,
			for a sample whose family count is k, k-1 extra columns
			(<sampleID><replicateIndividualTag>2 ... k) holding the same genotype data
			are appended after the original columns.
		
		Arguments:
			inputFname: path of the VCF file to read.
			outputFname: path of the VCF file to write.
			replicateIndividualTag: string inserted between the sample ID and the copy number.
			sampleID2FamilyCount: dictionary, sample ID -> number of copies wanted.
				Samples absent from the VCF header are ignored. None means no extra copies.
			minDepth: passed to the input VCFFile.
		
		The final sample/SNP counts are reported on stderr.
		"""
		sys.stderr.write("Replicating some genotype columns in %s ...\n"%(inputFname))
		if sampleID2FamilyCount is None:	#nothing to replicate, columns are only renamed
			sampleID2FamilyCount = {}
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		outVCFFile = None
		no_of_samples = 0
		no_of_snps = 0
		try:
			outVCFFile = VCFFile(outputFname=outputFname)
			outVCFFile.metaInfoLs = vcfFile.metaInfoLs
			
			#modify the sample-id header line 
			sampleID2DataIndexLs = {}	#sample ID -> column indices of all its copies, original one first
			oldHeader = vcfFile.header
			oldHeaderLength = len(oldHeader)
			newHeader = oldHeader[:vcfFile.sampleStartingColumn]	#anything before the samples are same
			for i in xrange(vcfFile.sampleStartingColumn, oldHeaderLength):
				sample_id = oldHeader[i].strip()
				newHeader.append('%s%s%s'%(sample_id, replicateIndividualTag, 1))	#1 because it's the 1st copy
				no_of_samples += 1
				sampleID2DataIndexLs[sample_id] = [i]	#1st copy for this sample
			
			#add additional column headers based on each one's occurrence
			#extraColSourceIndexLs[j] is the original column to be copied into the j-th extra column
			extraColSourceIndexLs = []
			for sample_id, familyCount in sampleID2FamilyCount.iteritems():
				dataIndexLs = sampleID2DataIndexLs.get(sample_id)
				if dataIndexLs is None:	#this sample is not in the VCF file
					continue
				for _ in xrange(1, familyCount):
					no_of_samples += 1
					dataIndexLs.append(len(newHeader))
					extraColSourceIndexLs.append(dataIndexLs[0])
					replicate_order = len(dataIndexLs)
					newHeader.append("%s%s%s"%(sample_id, replicateIndividualTag, replicate_order))
			outVCFFile.header = newHeader
			outVCFFile.writeMetaAndHeader()
			
			for vcfRecord in vcfFile.parseIter():
				data_row = vcfRecord.row
				#2013.09.13 replace all "./." with full NA formating i.e. "./.:.:.:.", pending fields in the "format" column
				naGenotype = None	#depends only on the record's format column, so built at most once per record
				for i in xrange(vcfRecord.sampleStartingColumn, len(data_row)):
					if data_row[i]=='./.':	#2013.09.15 expand this NA genotype for TrioCaller
						if naGenotype is None:
							field_value_ls = []
							for format_field in vcfRecord.format_column_ls:
								if format_field=='GT':
									field_value_ls.append('./.')
								elif format_field=='PL':	#for TrioCaller
									field_value_ls.append('.,.,.')
								else:
									field_value_ls.append('.')
							naGenotype = ':'.join(field_value_ls)
						data_row[i] = naGenotype
				for sourceIndex in extraColSourceIndexLs:	#add more genotype copies for those extra columns
					data_row.append(data_row[sourceIndex])
				outVCFFile.writer.writerow(data_row)
				no_of_snps += 1
		finally:
			#release both file handles even if parsing/writing fails half-way
			if outVCFFile is not None:
				outVCFFile.close()
			vcfFile.close()
		sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))