Python VCFFile Examples, pymodule.VCFFile.VCFFile Python Examples

Example #1

0

Show file

File: GenotypeCallByCoverage.py Project: mjmontague/vervet-web

	def discoverFromVCFWithoutFilter(self, inputFname=None, outputFname=None, **keywords):
		"""
		Parse a whole VCF file and pass its genotype call matrix to self.outputCallMatrix().
		
		inputFname: path of the VCF file, read through VCFFile with minDepth=self.minDepth
		outputFname: forwarded unchanged to self.outputCallMatrix()
		
		2012.9.11
			read minDepth from self.minDepth
		2012.9.5
			add minDepth=0 to VCFFile
		2012.8.20
			locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos,
			so the keys are converted before they are handed on
		2012.5.8
		"""
		vcf = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
		vcf.parseFile()
		
		#re-key the locus index: (chr, pos) tuple -> "chr_pos" string
		chr_pos2row_index = dict(('%s_%s'%(key[0], key[1]), row_index) \
								for key, row_index in vcf.locus_id2row_index.items())
		
		self.outputCallMatrix(vcf.genotype_call_matrix, refFastaFname=None, outputFname=outputFname, \
					refNameSet=None, read_group2col_index=vcf.sample_id2index, \
					locus_id2row_index=chr_pos2row_index, outputDelimiter=self.outputDelimiter)

Example #2

0

Show file

File: hsCreateMetadataMatrix.py Project: mjmontague/vervet-web

	def run(self):
		"""
		Write a tab-delimited metadata matrix for the samples of one genotype VCF file.
		
		The genotype file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. For every sample (read group) of that VCF the
		matching individual is fetched and six rows are written to self.outputFname, each
		starting with its label: sampleID, ucla_id, tax_id, country_id, longitude, latitude.
		
		Exits with status 2 if the database has no such genotype file. Writes nothing if
		the registered file does not exist on disk.
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return
		from pymodule.VCFFile import VCFFile
		
		#allow 0 depth-> no missing data
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		
		#every row starts with its label, followed by one value per sample
		sampleid_row = ['sampleID']
		uclaid_row = ['ucla_id']
		speciesid_row = ['tax_id']
		countryid_row = ['country_id']
		longitude_row = ['longitude']
		latitude_row = ['latitude']
		for sampleID in vcfFile.getSampleIDList():
			individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
			sampleid_row.append(sampleID)
			uclaid_row.append(individual.ucla_id)
			speciesid_row.append(individual.tax_id)
			countryid_row.append(individual.site.country_id)
			longitude_row.append(individual.longitude)
			latitude_row.append(individual.latitude)
		
		#open the output only after all database lookups succeeded, and let the context
		#manager flush and close it instead of relying on garbage collection
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			for row in (sampleid_row, uclaid_row, speciesid_row, countryid_row, longitude_row, latitude_row):
				writer.writerow(row)

Example #3

0

Show file

File: hsExtractDataTool.py Project: mjmontague/vervet-web

	def selectSubPopNoDB(self,columnindexlist,ind_id_ls,vcffilename):
		"""
		Build a 0/1/2 genotype matrix for selected sample columns of a VCF file (no database access).
		
		columnindexlist: indices of the wanted samples within the VCF sample list
		ind_id_ls: stored as-is (converted to a numpy array) in the returned structure;
			presumably the ids of the individuals behind columnindexlist -- verify against caller
		vcffilename: path of the VCF file
		
		Returns a hsContigDataStruct holding per-locus chromosome, position, ref and alt base
		arrays plus "data", a float array with one row per locus and one column per selected
		sample: 0 = both alleles equal the reference base, 1 = exactly one of them does,
		2 = neither does, -9 = missing call.
		Returns None if vcffilename is not an existing file.
		
		2012.9.19
		"""
		if not os.path.isfile(vcffilename):
			return None
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=vcffilename, minDepth=0)
		#result unused here; the call is kept from the original
		#NOTE(review): confirm getSampleIDList() has no required side effect before removing it
		vcfFile.getSampleIDList()
		chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
		datalist=[]
		for vcfRecord in vcfFile:
			refBase = vcfRecord.refBase
			chrom_ls.append(vcfRecord.chr)
			snp_pos_ls.append(vcfRecord.pos)
			ref_ls.append(refBase)
			alt_ls.append(vcfRecord.altBase)
			data_row=[]
			for columnIndex in columnindexlist:
				#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
				vcfCall = vcfRecord.data_row[columnIndex+1]
				if not vcfCall:
					gt=-9
				elif vcfCall['GT'][0]==refBase and vcfCall['GT'][1]==refBase:
					gt=0
				elif vcfCall['GT'][0]==refBase or vcfCall['GT'][1]==refBase:
					gt=1
				else:
					gt=2
				data_row.append(gt)
			datalist.append(data_row)
		sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnindexlist)))
		#the builtin float replaces np.float, an alias that newer numpy releases no longer provide
		data=np.array(datalist,dtype=float)
		datastruct=hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),ref_ls=np.array(ref_ls),snp_pos_ls=np.array(snp_pos_ls),alt_ls=np.array(alt_ls), data=data)
		return datastruct

Example #4

0

Show file

File: hsExtractDataTool.py Project: mjmontague/vervet-web

	def getVCFInd(self,uclaidlist):
		"""
		Find the VCF sample columns that belong to the individuals listed in uclaidlist.
		
		The genotype VCF file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. Every read group of that file is mapped to its
		individual; those whose ucla_id is in uclaidlist are kept.
		
		Returns (columnIndexList, ind_id_ls): the indices of the kept samples within the
		VCF sample list and, in the same order, their ucla_ids.
		Returns None if the registered file does not exist on disk.
		Exits with status 2 if the database has no such genotype file.
		
		2012.9.19
		"""
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			#do not leave the session open on this path either
			session.close()
			return None
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		columnIndexList = []
		ind_id_ls = []
		#getSampleIDList() returns the read-group names
		for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
			individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
			uclaid = individualAlignment.ind_sequence.individual.ucla_id
			if uclaid in uclaidlist:
				columnIndexList.append(i)
				ind_id_ls.append(uclaid)
		session.close()
		return (columnIndexList,ind_id_ls)

Example #5

0

Show file

File: PlotVCFAAF_vs_Position.py Project: mjmontague/vervet-web

	def getLocusAndData(self, inputFname, VCFOutputType=2):
		"""
		Collect position and alternative-allele frequency of every locus in one VCF file.
		
		inputFname: path of a file ending in "vcf" or "vcf.gz"; any other file is skipped
		VCFOutputType: accepted for interface compatibility, not used here
		
		Returns None for a non-VCF file name, otherwise a PassingData with
			contig_id: digits following "Contig" in inputFname, or the base name without
				its last extension if that pattern is absent
			locus_ls, xData_ls, yData_ls: locus id, integer position and float frequency
				(INFO tag "AF", falling back to "AF1") of each locus that has a frequency
		
		2011-9-21
		"""
		if not inputFname.endswith(('vcf.gz', 'vcf')):
			return None
		sys.stderr.write("%s ..."%inputFname)
		contig_id_pattern_sr = re.search(r'Contig(\d+).*', inputFname)
		if contig_id_pattern_sr:
			contig_id = contig_id_pattern_sr.group(1)
		else:
			contig_id = os.path.splitext(os.path.split(inputFname)[1])[0]
		
		#read the file that was validated above; the original opened self.inputFname here,
		#ignoring the inputFname argument
		vcfFile = VCFFile(inputFname=inputFname)
		
		locus_ls = []
		xData_ls = []
		yData_ls = []
		
		for vcfRecord in vcfFile.parseIter():
			info_tag2value = vcfRecord.info_tag2value
			AF1 = info_tag2value.get("AF", info_tag2value.get("AF1", None))
			#loci without a (non-empty) frequency are skipped
			if AF1:
				locus_ls.append(vcfRecord.locus_id)
				xData_ls.append(int(vcfRecord.pos))
				yData_ls.append(float(AF1))
		
		sys.stderr.write("%s loci. Done.\n"%(len(yData_ls)))
		return PassingData(contig_id=contig_id, locus_ls=locus_ls, yData_ls=yData_ls, xData_ls=xData_ls)

Example #6

0

Show file

File: hsExtractDataTool.py Project: mjmontague/vervet-web

	def createMetadataMat(self):
		"""
		Fill self.metadata with per-sample metadata of one genotype VCF file.
		
		The genotype file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. self.metadata becomes a list of five rows
		[ucla_id, country_id, tax_id, longitude, latitude]; each row starts with its
		label followed by one value per sample, in the sample order of the VCF.
		
		Exits with status 2 if the database has no such genotype file. Leaves
		self.metadata untouched if the registered file does not exist on disk.
		"""
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			#do not leave the session open on this path either
			session.close()
			return
		from pymodule.VCFFile import VCFFile
		
		#allow 0 depth-> no missing data
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		uclaIDList = ['ucla_id']
		countryid_row = ['country_id']
		speciesid_row = ['tax_id']
		longitudeList = ['longitude']
		latitudeList = ['latitude']
		for sampleID in vcfFile.getSampleIDList():
			individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
			uclaIDList.append(individual.ucla_id)
			countryid_row.append(individual.site.country_id)
			speciesid_row.append(individual.tax_id)
			longitudeList.append(individual.longitude)
			latitudeList.append(individual.latitude)
		self.metadata = [uclaIDList,countryid_row,speciesid_row,longitudeList,latitudeList]
		session.close()

Example #7

0

Show file

File: CalculateSNPMismatchRateOfTwoVCF.py Project: mjmontague/vervet-web

	def run(self):
		"""
		Compare the genotype calls of two VCF files and output the per-locus mismatch rate.
		
		Both files (self.inputFname, self.jnputFname) are parsed completely. For every locus
		present in both, the calls of the samples present in both are compared; pairs in which
		either call is 'NA' are ignored. mismatchRate = mismatches / compared pairs, or -1 when
		no pair could be compared.
		
		Loci with mismatchRate<=self.maxMismatchRate are written, sorted by locus id, to
		self.outputFname as tab-delimited rows (chromosome, position, mismatchRate) below a
		header line. Note that the -1 placeholder passes any non-negative maxMismatchRate.
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		vcfFile1 = VCFFile(inputFname=self.inputFname)
		vcfFile1.parseFile()
		vcfFile2 = VCFFile(inputFname=self.jnputFname)
		vcfFile2.parseFile()
		
		overlapping_sites_set = set(vcfFile1.locus_id_ls)&set(vcfFile2.locus_id_ls)
		
		no_of_samples = len(vcfFile1.sample_id2index)
		no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
		if no_of_samples!=no_of_samples_in_vcf2:
			sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
							(self.inputFname, no_of_samples, self.jnputFname, no_of_samples_in_vcf2))
		
		#column index pair (file 1, file 2) of every sample shared by both files
		col_index_pair_ls = [(vcfFile1.sample_id2index[sample_id], vcfFile2.sample_id2index[sample_id]) \
							for sample_id in sorted(set(vcfFile1.sample_id2index) & set(vcfFile2.sample_id2index))]
		
		locus_id2mismatchRate = {}
		for locus_id in overlapping_sites_set:
			call_row1 = vcfFile1.genotype_call_matrix[vcfFile1.locus_id2row_index[locus_id]]
			call_row2 = vcfFile2.genotype_call_matrix[vcfFile2.locus_id2row_index[locus_id]]
			no_of_mismatches = 0
			no_of_non_NA_pairs = 0
			for col_index1, col_index2 in col_index_pair_ls:
				call1 = call_row1[col_index1]
				call2 = call_row2[col_index2]
				if call1!='NA' and call2!='NA':
					no_of_non_NA_pairs += 1
					if call1!=call2:
						no_of_mismatches += 1
			if no_of_non_NA_pairs>0:
				mismatchRate = no_of_mismatches/float(no_of_non_NA_pairs)
			else:
				#placeholder for "nothing to compare"
				mismatchRate = -1
			locus_id2mismatchRate[locus_id] = mismatchRate
		
		counter = 0
		#the context manager flushes and closes the output; the original never closed it
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			writer.writerow(['#chromosome', 'position', 'mismatchRate'])
			for locus_id in sorted(locus_id2mismatchRate):
				mismatchRate = locus_id2mismatchRate[locus_id]
				if mismatchRate<=self.maxMismatchRate:
					counter += 1
					chromosome, pos = locus_id[:2]
					writer.writerow([chromosome, pos, mismatchRate])
		sys.stderr.write("%s loci passed the maxMismatchRate out of %s overlapped loci.\n"%(counter, len(overlapping_sites_set)))

Example #8

0

Show file

File: hsCalculateStatsForSubPop_0_1.py Project: mjmontague/vervet-web

	def selectSubPop(self,uclaidlist):
		"""
		Build a 0/1/2 genotype matrix for the individuals whose ucla_id is in uclaidlist.
		
		The genotype VCF file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. Only the sample columns whose individual has a
		ucla_id in uclaidlist are extracted.
		
		Returns a hsContigDataStruct holding the kept ucla_ids, per-locus chromosome, position,
		ref and alt base arrays plus "data", a float array with one row per locus and one column
		per kept sample: 0 = both alleles equal the reference base, 1 = exactly one of them
		does, 2 = neither does, -9 = missing call.
		Returns None if the registered file does not exist on disk.
		Exits with status 2 if the database has no such genotype file.
		
		2012.9.19
		"""
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			#do not leave the session open on this path either
			session.close()
			return None
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		ind_id_ls=[]
		columnIndexList = []
		#getSampleIDList() returns the read-group names
		for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
			individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
			uclaid = individualAlignment.ind_sequence.individual.ucla_id
			if uclaid in uclaidlist:
				columnIndexList.append(i)
				ind_id_ls.append(uclaid)
		
		chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
		datalist=[]
		for vcfRecord in vcfFile:
			refBase = vcfRecord.refBase
			chrom_ls.append(vcfRecord.chr)
			snp_pos_ls.append(vcfRecord.pos)
			ref_ls.append(refBase)
			alt_ls.append(vcfRecord.altBase)
			data_row=[]
			for columnIndex in columnIndexList:
				#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
				vcfCall = vcfRecord.data_row[columnIndex+1]
				if not vcfCall:
					#numeric missing-data code: the former 'N' made the float conversion below
					#raise ValueError. -9 is the code selectSubPopNoDB uses.
					gt=-9
				elif vcfCall['GT'][0]==refBase and vcfCall['GT'][1]==refBase:
					gt=0
				elif vcfCall['GT'][0]==refBase or vcfCall['GT'][1]==refBase:
					gt=1
				else:
					gt=2
				data_row.append(gt)
			datalist.append(data_row)
		sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnIndexList)))
		#the builtin float replaces np.float, an alias that newer numpy releases no longer provide
		data=np.array(datalist,dtype=float)
		datastruct=hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),ref_ls=np.array(ref_ls),snp_pos_ls=np.array(snp_pos_ls),alt_ls=np.array(alt_ls), data=data)
		session.close()
		return datastruct

Example #9

0

Show file

File: hsFetchCountrySpeciesGenotypeMatrix.py Project: mjmontague/vervet-web

	def run(self):
		"""
		Dump the genotype calls of one VCF file as a tab-delimited matrix with species and country rows.
		
		The genotype file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. Output (self.outputFname):
			row 1: header 'Chromosome', 'position', 'ref', then one column per sample
			row 2: tax_id of each sample's individual (first three cells are '-')
			row 3: country_id of each individual's site (first three cells are '-')
			then one row per locus: chromosome, position, the 'GT' of the reference entry,
			and the 'GT' of every sample ('NN' where the call is missing)
		
		Exits with status 2 if the database has no such genotype file. Writes nothing if
		the registered file does not exist on disk.
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return
		from pymodule.VCFFile import VCFFile
		
		#allow 0 depth-> no missing data
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		header = ['Chromosome', 'position', 'ref']
		#the three leading '-' pad the columns occupied by chromosome, position and ref
		countryid_row = ['-','-','-']
		speciesid_row = ['-','-','-']
		#all samples are kept at present; the list is the hook for filtering samples
		#(e.g. by tax_id or country_id) later
		columnIndexList = []
		for i, sampleID in enumerate(vcfFile.getSampleIDList()):
			individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
			header.append(sampleID)
			columnIndexList.append(i)
			countryid_row.append(individual.site.country_id)
			speciesid_row.append(individual.tax_id)
		
		counter = 0
		#open the output only after all database lookups succeeded, and let the context
		#manager flush and close it instead of relying on garbage collection
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			writer.writerow(header)
			writer.writerow(speciesid_row)
			writer.writerow(countryid_row)
			for vcfRecord in vcfFile:
				#vcfRecord.data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
				data_row = [vcfRecord.chr, vcfRecord.pos, vcfRecord.data_row[0]['GT']]
				for columnIndex in columnIndexList:
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if vcfCall:
						data_row.append(vcfCall['GT'])
					else:
						data_row.append('NN')
				writer.writerow(data_row)
				counter += 1
		sys.stderr.write("%s loci outputted.\n"%(counter))

Example #10

0

Show file

File: hsFetchGenotypeMatrix012.py Project: mjmontague/vervet-web

	def run(self):
		"""
		Dump the genotype calls of one VCF file as a tab-delimited 0/1/2 matrix.
		
		The genotype file is looked up in the database by self.genotypeMethodID,
		self.chromosome and self.format. Output (self.outputFname): a header
		'Chromosome', 'position', 'ref', 'alt' followed by one column per sample, then one
		row per locus. A genotype is coded 0 if both alleles equal the reference base, 1 if
		exactly one of them does, 2 if neither does and 'N' if the call is missing.
		
		Exits with status 2 if the database has no such genotype file. Writes nothing if
		the registered file does not exist on disk.
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir
		dataDir = self.dataDir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		header = ['Chromosome', 'position', 'ref','alt']
		#all samples are kept at present; the list is the hook for filtering samples
		#(e.g. by tax_id or country_id) later
		columnIndexList = []
		for i, sampleID in enumerate(vcfFile.getSampleIDList()):
			#the lookup result is not used while no sample filter is active; the call is
			#kept so that the interaction with the database is unchanged
			self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
			header.append(sampleID)
			columnIndexList.append(i)
		
		counter = 0
		#open the output only after all database lookups succeeded, and let the context
		#manager flush and close it instead of relying on garbage collection
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			writer.writerow(header)
			for vcfRecord in vcfFile:
				refBase = vcfRecord.refBase
				data_row = [vcfRecord.chr, vcfRecord.pos, refBase, vcfRecord.altBase]
				for columnIndex in columnIndexList:
					#vcfRecord.data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
					#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if not vcfCall:
						gt='N'
					elif vcfCall['GT'][0]==refBase and vcfCall['GT'][1]==refBase:
						gt=0
					elif vcfCall['GT'][0]==refBase or vcfCall['GT'][1]==refBase:
						gt=1
					else:
						gt=2
					data_row.append(gt)
				writer.writerow(data_row)
				counter += 1
		sys.stderr.write("%s loci outputted.\n"%(counter))

Example #11

0

Show file

File: hs10XIndDistanceMatrix.py Project: mjmontague/vervet-web

    def run(self):
        """
        Compute the pairwise genotype distance matrix of all samples with target_coverage 10.

        The genotype file is looked up in the database by self.genotypeMethodID,
        self.chromosome and self.format. Each kept sample's call is coded per locus as
        0 / 1 / 2 (both / one / none of its two alleles equal the 'GT' of the reference
        entry). The distance between two samples is the mean absolute difference of their
        codes over the loci where both have a call, so it lies between 0 and 2; it is NaN
        if the two samples share no called locus. Without missing calls this equals the
        sum of absolute differences divided by the number of loci.

        The square matrix is written to self.outputFname with numpy.savetxt and the
        country ids of the kept samples are printed to stdout.

        Exits with status 2 if the database has no such genotype file. Does nothing if
        the registered file does not exist on disk.

        2012.7.13
        """
        if self.debug:
            import pdb

            pdb.set_trace()
        session = self.db_vervet.session

        session.begin()
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir
        dataDir = self.dataDir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format
        )

        if not genotypeFile:
            sys.stderr.write(
                "Error: genotype_method_id %s, chromosome %s does not exist.\n"
                % (self.genotypeMethodID, self.chromosome)
            )
            sys.exit(2)
        filename = os.path.join(dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            return
        import numpy as np
        from pymodule.VCFFile import VCFFile

        # allow 0 depth-> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        # keep only the samples sequenced at 10X target coverage
        columnIndexList = []
        countryidList = []
        for i, sampleID in enumerate(vcfFile.getSampleIDList()):
            individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
            if individual.target_coverage == 10:
                columnIndexList.append(i)
                countryidList.append(individual.site.country_id)

        genotypeMat = []
        for vcfRecord in vcfFile:
            # vcfRecord.data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
            # it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
            refCall = vcfRecord.data_row[0]
            data_row = []
            for columnIndex in columnIndexList:
                vcfCall = vcfRecord.data_row[columnIndex + 1]
                if not vcfCall:
                    # NaN marks a missing call; the former "NN" string made the numeric
                    # conversion below raise ValueError
                    gt = np.nan
                elif vcfCall["GT"][0] == refCall["GT"] and vcfCall["GT"][1] == refCall["GT"]:
                    gt = 0
                elif vcfCall["GT"][0] == refCall["GT"] or vcfCall["GT"][1] == refCall["GT"]:
                    gt = 1
                else:
                    gt = 2
                data_row.append(gt)
            genotypeMat.append(data_row)
        sys.stderr.write("%s loci outputted.\n" % (len(genotypeMat)))

        # calculate distance Matrix
        # the explicit reshape keeps the array two-dimensional even with zero loci or zero samples
        noOfSamples = len(columnIndexList)
        matArr = np.array(genotypeMat, dtype=float).reshape(len(genotypeMat), noOfSamples)
        distArr = np.empty((noOfSamples, noOfSamples))
        distArr[:] = np.nan

        for i in range(noOfSamples):
            # the distance is symmetric, so compute the upper triangle and mirror it
            for j in range(i, noOfSamples):
                diff = np.abs(matArr[:, i] - matArr[:, j])
                bothCalled = ~np.isnan(diff)
                noOfBothCalled = bothCalled.sum()
                if noOfBothCalled > 0:
                    # normalise so that distance is between 0 and 2
                    distArr[i, j] = distArr[j, i] = diff[bothCalled].sum() / float(noOfBothCalled)
        np.savetxt(self.outputFname, distArr)
        print(countryidList)