def discoverFromVCFWithoutFilter(self, inputFname=None, outputFname=None, **keywords):
    """
    Parse a VCF file and hand its genotype call matrix to self.outputCallMatrix().

    inputFname: path of the VCF file given to VCFFile.
    outputFname: forwarded unchanged to self.outputCallMatrix().
    keywords: accepted but not used in this method.

    2012.9.11 read minDepth from self.minDepth
    2012.9.5 add minDepth=0 to VCFFile
    2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos;
        a conversion is needed in between
    2012.5.8
    """
    parsedVCF = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
    parsedVCF.parseFile()

    # 2012.8.20 VCFFile keys its loci by (chr, pos); outputCallMatrix() is given 'chr_pos' strings
    chrPos2rowIndex = dict(
        ('%s_%s' % (locusKey[0], locusKey[1]), rowIndex)
        for locusKey, rowIndex in parsedVCF.locus_id2row_index.iteritems()
    )

    self.outputCallMatrix(
        parsedVCF.genotype_call_matrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=parsedVCF.sample_id2index,
        locus_id2row_index=chrPos2rowIndex,
        outputDelimiter=self.outputDelimiter,
    )
def run(self):
    """
    Write a tab-delimited per-sample metadata table for one genotype (VCF) file.

    The genotype file is looked up in the vervet database by self.genotypeMethodID,
    self.chromosome and self.format, and resolved relative to self.dataDir
    (falling back to self.db_vervet.data_dir). Six rows are written to
    self.outputFname, each starting with its label followed by one value per
    VCF sample: sampleID, ucla_id, tax_id, country_id, longitude, latitude.

    Exits with status 2 if the database has no matching genotype file.
    Writes a warning to stderr and outputs nothing if the file is not on disk.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing outputted.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    # allow 0 depth -> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    # every row starts with its label; one value per sample is appended below
    sampleIDRow = ['sampleID']
    uclaIDRow = ['ucla_id']
    speciesIDRow = ['tax_id']
    countryIDRow = ['country_id']
    longitudeRow = ['longitude']
    latitudeRow = ['latitude']
    for sampleID in vcfFile.getSampleIDList():
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        sampleIDRow.append(sampleID)
        uclaIDRow.append(individual.ucla_id)
        countryIDRow.append(individual.site.country_id)
        speciesIDRow.append(individual.tax_id)
        longitudeRow.append(individual.longitude)
        latitudeRow.append(individual.latitude)

    # the context manager guarantees the output is flushed and closed, even on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(sampleIDRow)
        writer.writerow(uclaIDRow)
        writer.writerow(speciesIDRow)
        writer.writerow(countryIDRow)
        writer.writerow(longitudeRow)
        writer.writerow(latitudeRow)
def selectSubPopNoDB(self,columnindexlist,ind_id_ls,vcffilename):
    """
    Build a 0/1/2 genotype matrix for selected sample columns of a VCF file,
    without any database access.

    columnindexlist: indices of the samples to extract. Index i refers to
        vcfRecord.data_row[i+1], because data_row[0] is the reference
        individual (from the ref column of the VCF).
    ind_id_ls: individual IDs; stored in the returned structure as given.
    vcffilename: path of the VCF file.

    Returns an hsContigDataStruct holding ind_id_ls, chrom_ls, ref_ls,
    snp_pos_ls, alt_ls (one entry per locus) and data (one row per locus, one
    column per selected sample), or None if vcffilename is not an existing file.

    Genotype coding, based on the first two characters of the 'GT' call:
    0 = both equal the reference base, 1 = exactly one equals it,
    2 = neither equals it, -9 = missing call.

    2012.9.19
    """
    if not os.path.isfile(vcffilename):
        sys.stderr.write("Warning: VCF file %s does not exist.\n" % vcffilename)
        return None

    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=vcffilename, minDepth=0)
    # result is not used below; call kept from the original in case it has
    # side effects on vcfFile -- TODO confirm and drop
    vcfFile.getSampleIDList()

    chrom_ls = []
    ref_ls = []
    snp_pos_ls = []
    alt_ls = []
    datalist = []
    for vcfRecord in vcfFile:
        refBase = vcfRecord.refBase
        chrom_ls.append(vcfRecord.chr)
        snp_pos_ls.append(vcfRecord.pos)
        ref_ls.append(refBase)
        alt_ls.append(vcfRecord.altBase)

        data_row = []
        for columnIndex in columnindexlist:
            # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
            # +1 skips the reference individual stored at index 0.
            vcfCall = vcfRecord.data_row[columnIndex+1]
            if not vcfCall:
                data_row.append(-9)
                continue
            genotype = vcfCall['GT']
            if genotype[0]==refBase and genotype[1]==refBase:
                gt = 0
            elif genotype[0]==refBase or genotype[1]==refBase:
                gt = 1
            else:
                gt = 2
            data_row.append(gt)
        datalist.append(data_row)

    sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnindexlist)))
    # builtin float instead of the np.float alias, which newer NumPy no longer provides
    data = np.array(datalist, dtype=float)
    datastruct = hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),
        ref_ls=np.array(ref_ls), snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls), data=data)
    return datastruct
def getVCFInd(self,uclaidlist):
    """
    Find the VCF sample columns whose individuals have a ucla_id in uclaidlist.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. Each sample ID
    (read-group name) of that VCF file is resolved to an individual through
    self.db_vervet.parseAlignmentReadGroup().

    uclaidlist: container of ucla_id values to keep (tested with `in`).

    Returns a tuple (columnIndexList, ind_id_ls): the 0-based positions of the
    matching samples in the VCF sample list and their ucla_id values, in VCF
    order. Both lists are empty if the genotype file is not on disk.
    Exits with status 2 if the database has no matching genotype file.

    2012.9.19
    """
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    # created up front so the return value is defined even when the file is missing
    columnIndexList = []
    ind_id_ls = []

    filename = os.path.join(dataDir, genotypeFile.path)
    if os.path.isfile(filename):
        from pymodule.VCFFile import VCFFile
        vcfFile = VCFFile(inputFname=filename, minDepth=0)
        # getSampleIDList() is a list with the read-group names
        for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.ind_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                ind_id_ls.append(uclaid)
    else:
        sys.stderr.write("Warning: genotype file %s does not exist.\n" % filename)

    session.close()
    return (columnIndexList,ind_id_ls)
def getLocusAndData(self, inputFname, VCFOutputType=2):
    """
    Collect position and allele frequency of every locus in a VCF file.

    inputFname: path of the VCF file; must end in 'vcf' or 'vcf.gz'.
    VCFOutputType: accepted for interface compatibility; not used here.

    Returns None if inputFname does not have a VCF extension. Otherwise returns
    a PassingData with
        contig_id: the digits following 'Contig' in inputFname, or, if that
            pattern is absent, the base file name without its last extension;
        locus_ls, xData_ls, yData_ls: per-locus locus_id, integer position and
            allele frequency (INFO tag 'AF', falling back to 'AF1').
    Loci whose frequency value is absent or otherwise falsy are skipped.

    2011-9-21
    """
    if not inputFname.endswith(('vcf.gz', 'vcf')):
        return None
    sys.stderr.write("%s ..."%inputFname)

    contig_id_pattern_sr = re.search(r'Contig(\d+).*', inputFname)
    if contig_id_pattern_sr:
        contig_id = contig_id_pattern_sr.group(1)
    else:
        contig_id = os.path.splitext(os.path.split(inputFname)[1])[0]

    # open the file that was validated above (the argument), not self.inputFname
    vcfFile = VCFFile(inputFname=inputFname)

    locus_ls = []
    xData_ls = []
    yData_ls = []
    for vcfRecord in vcfFile.parseIter():
        pos = int(vcfRecord.pos)
        AF1 = vcfRecord.info_tag2value.get("AF", vcfRecord.info_tag2value.get("AF1", None))
        if AF1:
            locus_ls.append(vcfRecord.locus_id)
            xData_ls.append(pos)
            yData_ls.append(float(AF1))

    sys.stderr.write("%s loci. Done.\n"%(len(yData_ls)))
    return PassingData(contig_id=contig_id, locus_ls=locus_ls, yData_ls=yData_ls, xData_ls=xData_ls)
def createMetadataMat(self):
    """
    Collect per-sample metadata for one genotype (VCF) file into self.metadata.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. On success
    self.metadata is set to a list of five rows, each starting with its label
    followed by one value per VCF sample, in this order:
    ucla_id, country_id, tax_id, longitude, latitude.

    Exits with status 2 if the database has no matching genotype file.
    If the file is not on disk, a warning is written to stderr and
    self.metadata is left untouched. The database session is closed in both
    of the latter two non-exit paths.
    """
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if os.path.isfile(filename):
        from pymodule.VCFFile import VCFFile
        # allow 0 depth -> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        uclaIDList = ['ucla_id']
        countryid_row = ['country_id']
        speciesid_row = ['tax_id']
        longitudeList = ['longitude']
        latitudeList = ['latitude']
        for sampleID in vcfFile.getSampleIDList():
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
            individual = individualAlignment.ind_sequence.individual
            uclaIDList.append(individual.ucla_id)
            countryid_row.append(individual.site.country_id)
            speciesid_row.append(individual.tax_id)
            longitudeList.append(individual.longitude)
            latitudeList.append(individual.latitude)
        self.metadata = [uclaIDList, countryid_row, speciesid_row, longitudeList, latitudeList]
    else:
        sys.stderr.write("Warning: genotype file %s does not exist. Metadata not updated.\n" % filename)

    session.close()
def run(self):
    """
    Compare the genotype calls of two VCF files locus by locus.

    self.inputFname and self.jnputFname are both parsed with VCFFile. For every
    locus present in both files, calls of the samples present in both files are
    compared; a pair counts only if neither call is 'NA'. The mismatch rate of
    a locus is mismatches / non-NA pairs, or -1 when there is no non-NA pair.

    Loci whose rate is <= self.maxMismatchRate are written to self.outputFname
    as tab-delimited rows (chromosome, position, mismatchRate), sorted by locus
    id, below a '#chromosome position mismatchRate' header. A warning goes to
    stderr when the two files have a different number of samples.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    vcfFile1 = VCFFile(inputFname=self.inputFname)
    vcfFile1.parseFile()
    vcfFile2 = VCFFile(inputFname=self.jnputFname)
    vcfFile2.parseFile()

    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
    overlapping_sample_id_list = sorted(
        set(vcfFile1.sample_id2index.keys()) & set(vcfFile2.sample_id2index.keys()))

    no_of_samples = len(vcfFile1.sample_id2index)
    no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
    if no_of_samples != no_of_samples_in_vcf2:
        sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
            (self.inputFname, no_of_samples, self.jnputFname, no_of_samples_in_vcf2))

    counter = 0
    # the context manager guarantees the output is flushed and closed
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(['#chromosome', 'position', 'mismatchRate'])

        for locus_id in sorted(overlapping_sites_set):
            calls1 = vcfFile1.genotype_call_matrix[vcfFile1.locus_id2row_index[locus_id]]
            calls2 = vcfFile2.genotype_call_matrix[vcfFile2.locus_id2row_index[locus_id]]

            no_of_mismatches = 0
            no_of_non_NA_pairs = 0
            for sample_id in overlapping_sample_id_list:
                call1 = calls1[vcfFile1.sample_id2index[sample_id]]
                call2 = calls2[vcfFile2.sample_id2index[sample_id]]
                if call1 != 'NA' and call2 != 'NA':
                    no_of_non_NA_pairs += 1
                    if call1 != call2:
                        no_of_mismatches += 1

            if no_of_non_NA_pairs > 0:
                mismatchRate = no_of_mismatches/float(no_of_non_NA_pairs)
            else:
                # NOTE(review): -1 is <= any non-negative maxMismatchRate, so loci
                # without a single comparable pair are outputted -- confirm intended
                mismatchRate = -1

            if mismatchRate <= self.maxMismatchRate:
                counter += 1
                chromosome, pos = locus_id[:2]
                writer.writerow([chromosome, pos, mismatchRate])

    sys.stderr.write("%s loci passed the maxMismatchRate out of %s overlapped loci.\n"%(counter, len(overlapping_sites_set)))
def selectSubPop(self,uclaidlist):
    """
    Build a 0/1/2 genotype matrix for the sub-population whose ucla_id is in
    uclaidlist.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. Each VCF sample
    (read-group name) is resolved to an individual through
    self.db_vervet.parseAlignmentReadGroup(); samples whose ucla_id is in
    uclaidlist are kept.

    uclaidlist: container of ucla_id values to keep (tested with `in`).

    Returns an hsContigDataStruct holding ind_id_ls (ucla_ids of the kept
    samples), chrom_ls, ref_ls, snp_pos_ls, alt_ls (one entry per locus) and
    data (one row per locus, one column per kept sample), or None if the
    genotype file is not on disk. Exits with status 2 if the database has no
    matching genotype file.

    Genotype coding, based on the first two characters of the 'GT' call:
    0 = both equal the reference base, 1 = exactly one equals it,
    2 = neither equals it, -9 = missing call.

    2012.9.19
    """
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist.\n" % filename)
        session.close()
        return None

    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    # pick the sample columns that belong to the requested individuals
    columnIndexList = []
    ind_id_ls = []
    # getSampleIDList() is a list with the read-group names
    for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
        uclaid = individualAlignment.ind_sequence.individual.ucla_id
        if uclaid in uclaidlist:
            columnIndexList.append(i)
            ind_id_ls.append(uclaid)

    chrom_ls = []
    ref_ls = []
    snp_pos_ls = []
    alt_ls = []
    datalist = []
    for vcfRecord in vcfFile:
        refBase = vcfRecord.refBase
        chrom_ls.append(vcfRecord.chr)
        snp_pos_ls.append(vcfRecord.pos)
        ref_ls.append(refBase)
        alt_ls.append(vcfRecord.altBase)

        data_row = []
        for columnIndex in columnIndexList:
            # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
            # +1 skips the reference individual stored at index 0.
            vcfCall = vcfRecord.data_row[columnIndex+1]
            if not vcfCall:
                # numeric missing-data code: the previous 'N' string made the
                # float conversion below fail; -9 matches selectSubPopNoDB
                data_row.append(-9)
                continue
            genotype = vcfCall['GT']
            if genotype[0]==refBase and genotype[1]==refBase:
                gt = 0
            elif genotype[0]==refBase or genotype[1]==refBase:
                gt = 1
            else:
                gt = 2
            data_row.append(gt)
        datalist.append(data_row)

    sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnIndexList)))
    # builtin float instead of the np.float alias, which newer NumPy no longer provides
    data = np.array(datalist, dtype=float)
    datastruct = hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),
        ref_ls=np.array(ref_ls), snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls), data=data)
    session.close()
    return datastruct
def run(self):
    """
    Write the raw genotype calls of one genotype (VCF) file as a tab-delimited table.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. Output
    (self.outputFname) layout:
        row 1: 'Chromosome', 'position', 'ref', then one sample ID per sample
        row 2: '-', '-', '-', then the tax_id of each sample's individual
        row 3: '-', '-', '-', then the country_id of each individual's site
        then one row per locus: chr, pos, the 'GT' of the reference
        individual (data_row[0]), and each sample's 'GT' ('NN' if missing).

    Exits with status 2 if the database has no matching genotype file.
    Writes a warning to stderr and outputs nothing if the file is not on disk.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing outputted.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    # allow 0 depth -> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    header = ['Chromosome', 'position', 'ref']
    speciesid_row = ['-', '-', '-']
    countryid_row = ['-', '-', '-']
    columnIndexList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        # every sample is kept; a per-sample filter (e.g. on tax_id/country_id) would go here
        header.append(sampleID)
        columnIndexList.append(i)
        countryid_row.append(individual.site.country_id)
        speciesid_row.append(individual.tax_id)

    counter = 0
    # the context manager guarantees the output is flushed and closed, even on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)
        writer.writerow(speciesid_row)
        writer.writerow(countryid_row)

        for vcfRecord in vcfFile:
            # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
            # it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
            refCall = vcfRecord.data_row[0]
            data_row = [vcfRecord.chr, vcfRecord.pos, refCall['GT']]
            for columnIndex in columnIndexList:
                vcfCall = vcfRecord.data_row[columnIndex+1]
                if vcfCall:
                    data_row.append(vcfCall['GT'])
                else:
                    data_row.append('NN')
            writer.writerow(data_row)
            counter += 1

    sys.stderr.write("%s loci outputted.\n"%(counter))
def run(self):
    """
    Write the genotypes of one genotype (VCF) file as a tab-delimited 0/1/2 table.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. Output
    (self.outputFname) layout:
        header: 'Chromosome', 'position', 'ref', 'alt', then one sample ID per sample
        then one row per locus: chr, pos, refBase, altBase and one code per sample.

    Genotype coding, based on the first two characters of the 'GT' call:
    0 = both equal the reference base, 1 = exactly one equals it,
    2 = neither equals it, 'N' = missing call.

    Exits with status 2 if the database has no matching genotype file.
    Writes a warning to stderr and outputs nothing if the file is not on disk.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing outputted.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    header = ['Chromosome', 'position', 'ref', 'alt']
    columnIndexList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        # kept from the original: resolves every sample in the database, so an
        # unknown read group still fails here before anything is written
        self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        # every sample is kept; a per-sample filter (e.g. on tax_id/country_id) would go here
        header.append(sampleID)
        columnIndexList.append(i)

    counter = 0
    # the context manager guarantees the output is flushed and closed, even on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)

        for vcfRecord in vcfFile:
            refBase = vcfRecord.refBase
            data_row = [vcfRecord.chr, vcfRecord.pos, refBase, vcfRecord.altBase]
            for columnIndex in columnIndexList:
                # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
                # it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
                vcfCall = vcfRecord.data_row[columnIndex+1]
                if not vcfCall:
                    data_row.append('N')
                    continue
                genotype = vcfCall['GT']
                if genotype[0]==refBase and genotype[1]==refBase:
                    gt = 0
                elif genotype[0]==refBase or genotype[1]==refBase:
                    gt = 1
                else:
                    gt = 2
                data_row.append(gt)
            writer.writerow(data_row)
            counter += 1

    sys.stderr.write("%s loci outputted.\n"%(counter))
def run(self):
    """
    Write a pairwise genotype distance matrix between samples of one genotype (VCF) file.

    The genotype file is looked up in the vervet database by
    self.genotypeMethodID, self.chromosome and self.format. Only samples whose
    individual has target_coverage == 10 are used. Each call is coded from the
    first two characters of its 'GT' against the 'GT' of the reference
    individual (data_row[0]): 0 = both equal it, 1 = exactly one equals it,
    2 = neither equals it; a missing call becomes NaN.

    The distance between two samples is the mean absolute difference of their
    codes over the loci where both have a call, so it lies between 0 and 2.
    Without missing calls this equals sum(|a - b|) / number of loci. The matrix
    is written to self.outputFname with numpy.savetxt, and the country_id of
    each kept sample (same order as the matrix rows) is printed to stdout.

    Exits with status 2 if the database has no matching genotype file.
    Writes a warning to stderr and outputs nothing if the file is not on disk.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir
    dataDir = self.dataDir

    genotypeFile = self.db_vervet.getGenotypeFile(
        genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format
    )
    if not genotypeFile:
        sys.stderr.write(
            "Error: genotype_method_id %s, chromosome %s does not exist.\n"
            % (self.genotypeMethodID, self.chromosome)
        )
        sys.exit(2)

    filename = os.path.join(dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing outputted.\n" % filename)
        return

    import numpy as np
    from pymodule.VCFFile import VCFFile

    # allow 0 depth-> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    columnIndexList = []
    countryidList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        if individual.target_coverage == 10:
            columnIndexList.append(i)
            countryidList.append(individual.site.country_id)

    genotypeMat = []
    for vcfRecord in vcfFile:
        # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
        # it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
        refGT = vcfRecord.data_row[0]["GT"]
        data_row = []
        for columnIndex in columnIndexList:
            vcfCall = vcfRecord.data_row[columnIndex + 1]
            if not vcfCall:
                # NaN instead of the string "NN", which could not be cast to a number
                data_row.append(np.nan)
                continue
            genotype = vcfCall["GT"]
            if genotype[0] == refGT and genotype[1] == refGT:
                gt = 0
            elif genotype[0] == refGT or genotype[1] == refGT:
                gt = 1
            else:
                gt = 2
            data_row.append(gt)
        genotypeMat.append(data_row)
    sys.stderr.write("%s loci outputted.\n" % (len(genotypeMat)))

    # calculate distance Matrix
    noOfSamples = len(columnIndexList)
    # explicit shape keeps the array 2-D even when there are no loci
    matArr = np.array(genotypeMat, dtype=float).reshape(len(genotypeMat), noOfSamples)
    distArr = np.empty((noOfSamples, noOfSamples))
    distArr[:] = np.nan
    for i in range(noOfSamples):
        # sample i against all samples at once; nanmean skips loci where
        # either call is missing and normalises so that distance is between 0 and 2
        distArr[i] = np.nanmean(np.abs(matArr - matArr[:, i:i + 1]), axis=0)

    # savetxt opens and closes the output file itself
    np.savetxt(self.outputFname, distArr)
    print(countryidList)