def addLocusFromVCF2DB(self, db_vervet, inputFname=None, ref_ind_seq_id=None, locus_type_id=None, minDepth=0):
    """
    2012-5.2
        Given a VCF file, find all the loci and submit them into db.

        For each record yielded by VCFFile.parseIter(), the reference base
        (first character of the "GT" value of data_row[0]) and the record's
        altBase are each passed through self.getSequenceDBEntry(), and the
        resulting entries are handed to db_vervet.getLocus() together with
        chromosome, position (start == stop), ref_ind_seq_id and
        locus_type_id.

        Progress (number of records processed) is written to stderr every
        500 records, overwriting the previous number with backspaces.

    Arguments:
        db_vervet: database handle providing getLocus().
        inputFname: path of the VCF file to read.
        ref_ind_seq_id: passed through to db_vervet.getLocus().
        locus_type_id: passed through to db_vervet.getLocus().
        minDepth: passed through to the VCFFile constructor.
    """
    sys.stderr.write("Adding loci from %s into db ... "%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    counter = 0
    previous_reported_counter = ''
    try:
        for vcfRecord in vcfFile.parseIter():
            chromosome = vcfRecord.chr
            pos = int(vcfRecord.pos)
            refBase = vcfRecord.data_row[0].get("GT")[0]
            refBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=refBase, comment=None)
            altBase = vcfRecord.altBase
            altBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=altBase, comment=None)
            # called for its effect on the db; the returned locus is not needed here
            db_vervet.getLocus(chr=chromosome, start=pos, stop=pos,
                ref_seq=refBaseDBEntry, alt_seq=altBaseDBEntry,
                ref_ind_seq_id=ref_ind_seq_id,
                locus_type_id=locus_type_id)
            counter += 1
            if counter%500==0:
                # erase the previously printed number before printing the new one
                sys.stderr.write("%s%s"%('\x08'*len(previous_reported_counter), counter))
                previous_reported_counter = repr(counter)
    finally:
        vcfFile.close()
    # final count: erase the last progress number with backspaces
    # (the original formatted the length itself as a number here)
    sys.stderr.write("%s%s"%('\x08'*len(previous_reported_counter), counter))
    sys.stderr.write(" Done.\n")
def countHomoHetCallsForEachSampleFromVCF(self, inputFname, outputFname, chromosome=None, chrLength=None, minDepth=1):
    """
    2011-11-2
        Given a VCF file, count the number of homozygous-ref, homozygous-alt
        and heterozygous calls for each sample and write one tab-delimited
        row per sample to outputFname.

        The reference base of a record is the first character of the "GT"
        value of data_row[0]. The sample named 'ref' is ignored. A sample
        whose data is empty, or whose "GT" is empty or 'NA', contributes
        nothing for that record.

    Arguments:
        inputFname: path of the VCF file to read.
        outputFname: path of the tab-delimited output file.
        chromosome: written verbatim into the 'chromosome' column.
        chrLength: written verbatim into the 'length' column.
        minDepth: passed through to the VCFFile constructor.

    Output columns:
        #sampleID, chromosome, length, NoOfTotal, NoOfHomoRef, NoOfHomoAlt,
        FractionOfHomoAlt, NoOfHet, FractionOfHet.
        Both fractions are -1 when a sample has no calls at all.
    """
    import csv
    sys.stderr.write("Count the number of homozygous-ref/alt & het from %s .\n"%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    # key is sampleID, value is a list of 3 numbers: NoOfHomoRef, NoOfHomoAlt, NoOfHet
    sampleID2data = {}
    try:
        for vcfRecord in vcfFile.parseIter():
            refBase = vcfRecord.data_row[0].get("GT")[0]
            for sample_id, sample_index in vcfFile.sample_id2index.items():
                if sample_id=='ref':    # ignore the reference
                    continue
                if sample_id not in sampleID2data:
                    sampleID2data[sample_id] = [0, 0, 0]
                callData = vcfRecord.data_row[sample_index]
                if not callData:    # None for this sample
                    continue
                callForThisSample = callData.get('GT')
                if not callForThisSample or callForThisSample=='NA':
                    continue
                firstAllele = callForThisSample[0]
                secondAllele = callForThisSample[1]
                if firstAllele!=secondAllele:    # heterozygous
                    sampleID2data[sample_id][2] += 1
                elif firstAllele==refBase:    # homozygous reference allele
                    sampleID2data[sample_id][0] += 1
                else:    # homozygous alternative allele
                    sampleID2data[sample_id][1] += 1
    finally:
        vcfFile.close()

    # the with-block guarantees the output is flushed and closed
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['#sampleID', 'chromosome', 'length', "NoOfTotal", 'NoOfHomoRef', 'NoOfHomoAlt',
            "FractionOfHomoAlt", 'NoOfHet', "FractionOfHet"])
        for sampleID in sorted(sampleID2data):
            count_data = sampleID2data[sampleID]
            noOfHomoRef, noOfHomoAlt, noOfHet = count_data[:3]
            # float so that the divisions below are true divisions
            no_of_calls = float(sum(count_data))
            if no_of_calls>0:
                fractionOfHomoAlt = noOfHomoAlt/no_of_calls
                fractionOfHet = noOfHet/no_of_calls
            else:
                # -1 marks "undefined" for a sample without any call
                fractionOfHomoAlt = -1
                fractionOfHet = -1
            writer.writerow([sampleID, chromosome, chrLength, int(no_of_calls), noOfHomoRef, noOfHomoAlt,
                fractionOfHomoAlt, noOfHet, fractionOfHet])
    sys.stderr.write("Done.\n")
def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,
        maxContigNumber=None):
    """
    2012.5.10
        Copy a VCF file to outputFname, replacing every sample ID in the
        header with the individual_code parsed from it by
        VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB().

        While copying:
            - records whose refBase or altBase is not exactly one base long
                (indels) are skipped unless includeIndels is true;
            - if maxContigNumber is set, records whose contig number
                (extracted by self.contig_number_pattern) exceeds it are skipped;
            - genotypes that are missing or whose 'DP' is below minDepth are
                set to './.' (=missing).

    Arguments:
        inputFname: path of the VCF file to read.
        outputFname: path of the VCF file to write.
        minDepth: depth threshold; also passed to the input VCFFile.
        includeIndels: keep indel records when true.
        maxContigNumber: upper bound on the contig number, or None/0 for no bound.
    """
    # the original message said "from VCF to EigenStrat", a copy-paste leftover
    sys.stderr.write("Converting alignment read-group sample IDs in %s into UCLA IDs ...\n"%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    # replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
    newSampleIDHeader = []
    for sampleID in vcfFile.sampleIDHeader:
        readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
        newSampleIDHeader.append(readGroupData.individual_code)
    newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

    counter = 0
    real_counter = 0
    outVCFFile = VCFFile(outputFname=outputFname)
    try:
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()
        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            isIndel = len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1
            if isIndel and not includeIndels:
                continue
            contigName = vcfRecord.chr
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(contigName).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):    # [0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP', 0)<minDepth:
                    # data_row is offset by one (the ref entry) relative to the sample columns
                    sampleColumnIndex = i + vcfFile.sampleStartingColumn - 1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # close input and output even if a record fails to parse or write
        vcfFile.close()
        outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci.\n"%(real_counter, counter))
def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False, maxContigNumber=1000):
    """
    2012.5.10
        Split one VCF file into one VCF file per chromosome, written to
        outputDir as <contigID>.vcf.

        Two things in Nam's VCF file are to be modified.
            1. extract VRC UCLAID from its sample ID (via self.UCLAID_Pattern)
            2. replace vervet1_scaffolds_Contig137 with simply "Contig137"
                (via self.contig_id_pattern)

        While splitting:
            - records whose refBase or altBase is not exactly one base long
                (indels) are skipped unless includeIndels is true;
            - if maxContigNumber is set, records whose contig number
                (extracted by self.contig_number_pattern) exceeds it are skipped;
            - genotypes that are missing or whose 'DP' is below minDepth are
                set to './.' (=missing).

    Arguments:
        inputFname: path of the VCF file to read.
        outputDir: directory in which the per-chromosome files are created.
        minDepth: depth threshold; also passed to the input VCFFile.
        includeIndels: keep indel records when true.
        maxContigNumber: upper bound on the contig number, or None/0 for no bound.
    """
    # the original message said "from VCF to EigenStrat", a copy-paste leftover
    sys.stderr.write("Splitting %s into single-chromosome VCF files in %s ...\n"%(inputFname, outputDir))
    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    # replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
    newSampleIDHeader = []
    for sampleID in vcfFile.sampleIDHeader:
        search_result = self.UCLAID_Pattern.search(sampleID)
        newSampleIDHeader.append(search_result.group('UCLAID'))
    # new header for every output contig
    newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

    chr2outVCFFile = {}
    counter = 0
    real_counter = 0
    try:
        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            isIndel = len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1
            if isIndel and not includeIndels:
                continue
            contigID = self.contig_id_pattern.search(vcfRecord.chr).group('contigID')
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(contigID).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            vcfRecord.chr = contigID
            outVCFFile = chr2outVCFFile.get(contigID)
            if outVCFFile is None:
                # first record of this chromosome: open its file and write the header
                outputFname = os.path.join(outputDir, '%s.vcf'%(contigID))
                outVCFFile = VCFFile(outputFname=outputFname)
                # register before writing so that it gets closed even if writing fails
                chr2outVCFFile[contigID] = outVCFFile
                outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                outVCFFile.header = newHeader
                outVCFFile.writeMetaAndHeader()
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):    # [0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP', 0)<minDepth:
                    # data_row is offset by one (the ref entry) relative to the sample columns
                    sampleColumnIndex = i + vcfFile.sampleStartingColumn - 1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # close the input and all output files, even on failure
        vcfFile.close()
        for outVCFFile in chr2outVCFFile.values():
            outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n"%(real_counter, counter, len(chr2outVCFFile)))
def replicateVCFGenotypeColumns(self, inputFname, outputFname=None, replicateIndividualTag=None, sampleID2FamilyCount=None,
        minDepth=0):
    """
    2012.10.5 remove argument sampleStartingColumn
    2012.5.10 VCFFile has been changed considerably and can act as a writer now.
    2012.3.29
        Copy a VCF file to outputFname, duplicating the genotype column of
        some samples.

        Every sample column is renamed to <sample_id><replicateIndividualTag>1.
        For each sample found in sampleID2FamilyCount with count N, N-1
        additional columns named <sample_id><replicateIndividualTag>2 ... N
        are appended after the existing columns, each carrying a copy of that
        sample's genotype. Sample IDs in sampleID2FamilyCount that are not in
        the input header are ignored.

        Genotype cells that are exactly './.' are expanded to one value per
        field of the record's format column ('./.' for GT, '.,.,.' for PL,
        '.' for anything else), which TrioCaller requires.

    Arguments:
        inputFname: path of the VCF file to read.
        outputFname: path of the VCF file to write.
        replicateIndividualTag: separator placed between sample ID and copy number.
        sampleID2FamilyCount: dict mapping sample ID to its number of copies;
            None is treated as an empty dict (no extra copies).
        minDepth: passed through to the input VCFFile constructor.
    """
    sys.stderr.write("Replicating some genotype columns in %s ...\n"%(inputFname))
    if sampleID2FamilyCount is None:
        # the default would otherwise crash on .items() below
        sampleID2FamilyCount = {}
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    outVCFFile = VCFFile(outputFname=outputFname)
    no_of_samples = 0
    no_of_snps = 0
    try:
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs

        # modify the sample-id header line
        sampleID2DataIndexLs = {}
        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        newHeader = oldHeader[:vcfFile.sampleStartingColumn]    # anything before the samples are same
        for i in range(vcfFile.sampleStartingColumn, oldHeaderLength):
            sample_id = oldHeader[i].strip()
            newHeader.append('%s%s%s'%(sample_id, replicateIndividualTag, 1))    # 1 because it's the 1st copy
            no_of_samples += 1
            sampleID2DataIndexLs[sample_id] = [i]    # 1st copy for this sample

        # add additional column headers based on each one's occurrence
        extraColIndex2sampleID = {}
        for sample_id, familyCount in sampleID2FamilyCount.items():
            if sample_id not in sampleID2DataIndexLs:
                continue
            dataIndexLs = sampleID2DataIndexLs[sample_id]
            for _ in range(1, familyCount):    # nothing to add unless familyCount>1
                no_of_samples += 1
                extraColIndex = len(newHeader)
                extraColIndex2sampleID[extraColIndex] = sample_id
                dataIndexLs.append(extraColIndex)
                replicate_order = len(dataIndexLs)
                newHeader.append("%s%s%s"%(sample_id, replicateIndividualTag, replicate_order))
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        newHeaderLength = len(newHeader)
        # NA value per format field; any field not listed here becomes '.'
        formatField2NAValue = {'GT': './.', 'PL': '.,.,.'}    # PL is for TrioCaller
        for vcfRecord in vcfFile.parseIter():
            data_row = vcfRecord.row
            # 2013.09.13 replace all "./." with full NA formating i.e. "./.:.:.:.",
            # pending fields in the "format" column
            expandedNAGenotype = None    # built lazily, at most once per record
            for i in range(vcfRecord.sampleStartingColumn, len(data_row)):
                if data_row[i]=='./.':
                    if expandedNAGenotype is None:
                        expandedNAGenotype = ':'.join([formatField2NAValue.get(format_field, '.')
                            for format_field in vcfRecord.format_column_ls])
                    data_row[i] = expandedNAGenotype
            # add more genotype copies for those extra columns
            for i in range(oldHeaderLength, newHeaderLength):
                sample_id = extraColIndex2sampleID.get(i)
                sourceIndex = sampleID2DataIndexLs.get(sample_id)[0]
                data_row.append(data_row[sourceIndex])
            outVCFFile.writer.writerow(data_row)
            no_of_snps += 1
    finally:
        # close output and input even if a record fails to parse or write
        outVCFFile.close()
        vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))