def run(self):
    """
    Compare two VCF files: overlapping sites and, optionally, per-sample mismatch.

    Opens self.inputFname and self.jnputFname as VCFFile objects (both with
    self.minDepth) and passes them to self.calculateOverlappingSites() together
    with self.outputFname, self.overlappingSitesOutputFname, self.chromosome and
    self.chrLength. If self.perSampleConcordanceOutputFname is set,
    self.calculatePerSampleMismatchFraction() is run as well, restricted to the
    overlapping_sample_id_set returned by the first call.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # NOTE(review): "jnputFname" is presumably the option name of the 2nd input
    # file (not a typo of inputFname) -- confirm against the option definitions.
    vcfFile1 = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth)
    vcfFile2 = VCFFile(inputFname=self.jnputFname, minDepth=self.minDepth)

    pdata = self.calculateOverlappingSites(vcfFile1=vcfFile1, vcfFile2=vcfFile2,
        outputFname=self.outputFname,
        overlappingSitesOutputFname=self.overlappingSitesOutputFname,
        chromosome=self.chromosome, chrLength=self.chrLength)

    if self.perSampleConcordanceOutputFname:
        self.calculatePerSampleMismatchFraction(vcfFile1=vcfFile1, vcfFile2=vcfFile2,
            outputFname=self.perSampleConcordanceOutputFname,
            overlapping_sample_id_set=pdata.overlapping_sample_id_set)
def run(self):
    """
    Copy self.inputFname to self.outputFname, adding the known INFO description lines.

    The output header consists of the input's meta-info lines plus every
    description in self.knownInfoTag2DescriptionLine that is not already
    present verbatim. All records are written out unchanged.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    self.reader = VCFFile(inputFname=self.inputFname)
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')

    # Work on a copy: appending to the reader's own list would silently
    # modify the reader's header as a side effect.
    metaInfoLs = list(self.reader.metaInfoLs)
    for description in self.knownInfoTag2DescriptionLine.values():
        # skip lines that are already in the header to avoid exact duplicates
        if description not in metaInfoLs:
            metaInfoLs.append(description)
    self.writer.metaInfoLs = metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    for vcfRecord in self.reader:
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()
def discoverFromVCFWithoutFilter(self, inputFname=None, outputFname=None, **keywords):
    """
    Parse a VCF file and dump its genotype call matrix via self.outputCallMatrix().

    The VCF is opened with minDepth=self.minDepth and fully parsed. The
    locus index of VCFFile is keyed by tuples whose first two elements are
    joined here into "<first>_<second>" string keys before being handed on.

    2012.9.11 read minDepth from self.minDepth
    2012.9.5 add minDepth=0 to VCFFile
    2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos;
        need a conversion in between
    2012.5.8
    """
    parsedVCF = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
    parsedVCF.parseFile()

    sampleColumnIndex = parsedVCF.sample_id2index
    # re-key the locus index: tuple -> "chr_pos" string
    chrPosString2RowIndex = {
        '%s_%s' % (locusTuple[0], locusTuple[1]): rowIndex
        for locusTuple, rowIndex in parsedVCF.locus_id2row_index.items()
    }

    self.outputCallMatrix(parsedVCF.genotype_call_matrix, refFastaFname=None,
        outputFname=outputFname, refNameSet=None,
        read_group2col_index=sampleColumnIndex,
        locus_id2row_index=chrPosString2RowIndex,
        outputDelimiter=self.outputDelimiter)
def run(self):
    """
    Rewrite self.inputFname with coordinates taken from self.coordinateMapFname.

    For each input record, the (chromosome, position) pair is looked up in
    the map returned by self.readInCoordinateMap(). Records without a new
    coordinate, or with more than one, are dropped. For the rest the
    chromosome and position are replaced, and the ref/alt alleles are set
    from the map's old ref/alt bases (reverse-complemented when the strand
    is '-'). A summary is written to stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
        self.coordinateMapFname)

    self.reader = VCFFile(inputFname=self.inputFname)
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0
    for vcfRecord in self.reader:  #assuming input VCF is sorted
        counter += 1
        key = (vcfRecord.chromosome, vcfRecord.position)
        newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
        if newCoordinateDataLs is None:
            continue
        if len(newCoordinateDataLs) > 1:
            # ambiguous mapping: discard rather than guess
            noOfRecordsWithMultiNewCoords += 1
            continue

        newCoordinateData = newCoordinateDataLs[0]
        vcfRecord.setChromosome(newCoordinateData.newChromosome)
        vcfRecord.setPosition(newCoordinateData.newStart)
        if newCoordinateData.strand == '-':
            # NOTE(review): these are Seq objects while the '+' branch passes
            # the raw map values -- confirm the setters accept both.
            newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
            newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
        else:
            newRefBase = newCoordinateData.oldRefBase
            newAltBase = newCoordinateData.oldAltBase
        vcfRecord.setRefAllele(newRefBase)
        vcfRecord.setAltAllele(newAltBase)
        real_counter += 1
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()

    # an empty input VCF must not crash the summary with a ZeroDivisionError
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))
def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
        outputFormatType=1, alleleLength=1):
    """
    Write the reference sequence flanking each locus of a VCF to a fasta/fastq file.

    inputFname: VCF file with the loci.
    refFastaFname: reference fasta from which the flanks are fetched.
    outputFname: output file.
    flankingLength: number of bases requested on each side of the locus.
    outputFormatType: 1 writes fasta; any other value writes fastq with a
        constant quality character 'H'.
    alleleLength: if true-ish, loci whose ref or alt allele length differs
        from it are skipped.

    Each title is "<chr>_<pos>_<stop>_<ref>_<alt>_positionInFlank<N>", N being
    the 1-based offset of the locus start within the extracted sequence.

    2013.09.03 added argument alleleLength
    2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
    2012.10.8
    """
    sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
        (inputFname, refFastaFname, alleleLength, outputFormatType))
    vcfFile = VCFFile(inputFname=inputFname)
    counter = 0
    real_counter = 0
    # "with" guarantees the output is flushed and closed, also on error
    with open(outputFname, 'w') as outf:
        refFastaFile = FastaFile(inputFname=refFastaFname)
        for vcfRecord in vcfFile:
            counter += 1
            if alleleLength and (len(vcfRecord.refBase) != alleleLength
                    or len(vcfRecord.altBase) != alleleLength):
                continue
            real_counter += 1
            refBase = vcfRecord.refBase
            stopPos = vcfRecord.pos + len(refBase) - 1
            SNP_ID = '%s_%s_%s_%s_%s' % (vcfRecord.chr, vcfRecord.pos, stopPos,
                vcfRecord.refBase, vcfRecord.altBase)

            flankSeqStart = max(1, vcfRecord.pos - flankingLength)
            flankSeqStop = stopPos + flankingLength
            # positionInFlank is 1-based. It equals flankingLength+1 unless the
            # left flank was clipped at the chromosome start.
            positionInFlank = vcfRecord.pos - flankSeqStart + 1
            fastaTitle = '%s_positionInFlank%s' % (SNP_ID, positionInFlank)

            flankingSequence = refFastaFile.getSequence(vcfRecord.chr,
                start=flankSeqStart, stop=flankSeqStop)
            if flankingSequence:
                if outputFormatType == 1:
                    outf.write(">%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                else:
                    outf.write("@%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                    outf.write("+\n")
                    outf.write("%s\n" % ('H' * len(flankingSequence)))

    vcfFile.close()
    refFastaFile.close()
    sys.stderr.write("%s loci (%s total) written out.\n" % (real_counter, counter))
def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
    """
    Parse a VCF file and write its genotype call matrix via self.outputCallMatrix().

    The sample index, locus index and call matrix of the parsed VCFFile are
    passed on as they are.

    NOTE(review): the original note below says the (chr, pos)-keyed locus
    index needs a conversion, but none is done here -- confirm what
    outputCallMatrix() expects.

    2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos;
        need a conversion in between
    2012.5.8
    """
    parsedVCF = VCFFile(inputFname=inputFname)
    parsedVCF.parseFile()

    sampleColumnIndex = parsedVCF.sample_id2index
    locusRowIndex = parsedVCF.locus_id2row_index
    callMatrix = parsedVCF.genotype_call_matrix
    self.outputCallMatrix(callMatrix, refFastaFname=None, outputFname=outputFname,
        refNameSet=None, read_group2col_index=sampleColumnIndex,
        locus_id2row_index=locusRowIndex, outputDelimiter=self.outputDelimiter)
def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
        defaultNullFrequency=-0, **keywords):
    """
    Write the alternative-allele frequencies of several VCF files side by side.

    inputFnameLs: list of VCF files.
    inputHeaderLs: one column header per input file.
    outputFname: tab-delimited output. Columns: locusID (the locus key's
        elements joined by '_'), one frequency column per input, and a
        constant count of 1.
    defaultNullFrequency: value written when a locus is absent from an input.

    Rows are written in sorted order of the locus keys.

    2012.10.5
    """
    sys.stderr.write("Getting allele frequency from %s input ..." %
        (len(inputFnameLs)))
    #get locus2AF from inputFname
    locus2frequencyList = []
    locus_id_set = set()
    for inputFname in inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
        vcfFile.close()
        locus2frequencyList.append(locus2frequency)
        locus_id_set.update(locus2frequency.keys())
    sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

    sys.stderr.write(
        "Outputting frequency collected from all input to %s ..." %
        (outputFname))
    #output them in juxtaposition
    # "with" closes the file deterministically; the former "del writer"
    # relied on garbage collection to flush the output.
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['locusID'] + inputHeaderLs + ['count'])
        for locus_id in sorted(locus_id_set):
            data_row = ['_'.join(map(str, locus_id))]
            data_row.extend(locus2frequency.get(locus_id, defaultNullFrequency)
                for locus2frequency in locus2frequencyList)
            data_row.append(1)
            writer.writerow(data_row)
    sys.stderr.write("\n")
def getAllInfoTags(self, inputFname=None, **keywords):
    """
    Return the set of all INFO tags used by the records of a VCF file.

    Every record of inputFname is read and the keys of its info_tag2value
    are collected into one set, which is returned.

    2013.07.10 not used right now.
    """
    sys.stderr.write("Extracting info tags from VCF %s ..." % (inputFname))
    uniqueInfoTags = set()
    vcfFile = VCFFile(inputFname=inputFname)
    for vcfRecord in vcfFile:
        # iterating info_tag2value yields its tags
        uniqueInfoTags.update(vcfRecord.info_tag2value)
    vcfFile.close()
    sys.stderr.write("%s unique info tags.\n" % (len(uniqueInfoTags)))
    return uniqueInfoTags
def openOneInputFile(self, inputFname=None):
    """
    Open inputFname with the reader class selected by self.inputFileFormat.

    2: YHFile (read mode, table self.h5TableName)
    3: HDF5MatrixFile (read mode)
    4: VCFFile
    anything else: MatrixFile

    Returns the opened reader.

    2013.09.05 split out of fileWalker() , added VCFFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat == 2:
        return YHFile(inputFname, mode='r', tableName=self.h5TableName)
    if fileFormat == 3:
        return HDF5MatrixFile(inputFname, mode='r')
    if fileFormat == 4:
        return VCFFile(inputFname=inputFname)
    # default: plain matrix file
    return MatrixFile(inputFname)
def setup(self, **keywords):
    """
    Open the output/original VCF files and load all Beagle input files.

    The output VCF (self.outputFname) gets the meta-info lines and header of
    self.originalVCFFname and they are written out right away. Each file in
    self.inputFnameLs is then opened as a BeagleGenotypeFile, its haplotypes
    are read, and every sample ID in it is mapped to that file object in
    self.sampleID2BeagleFile (a later file wins when a sample ID repeats).

    2013.05.30 AbstractMatrixFileWalker.setup() is not called, in order to
        open the output file differently.
    2012.10.15 run before anything is run
    """
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')
    self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    # read all the Beagle files and remember which file holds which sample
    sampleToBeagle = {}
    for beaglePath in self.inputFnameLs:
        beagleFile = BeagleGenotypeFile(path=beaglePath)
        beagleFile.readInAllHaplotypes()
        sampleToBeagle.update(
            (individualID, beagleFile) for individualID in beagleFile.sampleIDList)
    self.sampleID2BeagleFile = sampleToBeagle
def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
    """
    Group the records of a VCF file by (chromosome, position).

    returnType
        1: each key maps to a list with one entry per record at that
            position, each entry being vcfRecord.data_row[1:].
        anything else: each key maps to the number of records at that position.

    Returns a PassingData whose snp_pos2returnData attribute is that dictionary.

    2013.07.19 bugfix
    2013.07.11
    """
    sys.stderr.write("Finding SNPs that have same positions from %s ..." %
        (inputFname))
    collectVectors = (returnType == 1)
    noOfRecords = 0
    noOfRepeatedPositions = 0
    snp_pos2returnData = {}

    reader = VCFFile(inputFname=inputFname)
    for vcfRecord in reader:
        position_key = (vcfRecord.chromosome, vcfRecord.position)
        if position_key in snp_pos2returnData:
            noOfRepeatedPositions += 1
        elif collectVectors:
            snp_pos2returnData[position_key] = []
        else:
            snp_pos2returnData[position_key] = 0

        if collectVectors:
            #[0] is reference
            snp_pos2returnData[position_key].append(vcfRecord.data_row[1:])
        else:
            snp_pos2returnData[position_key] += 1
        noOfRecords += 1
    reader.close()

    sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
        (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions))
    return PassingData(snp_pos2returnData=snp_pos2returnData)
def run(self):
    """
    Copy the input VCF only if its switch density is low enough.

    The switch density is read from self.switchPointFname. The output always
    receives the input's meta-info and header; the records are copied only
    when the density is <= self.maxSwitchDensity.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    switchData = self.readInSwitchDensity(inputFname=self.switchPointFname)
    switchDensity = switchData.switchDensity

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    # every record that is read is also written, so one tally serves both
    noOfRecords = 0
    if switchDensity <= self.maxSwitchDensity:
        for vcfRecord in reader:  #assuming input VCF is sorted
            writer.writeVCFRecord(vcfRecord)
            noOfRecords += 1

    reader.close()
    writer.close()
    sys.stderr.write("%s (out of %s) records outputted.\n" %
        (noOfRecords, noOfRecords))
def run(self):
    """
    Mark self.sampleID's genotype missing at loci with poor alignment support.

    For every record of self.inputFname the reads around the locus are
    fetched from self.alignmentFilename and evaluated by
    self.returnLocusLowMapQualityIndicator(). A locus is low-quality when
    that indicator is non-zero and/or its read count lies outside
    [alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold].
    One row of statistics per locus goes to self.missingStatFname. For
    low-quality loci the sample's genotype is set to "./.". Every record is
    written to self.outputFname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome,
            vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
            minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0  #good
        else:
            locusOutOfDepthIndicator = 1

        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator > 0:
            real_counter += 1
            #modify the VCF record
            #get sample ID column, then set its genotype missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
        #2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    alignmentFile.close()  # was left open before
    statWriter.close()
    writer.close()

    # an empty input VCF must not crash the summary with a ZeroDivisionError
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
class AddMissingInfoDescriptionToVCFHeader(ParentClass):
    # re-uses the module docstring as the class docstring
    __doc__ = __doc__
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({})

    # INFO tag -> full "##INFO=..." header line (newline included) to add to the output
    knownInfoTag2DescriptionLine = {"LDAF": """##INFO=<ID=LDAF,Number=1,Type=Float,Description="MLE Allele Frequency Accounting for LD. Range: 0 - 1">\n""",\
        "ERATE": """##INFO=<ID=ERATE,Number=1,Type=Float,Description="Per-marker Mutation rate from MaCH/Thunder. Range: 0.0001 - 0.2051">\n""",\
        "AVGPOST": """##INFO=<ID=AVGPOST,Number=1,Type=Float,Description="Average posterior probability from MaCH/Thunder. Range: 0.5242 - 1">\n""",\
        "RSQ": """##INFO=<ID=RSQ,Number=1,Type=Float,Description="Genotype imputation quality from MaCH/Thunder. Range:0 - 1">\n""",\
        "THETA": """##INFO=<ID=THETA,Number=1,Type=Float,Description="Per-marker Transition rate from MaCH/Thunder. Range:0 - 0.1493">\n""",\
        "AC_Orig": """##INFO=<ID=AC_Orig,Number=1,Type=Integer,Description="Original AC">\n""",\
        "AF_Orig": """##INFO=<ID=AF_Orig,Number=1,Type=Float,Description="Original AF">\n""",\
        "AN_Orig": """##INFO=<ID=AN_Orig,Number=1,Type=Integer,Description="Original AN">\n""",\
        }

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to ParentClass.__init__().
        """
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def getAllInfoTags(self, inputFname=None, **keywords):
        """
        Return the set of all INFO tags used by the records of inputFname.

        2013.07.10 not used right now.
        """
        sys.stderr.write("Extracting info tags from VCF %s ..." % (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)
        info_tag_set = set()
        for vcfRecord in vcfFile:
            # iterating info_tag2value yields its tags
            info_tag_set.update(vcfRecord.info_tag2value)
        vcfFile.close()
        sys.stderr.write("%s unique info tags.\n" % (len(info_tag_set)))
        return info_tag_set

    def run(self):
        """
        Copy self.inputFname to self.outputFname, adding the known INFO description lines.

        The output header consists of the input's meta-info lines plus every
        description in knownInfoTag2DescriptionLine that is not already
        present verbatim. All records are written out unchanged.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        self.reader = VCFFile(inputFname=self.inputFname)
        self.writer = VCFFile(outputFname=self.outputFname, mode='w')

        # Work on a copy: appending to the reader's own list would silently
        # modify the reader's header as a side effect.
        metaInfoLs = list(self.reader.metaInfoLs)
        for description in self.knownInfoTag2DescriptionLine.values():
            # skip lines that are already in the header to avoid exact duplicates
            if description not in metaInfoLs:
                metaInfoLs.append(description)
        self.writer.metaInfoLs = metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        for vcfRecord in self.reader:
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
def run(self):
    """
    Keep only the VCF records whose per-locus statistic lies within [minValue, maxValue].

    The statistics are loaded from self.statFname by the function registered
    under self.runType in self.getLocusID2StatFunctionDict, and are looked up
    by (chromosome, position, position). Records without a statistic are
    dropped. self.minValue / self.maxValue are ignored when None.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    loadStatFunction = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = loadStatFunction(self.statFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for vcfRecord in reader:  #assuming input VCF is sorted
        noOfRecords += 1
        stat = locusID2Stat.get(
            (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position))
        if stat is None:
            continue
        belowMinimum = self.minValue is not None and stat < self.minValue
        aboveMaximum = self.maxValue is not None and stat > self.maxValue
        if belowMinimum or aboveMaximum:
            continue
        noOfRetained += 1
        writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 flags "no records at all"
    fraction = noOfRetained/float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,\
        noOfTotalSites=None):
    """
    Split a VCF file into several "<outputFnamePrefix>_unit<N>.vcf" files.

    inputFname: VCF file to split.
    outputFnamePrefix: prefix of the output file names.
    noOfOverlappingSites: passed as "o" to utils.getNoOfUnitsNeededToCoverN();
        records near the end of one unit are repeated at the start of the next.
    noOfSitesPerUnit: passed as "s" to utils.getNoOfUnitsNeededToCoverN().
    noOfTotalSites: total number of sites in the input, used to derive the
        number of units.
        NOTE(review): the default None is passed straight to
        utils.getNoOfUnitsNeededToCoverN() -- presumably callers must supply
        a number; confirm.

    2012.8.25
    """
    sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(inputFname, noOfSitesPerUnit,\
        noOfOverlappingSites))

    vcfFile = VCFFile(inputFname=inputFname)

    # unit number (1-based) -> output VCFFile of that unit
    unitNumber2OutVCFFile = {}
    counter = 0
    real_counter = 0
    #make it 1 less than total so the last unit is >=s
    noOfUnits = max(1, utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, s=noOfSitesPerUnit, o=noOfOverlappingSites)-1)
    sys.stderr.write(" will be split into %s units ... "%(noOfUnits))
    # records of the current unit that have to be repeated in the next unit
    overlappingRecordLs = []
    for vcfRecord in vcfFile:
        counter += 1
        #below the maximum: noOfUnits.
        unitNumber = min(noOfUnits, max(1, utils.getNoOfUnitsNeededToCoverN(N=counter, s=noOfSitesPerUnit, o=noOfOverlappingSites)))
        if unitNumber not in unitNumber2OutVCFFile:
            # first record of a new unit: open its output file and write the header
            outputFname = '%s_unit%s.vcf'%(outputFnamePrefix, unitNumber)
            outVCFFile = VCFFile(outputFname=outputFname)
            outVCFFile.metaInfoLs = vcfFile.metaInfoLs
            outVCFFile.header = vcfFile.header
            outVCFFile.writeMetaAndHeader()
            # ad-hoc attribute: number of records written to this unit so far
            outVCFFile.noOfLoci =0
            #output the overlapping vcf records (from previous unit
            if overlappingRecordLs:
                for overlappingVCFRecord in overlappingRecordLs:
                    outVCFFile.writeVCFRecord(overlappingVCFRecord)
                    outVCFFile.noOfLoci += 1
                overlappingRecordLs = []	#reset it
            unitNumber2OutVCFFile[unitNumber] = outVCFFile
        outVCFFile = unitNumber2OutVCFFile[unitNumber]
        outVCFFile.writeVCFRecord(vcfRecord)
        outVCFFile.noOfLoci += 1
        #store the overlapping records
        # (the last unit has no successor, so nothing is stored for it)
        if unitNumber<noOfUnits:
            if outVCFFile.noOfLoci>(noOfSitesPerUnit-noOfOverlappingSites):
                overlappingRecordLs.append(vcfRecord)

    vcfFile.close()
    #close all output files
    for unitNumber, outVCFFile in unitNumber2OutVCFFile.items():
        outVCFFile.close()
    sys.stderr.write("%s loci split into %s files.\n"%(counter, len(unitNumber2OutVCFFile)))
def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
        familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
        minProbForValidCall=0.9, markersFile=None):
    """
    Split a VCF (plus a Beagle likelihood file read in lockstep) into per-family-size Beagle files.

    inputFname: VCF file; one record is consumed per locus.
    beagleLikelihoodFile: iterator over loci; next() is called once per VCF
        record, so it is assumed to hold the same loci in the same order
        (NOTE(review): nothing here verifies that -- confirm).
    familySize2BeagleFileHandler: family size -> writer with a writerow() method.
    pedigreeFamilyData: object whose familySize2SampleIDList maps family
        size -> list of sample IDs.
    minProbForValidCall: minimum of the largest genotype likelihood for the
        Beagle call to be considered valid.
    markersFile: optional writer; gets one row
        (markerID, 2nd ':'-separated field of markerID, alleleA, alleleB) per locus.

    Family size 1 (singletons): the three likelihoods of each sample are
    written (likelihood format). Other sizes: the VCF genotype is written
    when self.checkConcordanceBetweenBeagleAndVCFCall() accepts it against
    the Beagle call; otherwise ['?', '?'].

    2013.05.03
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
    """
    sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
    counter = 0
    # NOTE: the three tallies below count calls (sample x locus), not families
    no_of_trios = 0
    no_of_duos = 0
    no_of_singletons = 0
    totalNoOfCalls = 0
    noOfCallsMarkedMissing = 0
    vcfFile = VCFFile(inputFname=inputFname)
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    for vcfRecord in vcfFile:
        # advance the Beagle likelihood file by one locus, in lockstep with the VCF
        oneLocus = next(beagleLikelihoodFile)
        counter += 1
        # family size -> flat list of values that form this locus' output row
        familySize2CallList = {}
        genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2CallList:
                familySize2CallList[familySize] = []
            for sampleID in sampleIDList:
                totalNoOfCalls += 1
                vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                    sampleID)
                tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                    oneLocus=oneLocus, sampleID=sampleID)
                if familySize == 1:
                    # singletons keep the raw likelihoods
                    no_of_singletons += 1
                    familySize2CallList[familySize].extend(
                        tripleLikelihood)
                else:
                    if familySize == 2:
                        no_of_duos += 1
                    elif familySize == 3:
                        no_of_trios += 1
                    tripleLikelihood = list(map(float, tripleLikelihood))
                    maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                    maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                    if maxLikelihood >= minProbForValidCall:
                        # index 0/1/2 of the largest likelihood -> AA / AB / BB
                        if maxLikelihoodIndex == 0:
                            diploidCallFromBeagle = [
                                oneLocus.alleleA, oneLocus.alleleA
                            ]
                        elif maxLikelihoodIndex == 1:
                            diploidCallFromBeagle = [
                                oneLocus.alleleA, oneLocus.alleleB
                            ]
                        else:
                            diploidCallFromBeagle = [
                                oneLocus.alleleB, oneLocus.alleleB
                            ]
                    else:
                        noOfCallsMarkedMissing += 1
                        diploidCallFromBeagle = ['?', '?']
                    # vcfGenotypeCallData may be None (original note: "DP is zero");
                    # such calls fall through to the missing call below.
                    if vcfGenotypeCallData and \
                        self.checkConcordanceBetweenBeagleAndVCFCall(vcfGenotypeCallData['GT'], diploidCallFromBeagle):
                        # the Beagle call only gates the output; the VCF genotype is what gets written
                        diploidCall = [
                            vcfGenotypeCallData['GT'][0],
                            vcfGenotypeCallData['GT'][1]
                        ]
                    else:
                        diploidCall = ['?', '?']
                    familySize2CallList[familySize].extend(diploidCall)

        # one output row per family size for this locus
        for familySize, callList in familySize2CallList.items():
            if familySize == 1:
                rowHeaderList = [
                    oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                ]
            else:
                rowHeaderList = ['M', oneLocus.markerID]
            beagleFileHandler = familySize2BeagleFileHandler[familySize]
            beagleFileHandler.writerow(rowHeaderList + callList)
        if markersFile is not None:
            markersFile.writerow([
                oneLocus.markerID,
                oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                oneLocus.alleleB
            ])
    vcfFile.close()
    sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
        (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
class LiftOverVCFBasedOnCoordinateMap(ParentClass):
    # re-uses the module docstring as the class docstring
    __doc__ = __doc__
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({
        ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],\
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to ParentClass.__init__().
        """
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the old->new coordinate map.

        Returns a dictionary: (queryChromosome, queryStart) -> list of
        PassingData, one per row, with attributes strand, oldRefBase,
        oldAltBase, newChromosome, newStart, newStop, newRefBase.

        2013.07.11
            header of the input file:
            querySNPID queryStrand queryChromosome queryStart queryStop
            queryRefBase queryAltBase queryAlignmentSpan queryAlignmentStart
            queryAlignmentStop newChr newRefStart newRefStop newRefBase
            targetAlignmentSpan targetAlignmentStart targetAlignmentStop
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
            (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(path=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader("queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(strand=row[strandIndex],
                oldRefBase=row[oldRefBaseIndex],
                oldAltBase=row[oldAltBaseIndex],
                newChromosome=row[newChromosomeIndex],
                newStart=int(row[newStartIndex]),
                newStop=int(row[newStopIndex]),
                newRefBase=row[newRefBaseIndex])
            oldCoordinate2newCoordinateDataLs.setdefault(key, []).append(
                newCoordinateData)
            counter += 1
        # close explicitly instead of relying on "del" + garbage collection
        reader.close()
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
            (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Rewrite self.inputFname with coordinates taken from self.coordinateMapFname.

        Records without a new coordinate, or with more than one, are
        dropped. For the rest the chromosome and position are replaced, and
        the ref/alt alleles are set from the map's old ref/alt bases
        (reverse-complemented when the strand is '-').
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)
        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0
        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous mapping: discard rather than guess
                noOfRecordsWithMultiNewCoords += 1
                continue

            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # NOTE(review): these are Seq objects while the '+' branch passes
                # the raw map values -- confirm the setters accept both.
                newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase
            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()

        # an empty input VCF must not crash the summary with a ZeroDivisionError
        if counter > 0:
            fraction = real_counter/float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
            fraction, noOfRecordsWithMultiNewCoords))
def run(self):
    """
    Keep only the VCF records whose lift-over map p-value exceeds the threshold.

    The p-values are loaded from self.liftOverLocusMapPvalueFname and looked
    up by (chromosome, position, position). Records without a p-value, or
    with one that is not greater than self.minLiftOverMapPvalue, are dropped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
        self.liftOverLocusMapPvalueFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for vcfRecord in reader:  #assuming input VCF is sorted
        noOfRecords += 1
        mapPvalue = locusNewID2mapPvalue.get(
            (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position))
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRetained += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 flags "no records at all"
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10, **keywords):
    """
    Remove loci that are closer than minNeighborDistance to a neighbour on the same chromosome.

    Works with one record of look-behind: a record is written only once the
    next record (or a chromosome change, or the end of the file) shows that
    it is far enough from both of its neighbours. Loci are assumed to be in
    chromosomal order.

    2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos;
        need a conversion in between
    2012.5.8
    """
    sys.stderr.write(
        "Filtering VCF %s to get rid of SNPs that are %s distance apart ..." %
        (inputFname, minNeighborDistance))
    vcfFile = VCFFile(inputFname=inputFname)

    outVCFFile = VCFFile(outputFname=outputFname)
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
    outVCFFile.header = vcfFile.header
    outVCFFile.writeMetaAndHeader()

    pendingRecord = None
    # True if the pending record is too close to the record before it
    pendingIsBad = False
    noOfInputLoci = 0
    for vcfRecord in vcfFile:
        if pendingRecord is not None:
            tooClose = pendingRecord.chr == vcfRecord.chr and \
                abs(vcfRecord.pos - pendingRecord.pos) < minNeighborDistance
            if tooClose:
                # both the pending and (by staying flagged) the current record are bad
                pendingIsBad = True
            else:
                # far enough from the current record (or last record of its chromosome)
                if not pendingIsBad:
                    outVCFFile.writeVCFRecord(pendingRecord)
                pendingIsBad = False
        pendingRecord = vcfRecord
        noOfInputLoci += 1
    vcfFile.close()

    #handle the last record
    if pendingRecord is not None and not pendingIsBad:
        outVCFFile.writeVCFRecord(pendingRecord)
    outVCFFile.close()

    noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
    noOfFilteredLoci = noOfInputLoci - noOfLociAfterFilter
    fraction = noOfFilteredLoci / float(noOfInputLoci) if noOfInputLoci > 0 else -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
        (noOfFilteredLoci, noOfInputLoci, noOfLociAfterFilter, fraction * 100))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
    # re-uses the module docstring as the class docstring
    __doc__ = __doc__
    # copy first: update() on the parent's dictionary itself would leak these
    # options into AbstractMatrixFileWalker and every other subclass of it
    option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
    option_default_dict.update({
        ('replicateIndividualTag', 0, ): ['copy', '', 1, 'the tag that separates the true ID and its replicate count'],\
        ('originalVCFFname', 1, ): ['', '', 1, 'original VCF file on which both Beagle phased output and output VCF will be based. \n\
 The output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],\
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to AbstractMatrixFileWalker.__init__().
        """
        AbstractMatrixFileWalker.__init__(self, inputFnameLs=inputFnameLs, **keywords)
        #a map from one sample to specific beagle file
        self.sampleID2BeagleFile = None

    def setup(self, **keywords):
        """
        Open the output/original VCF files and load all Beagle input files.

        Every sample ID of every Beagle file in self.inputFnameLs is mapped
        to its file object in self.sampleID2BeagleFile.

        2013.05.30 AbstractMatrixFileWalker.setup() is not called, in order
            to open the output file differently.
        2012.10.15 run before anything is run
        """
        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for inputFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(path=inputFname)
            beagleFile.readInAllHaplotypes()
            for individualID in beagleFile.sampleIDList:
                sampleID2BeagleFile[individualID] = beagleFile
        self.sampleID2BeagleFile = sampleID2BeagleFile

    def reduce(self, **keywords):
        """
        Replace each genotype of the original VCF with the phased Beagle genotype and write it out.

        Raises ValueError when a sample of the VCF is not found in any
        Beagle input file.

        2012.10.15 run after all files have been walked through
        """
        counter = 0
        no_of_loci = 0
        for vcfRecord in self.reader:
            for sampleID in vcfRecord.sample_id2index:
                beagleFile = self.sampleID2BeagleFile.get(sampleID)
                if beagleFile is None:
                    # fail with a clear message rather than an AttributeError on None
                    raise ValueError(
                        "sampleID %s is not affiliated with any Beagle file." % (sampleID))
                # NOTE(review): locusID=None presumably makes the Beagle file
                # return its next locus in order -- confirm.
                beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(sampleID=sampleID, locusID=None)
                vcfRecord.setGenotypeCallForOneSample(sampleID=sampleID,
                    genotype='%s|%s'%(beagleGenotype[0], beagleGenotype[1]))
                counter += 1
            self.writer.writeVCFRecord(vcfRecord)
            no_of_loci += 1
        sys.stderr.write("%s genotypes, %s loci.\n"%(counter, no_of_loci))

        #close the self.invariantPData.writer and self.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
def extractSamples(self, db_main=None, inputFname=None, outputFname=None,
        tax_id_set=None, site_id_set=None, country_id_set=None,
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,
        **keywords):
    """
    Select the samples of a VCF whose alignments pass the database filter,
    and write either a sub-VCF or a plain list of the selected sample IDs.

    Arguments:
        db_main: database handle providing parseAlignmentReadGroup() and
            filterAlignments().
        inputFname: path of the input VCF.
        outputFname: path of the output file.
        tax_id_set, site_id_set, country_id_set, min_coverage, max_coverage,
            is_contaminated: forwarded to db_main.filterAlignments().
        outputFormat: 1 = VCF restricted to the selected sample columns;
            2 = one sample ID per line preceded by a "sampleID" header line;
            3 = one sample ID per line without header.
            Any other value produces no output file.

    The process exits with status 3 as soon as a VCF sample has no
    individualAlignment in the database.

    2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
    2013.04.30 added argument min_coverage, max_coverage
    2012.10.10 added argument outputFormat.
    2012.10.5
    """
    sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
        (inputFname,
        getattr(site_id_set, '__len__', returnZeroFunc)(),
        getattr(country_id_set, '__len__', returnZeroFunc)(),
        getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,
        outputFormat, is_contaminated))
    vcfFile = VCFFile(inputFname=inputFname)

    oldHeader = vcfFile.header
    oldHeaderLength = len(oldHeader)
    # anything before the samples are same
    newHeader = oldHeader[:vcfFile.sampleStartingColumn]
    no_of_samples = 0
    # this structure stores the selected samples and their column index
    col_index2sampleID = {}
    for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
        individualAlignment = db_main.parseAlignmentReadGroup(
            individual_name).individualAlignment
        if individualAlignment is not None:
            filteredAlignmentList = db_main.filterAlignments(
                alignmentLs=[individualAlignment], min_coverage=min_coverage,
                max_coverage=max_coverage, individual_site_id=None,
                sequence_filtered=None, individual_site_id_set=site_id_set,
                mask_genotype_method_id=None, parent_individual_alignment_id=None,
                country_id_set=country_id_set, tax_id_set=tax_id_set,
                excludeContaminant=False,
                is_contaminated=is_contaminated, excludeTissueIDSet=None,
                local_realigned=None, reduce_reads=None, report=False)
            if filteredAlignmentList:    # non-empty, passed the filter
                newHeader.append(individual_name)
                no_of_samples += 1
                col_index2sampleID[col_index] = individual_name
        else:
            sys.stderr.write(
                "Warning: no individualAlignment for sample %s.\n" % (individual_name))
            sys.exit(3)

    no_of_snps = 0
    if outputFormat == 1:
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        # The set of selected columns is the same for every record, so work
        # it out once (in ascending column order) instead of per record.
        selectedColumnIndexList = [
            i for i in range(vcfFile.sampleStartingColumn, oldHeaderLength)
            if i in col_index2sampleID]
        for vcfRecord in vcfFile:
            data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
            for i in selectedColumnIndexList:
                data_row.append(vcfRecord.row[i])
            outVCFFile.writer.writerow(data_row)
            no_of_snps += 1
        outVCFFile.close()
    elif outputFormat in [2, 3]:
        # "with" guarantees the handle is released even if a write fails
        with open(outputFname, 'w') as outf:
            if outputFormat == 2:
                outf.write("sampleID\n")
            for sampleID in col_index2sampleID.values():
                outf.write("%s\n" % (sampleID))
    vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n" % (no_of_samples, no_of_snps))
def setup(self, **keywords):
    """
    2012.10.15
        run before anything is run

    Loads the pairwise kinship matrix (self.ibdData), the per-alignment
    coverage table, the sample IDs of all input VCF files and the pedigree,
    then assigns every pedigree member that has data a probability mass.

    Sets:
        self.ibdData
        self.individualID2probabilityMass: individual ID -> importance score
        self.individualID2HaplotypeData: individual ID -> None (placeholder)
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

    # read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(
        path=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write(
        "Reading in all samples from %s VCF input files ... \n" %
        (len(self.inputFnameLs)))
    # collect the sample IDs of all input VCF files; haplotypes are not loaded
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        try:
            for individualID in vcfFile.getSampleIDList():
                individualID2HaplotypeData[individualID] = None
        finally:
            # only the sample IDs are needed, so release the file right away
            vcfFile.close()
    sys.stderr.write("%s individuals total.\n" %
        (len(individualID2HaplotypeData)))

    #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
    #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    # shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    cc_subgraph_list = nx.connected_component_subgraphs(
        pGraph.to_undirected())
    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:    # assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(
                individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            # familyCoverage is only known once the whole family has been
            # visited; it is filled in by the second loop below
            familyContext = PassingData(
                familySize=familySize, in_degree=in_degree,
                out_degree=out_degree,
                individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write(
                    "Node %s already in individualID2familyContext.\n" % (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        # set the family coverage for each member, used in weighing the individual.
        # better covered family => better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" % (len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
    sys.stderr.write(
        "Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        # NOTE(review): this normalizes familySize against the container of
        # out-degrees, while the comment above speaks of "no of offspring";
        # familyContext.out_degree may have been intended -- confirm before
        # changing, since it alters every probability mass.
        outDegreeQuotient = outDegreeContainer.normalizeValue(
            familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        individualID2probabilityMass[
            individualID] = representativeImportanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n" %
        (len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def run(self):
    """
    Copy to the output VCF only the input records whose
    (chromosome, position) key maps to the value 1 in the table returned by
    readInSNPID2GenotypeVectorLs(returnType=2), then report on stderr how
    many records were kept, how many were read, and the ratio of the two.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder that will receive the output exists
    parentFolder = os.path.dirname(self.outputFname)
    if parentFolder and not os.path.isdir(parentFolder):
        os.makedirs(parentFolder)

    # first pass over the input: per-position table
    # (presumably an occurrence count, judging by the "unique" report below)
    positionData = self.readInSNPID2GenotypeVectorLs(
        self.inputFname, returnType=2)
    position2occurrence = positionData.snp_pos2returnData

    # second pass: stream records from input to output
    vcfIn = VCFFile(inputFname=self.inputFname)
    vcfOut = VCFFile(outputFname=self.outputFname, mode='w')
    vcfOut.metaInfoLs = vcfIn.metaInfoLs
    vcfOut.header = vcfIn.header
    vcfOut.writeMetaAndHeader()

    noOfRecords = 0
    noOfKept = 0
    for record in vcfIn:    # assuming input VCF is sorted
        noOfRecords += 1
        locus = (record.chromosome, record.position)
        if position2occurrence.get(locus) != 1:
            continue
        vcfOut.writeVCFRecord(record)
        noOfKept += 1
    vcfIn.close()
    vcfOut.close()

    # avoid dividing by zero on an empty input
    keptFraction = noOfKept / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
        (noOfKept, noOfRecords, keptFraction))