def run(self):
    """
    Compare two VCF files: overlapping sites and, optionally, per-sample
    concordance.

    self.inputFname and self.jnputFname are both opened as VCFFile with
    minDepth=self.minDepth and handed to self.calculateOverlappingSites().
    If self.perSampleConcordanceOutputFname is set, the same two files are
    also passed to self.calculatePerSampleMismatchFraction(), together with
    the overlapping_sample_id_set returned by the first call.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    firstVCF = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth)
    # NOTE(review): "jnputFname" looks like the attribute for the 2nd input
    # (i -> j) rather than a typo of inputFname -- confirm with the option list.
    secondVCF = VCFFile(inputFname=self.jnputFname, minDepth=self.minDepth)

    overlapData = self.calculateOverlappingSites(
        vcfFile1=firstVCF,
        vcfFile2=secondVCF,
        outputFname=self.outputFname,
        overlappingSitesOutputFname=self.overlappingSitesOutputFname,
        chromosome=self.chromosome,
        chrLength=self.chrLength)

    if not self.perSampleConcordanceOutputFname:
        return
    self.calculatePerSampleMismatchFraction(
        vcfFile1=firstVCF,
        vcfFile2=secondVCF,
        outputFname=self.perSampleConcordanceOutputFname,
        overlapping_sample_id_set=overlapData.overlapping_sample_id_set)
def run(self):
    """
    Copy the input VCF to the output VCF, appending every description line
    of self.knownInfoTag2DescriptionLine to the meta-info lines.

    The output directory is created if it does not exist yet.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    self.reader = VCFFile(inputFname=self.inputFname)
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')

    # no copy is made: writer and reader share one metaInfoLs object, so the
    # appended description lines are visible through the reader as well
    self.writer.metaInfoLs = self.reader.metaInfoLs
    for descriptionLine in self.knownInfoTag2DescriptionLine.values():
        self.writer.metaInfoLs.append(descriptionLine)
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    for vcfRecord in self.reader:
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()
def run(self):
    """
    Copy the input VCF to the output only if the switch density is low enough.

    The switch density comes from self.readInSwitchDensity() applied to
    self.switchPointFname. Meta lines and header are always written; the
    records are copied only when the density does not exceed
    self.maxSwitchDensity, so a too-dense input yields a header-only output.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    switchData = self.readInSwitchDensity(inputFname=self.switchPointFname)
    switchDensity = switchData.switchDensity

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    # both counters advance together: every record that is read is written
    noOfRecordsRead = 0
    noOfRecordsWritten = 0
    if switchDensity <= self.maxSwitchDensity:
        for vcfRecord in reader:
            noOfRecordsRead += 1
            noOfRecordsWritten += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()
    sys.stderr.write("%s (out of %s) records outputted.\n" %
        (noOfRecordsWritten, noOfRecordsRead))
def run(self):
    """
    Mark genotypes of self.sampleID missing at loci with poor alignment
    support.

    For every record of the input VCF, the reads overlapping the locus are
    fetched from self.alignmentFilename. A locus is flagged when
    self.returnLocusLowMapQualityIndicator() flags it, or when its read
    count lies outside [alignmentMedianDepth/alignmentDepthFold,
    alignmentMedianDepth*alignmentDepthFold]. At flagged loci the genotype
    of self.sampleID is set to "./.". Every record, modified or not, is
    written to the output VCF, and one statistics row per locus is written
    to self.missingStatFname.

    An input without records no longer raises ZeroDivisionError in the final
    report (fraction is reported as -1), and the alignment file is closed.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1,
            vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(
            alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator

        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0    #good
        else:
            locusOutOfDepthIndicator = 1

        # sum of both indicators; any value > 0 means "mark missing"
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position,
            vcfRecord.position, 1, locusLowQualityIndicator,
            locusLowMapQData.fractionOfGoodRead,
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)

        if locusLowQualityIndicator > 0:
            real_counter += 1
            #modify the VCF record: set the genotype of this sample missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.",
                convertGLToPL=True)
        #2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    alignmentFile.close()
    statWriter.close()
    writer.close()

    # same guard (and same -1 placeholder) as the other filter programs use
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10,
    **keywords):
    """
    Write to outputFname only those records of inputFname that are at least
    minNeighborDistance away from both neighbouring records on the same
    chromosome.

    A record is written one step late, i.e. once its distance to the next
    record is known. Records are assumed to be in chromosomal order.

    2012.5.8
    """
    sys.stderr.write(
        "Filtering VCF %s to get rid of SNPs that are %s distance apart ..." %
        (inputFname, minNeighborDistance))
    vcfFile = VCFFile(inputFname=inputFname)

    outVCFFile = VCFFile(outputFname=outputFname)
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
    outVCFFile.header = vcfFile.header
    outVCFFile.writeMetaAndHeader()

    pendingRecord = None
    # True if pendingRecord is too close to the record before it
    pendingRecordIsBad = False
    counter = 0
    for vcfRecord in vcfFile:
        if pendingRecord is not None:
            tooClose = pendingRecord.chr == vcfRecord.chr and \
                abs(vcfRecord.pos - pendingRecord.pos) < minNeighborDistance
            if tooClose:
                # taints both the pending record and the current one
                pendingRecordIsBad = True
            else:
                # far enough from the current record (or last record of a
                # chromosome): keep it unless its predecessor tainted it
                if not pendingRecordIsBad:
                    outVCFFile.writeVCFRecord(pendingRecord)
                pendingRecordIsBad = False
        pendingRecord = vcfRecord
        counter += 1
    vcfFile.close()

    #handle the last record
    if pendingRecord is not None and not pendingRecordIsBad:
        outVCFFile.writeVCFRecord(pendingRecord)
    outVCFFile.close()

    # assumes the output VCFFile tracks written loci in locus_id_ls -- TODO confirm
    noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
    delta = counter - noOfLociAfterFilter
    fraction = delta / float(counter) if counter > 0 else -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
        (delta, counter, noOfLociAfterFilter, fraction * 100))
def run(self):
    """
    Rewrite the coordinates and alleles of each VCF record according to the
    map read from self.coordinateMapFname.

    Records without an entry in the map are dropped. Records with more than
    one new coordinate are counted and dropped as well. For entries on the
    '-' strand the reference/alternative bases are reverse-complemented.

    An input without records no longer raises ZeroDivisionError in the final
    report; the fraction is reported as -1 in that case.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
        self.coordinateMapFname)

    self.reader = VCFFile(inputFname=self.inputFname)
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0

    for vcfRecord in self.reader:    #assuming input VCF is sorted
        counter += 1
        key = (vcfRecord.chromosome, vcfRecord.position)
        newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
        if newCoordinateDataLs is None:
            continue
        if len(newCoordinateDataLs) > 1:
            # ambiguous mapping: discard
            noOfRecordsWithMultiNewCoords += 1
            continue

        newCoordinateData = newCoordinateDataLs[0]
        vcfRecord.setChromosome(newCoordinateData.newChromosome)
        vcfRecord.setPosition(newCoordinateData.newStart)
        if newCoordinateData.strand == '-':
            newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
            newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
        else:
            newRefBase = newCoordinateData.oldRefBase
            newAltBase = newCoordinateData.oldAltBase
        vcfRecord.setRefAllele(newRefBase)
        vcfRecord.setAltAllele(newAltBase)

        real_counter += 1
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()

    # same guard (and same -1 placeholder) as the other filter programs use
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))
def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,\
    noOfTotalSites=None):
    """
    Split inputFname into files "<outputFnamePrefix>_unit<N>.vcf".

    The unit of each record is derived from its running index via
    utils.getNoOfUnitsNeededToCoverN() and capped at the number of units
    computed up front from noOfTotalSites. Records written to a unit after
    it already holds more than (noOfSitesPerUnit - noOfOverlappingSites)
    loci are remembered and written again at the start of the next unit.

    2012.8.25
    """
    sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(inputFname, noOfSitesPerUnit,\
        noOfOverlappingSites))
    vcfFile = VCFFile(inputFname=inputFname)

    unitNumber2OutVCFFile = {}
    counter = 0
    #make it 1 less than total so the last unit is >=s
    noOfUnitsToCoverAll = utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites,
        s=noOfSitesPerUnit, o=noOfOverlappingSites)
    noOfUnits = max(1, noOfUnitsToCoverAll - 1)
    sys.stderr.write(" will be split into %s units ... "%(noOfUnits))

    # records of the current unit that must be repeated in the next one
    carryOverRecordLs = []
    overlapThreshold = noOfSitesPerUnit - noOfOverlappingSites
    for vcfRecord in vcfFile:
        counter += 1
        noOfUnitsSoFar = utils.getNoOfUnitsNeededToCoverN(N=counter,
            s=noOfSitesPerUnit, o=noOfOverlappingSites)
        #below the maximum: noOfUnits.
        unitNumber = min(noOfUnits, max(1, noOfUnitsSoFar))

        if unitNumber not in unitNumber2OutVCFFile:
            # first record of a new unit: open its file
            newUnitFile = VCFFile(
                outputFname='%s_unit%s.vcf'%(outputFnamePrefix, unitNumber))
            newUnitFile.metaInfoLs = vcfFile.metaInfoLs
            newUnitFile.header = vcfFile.header
            newUnitFile.writeMetaAndHeader()
            newUnitFile.noOfLoci = 0
            #output the overlapping vcf records (from previous unit)
            for carriedRecord in carryOverRecordLs:
                newUnitFile.writeVCFRecord(carriedRecord)
                newUnitFile.noOfLoci += 1
            carryOverRecordLs = []
            unitNumber2OutVCFFile[unitNumber] = newUnitFile

        unitFile = unitNumber2OutVCFFile[unitNumber]
        unitFile.writeVCFRecord(vcfRecord)
        unitFile.noOfLoci += 1
        #store the overlapping records (the last unit has no successor)
        if unitNumber < noOfUnits and unitFile.noOfLoci > overlapThreshold:
            carryOverRecordLs.append(vcfRecord)

    vcfFile.close()
    #close all output files
    for unitFile in unitNumber2OutVCFFile.values():
        unitFile.close()
    sys.stderr.write("%s loci split into %s files.\n"%(counter, len(unitNumber2OutVCFFile)))
def discoverFromVCFWithoutFilter(self, inputFname=None, outputFname=None, **keywords):
    """
    Parse inputFname as VCF (minDepth=self.minDepth) and write its genotype
    call matrix through self.outputCallMatrix().

    2012.9.11 read minDepth from self.minDepth
    2012.8.20 locus_id2row_index from VCFFile is keyed by (chr, pos); it is
        re-keyed to the string "chr_pos" before being passed on.
    2012.5.8
    """
    vcfFile = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
    vcfFile.parseFile()

    # (chr, pos, ...) -> "chr_pos"
    chrPosID2rowIndex = {
        '%s_%s' % (locusKey[0], locusKey[1]): rowIndex
        for locusKey, rowIndex in vcfFile.locus_id2row_index.items()
    }

    self.outputCallMatrix(vcfFile.genotype_call_matrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=vcfFile.sample_id2index,
        locus_id2row_index=chrPosID2rowIndex,
        outputDelimiter=self.outputDelimiter)
def run(self):
    """
    Keep only the VCF records whose per-locus statistic lies within
    [self.minValue, self.maxValue].

    The statistics are loaded from self.statFname by the loader registered
    under self.runType in self.getLocusID2StatFunctionDict, and looked up by
    (chromosome, position, position). Records without a statistic are
    dropped. A bound that is None is not applied.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    loadLocusID2Stat = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = loadLocusID2Stat(self.statFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRecordsKept = 0
    for vcfRecord in reader:    #assuming input VCF is sorted
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
        stat = locusID2Stat.get(locusKey)
        if stat is None:
            continue
        belowMinimum = self.minValue is not None and stat < self.minValue
        aboveMaximum = self.maxValue is not None and stat > self.maxValue
        if not (belowMinimum or aboveMaximum):
            noOfRecordsKept += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 stands for "undefined" when the input had no records
    fraction = noOfRecordsKept/float(noOfRecords) if noOfRecords>0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRecordsKept, noOfRecords, \
        fraction))
def run(self):
    """
    Keep only the VCF records whose lift-over map p-value is strictly
    greater than self.minLiftOverMapPvalue.

    P-values are loaded from self.liftOverLocusMapPvalueFname via
    self.getLocusNewID2mapPvalue() and looked up by
    (chromosome, position, position). Records without a p-value are dropped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
        self.liftOverLocusMapPvalueFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRecordsKept = 0
    for vcfRecord in reader:    #assuming input VCF is sorted
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
        mapPvalue = locusNewID2mapPvalue.get(locusKey)
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRecordsKept += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 stands for "undefined" when the input had no records
    fraction = noOfRecordsKept / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRecordsKept, noOfRecords, \
        fraction))
def openOneInputFile(self, inputFname=None):
    """
    Open inputFname with the reader class selected by self.inputFileFormat
    and return the reader.

    2: YHFile (table self.h5TableName), 3: HDF5MatrixFile, 4: VCFFile,
    anything else: MatrixFile.

    2013.09.05 split out of fileWalker(), added VCFFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat == 2:
        return YHFile(inputFname, mode='r', tableName=self.h5TableName)
    if fileFormat == 3:
        return HDF5MatrixFile(inputFname, mode='r')
    if fileFormat == 4:
        return VCFFile(inputFname=inputFname)
    return MatrixFile(inputFname)
def setup(self, **keywords):
    """
    Open the output VCF, copy meta lines and header from
    self.originalVCFFname into it, and load every Beagle file of
    self.inputFnameLs.

    Afterwards self.sampleID2BeagleFile maps each sample ID to the
    BeagleGenotypeFile (haplotypes already read in) that contains it.

    2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called,
        because the output file is opened differently here.
    2012.10.15 run before anything is run
    """
    self.writer = VCFFile(outputFname=self.outputFname, mode='w')
    self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    # read all the Beagle files; a sample present in several files ends up
    # mapped to the last file read
    sampleID2BeagleFile = {}
    for beaglePath in self.inputFnameLs:
        beagleFile = BeagleGenotypeFile(path=beaglePath)
        beagleFile.readInAllHaplotypes()
        sampleID2BeagleFile.update(
            (sampleID, beagleFile) for sampleID in beagleFile.sampleIDList)
    self.sampleID2BeagleFile = sampleID2BeagleFile
def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
    outputFormatType=1, alleleLength=1):
    """
    Write, for each locus of the VCF inputFname, the reference sequence
    spanning flankingLength bases on either side of it.

    inputFname: input VCF path
    refFastaFname: reference fasta from which sequences are fetched
    outputFname: output path
    flankingLength: number of bases to take on each side of the locus
    outputFormatType: 1 = fasta; anything else = fastq (constant quality 'H')
    alleleLength: if non-zero, loci whose ref or alt allele has a different
        length are skipped

    Each sequence is titled
    "<chr>_<start>_<stop>_<ref>_<alt>_positionInFlank<N>", N being the
    1-based position of the locus within the written sequence.

    Fixes over the earlier version:
    - the output file is closed deterministically (it used to be released
      with "del outf" only);
    - positionInFlank is computed from the actual flank start, so it stays
      correct when the flank is truncated at the start of the chromosome
      (it used to be flankingLength+1 unconditionally).

    2013.09.03 added argument alleleLength
    2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
    2012.10.8
    """
    sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
        (inputFname, refFastaFname, alleleLength, outputFormatType))
    vcfFile = VCFFile(inputFname=inputFname)
    counter = 0
    real_counter = 0
    with open(outputFname, 'w') as outf:
        refFastaFile = FastaFile(inputFname=refFastaFname)
        for vcfRecord in vcfFile:
            counter += 1
            if alleleLength and (len(vcfRecord.refBase) != alleleLength
                    or len(vcfRecord.altBase) != alleleLength):
                continue
            real_counter += 1

            refBase = vcfRecord.refBase
            stopPos = vcfRecord.pos + len(refBase) - 1
            # the flank cannot start before the first base of the chromosome
            flankSeqStart = max(1, vcfRecord.pos - flankingLength)
            flankSeqStop = stopPos + flankingLength

            SNP_ID = '%s_%s_%s_%s_%s' % (vcfRecord.chr, vcfRecord.pos, stopPos,
                vcfRecord.refBase, vcfRecord.altBase)
            #positionInFlank is 1-based; equals flankingLength+1 unless the
            # flank start was clamped above
            positionInFlank = vcfRecord.pos - flankSeqStart + 1
            fastaTitle = '%s_positionInFlank%s' % (SNP_ID, positionInFlank)

            flankingSequence = refFastaFile.getSequence(vcfRecord.chr,
                start=flankSeqStart, stop=flankSeqStop)
            if not flankingSequence:
                continue
            if outputFormatType == 1:
                outf.write(">%s\n" % (fastaTitle))
                outf.write('%s\n' % (flankingSequence))
            else:
                outf.write("@%s\n" % (fastaTitle))
                outf.write('%s\n' % (flankingSequence))
                outf.write("+\n")
                outf.write("%s\n" % ('H' * len(flankingSequence)))

    vcfFile.close()
    refFastaFile.close()
    # NOTE(review): real_counter counts loci passing the allele-length
    # filter, including those for which no sequence could be fetched.
    sys.stderr.write("%s loci (%s total) written out.\n" %
        (real_counter, counter))
def run(self):
    """
    Keep only the VCF records whose (chromosome, position) occurs exactly
    once in the input.

    The input is read twice: first through
    self.readInSNPID2GenotypeVectorLs(returnType=2) to count occurrences per
    position, then to copy the records whose count is 1.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.dirname(self.outputFname)
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    countData = self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2)
    snp_pos2count = countData.snp_pos2returnData

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfUniqueRecords = 0
    for vcfRecord in reader:    #assuming input VCF is sorted
        noOfRecords += 1
        position = (vcfRecord.chromosome, vcfRecord.position)
        if snp_pos2count.get(position) == 1:
            writer.writeVCFRecord(vcfRecord)
            noOfUniqueRecords += 1

    reader.close()
    writer.close()

    fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
        (noOfUniqueRecords, noOfRecords, fraction))
def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
    """
    Parse the VCF inputFname and write its genotype call matrix through
    self.outputCallMatrix().

    2012.8.20 locus_id2row_index from VCFFile is keyed by (chr, pos), not
        "chr_pos".
        NOTE(review): it is passed on here without conversion -- confirm
        that outputCallMatrix() copes with tuple keys.
    2012.5.8
    """
    vcfFile = VCFFile(inputFname=inputFname)
    vcfFile.parseFile()

    self.outputCallMatrix(vcfFile.genotype_call_matrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=vcfFile.sample_id2index,
        locus_id2row_index=vcfFile.locus_id2row_index,
        outputDelimiter=self.outputDelimiter)
def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
    defaultNullFrequency=-0, **keywords):
    """
    Write a tab-delimited table with the alternative-allele frequency of
    every locus in every input VCF, one column per input.

    inputFnameLs: list of VCF paths
    inputHeaderLs: list of column names, one per input
    outputFname: output path
    defaultNullFrequency: value written where a locus is absent from an
        input

    Output columns: locusID (locus key items joined by "_"), one frequency
    column per input, and a constant "count" column holding 1. Rows are
    sorted by locus key.

    The output file is now closed deterministically through a with-block;
    it used to rely on "del writer" to release the underlying handle.

    2012.10.5
    """
    sys.stderr.write("Getting allele frequency from %s input ..." %
        (len(inputFnameLs)))
    #get locus2AF from each input
    locus2frequencyList = []
    locus_id_set = set()
    for inputFname in inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
        vcfFile.close()
        locus2frequencyList.append(locus2frequency)
        locus_id_set.update(locus2frequency.keys())
    sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

    sys.stderr.write(
        "Outputting frequency collected from all input to %s ..." %
        (outputFname))
    #output them in juxtaposition
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['locusID'] + inputHeaderLs + ['count'])
        for locus_id in sorted(locus_id_set):
            data_row = ['_'.join(map(str, locus_id))]
            data_row.extend(
                locus2frequency.get(locus_id, defaultNullFrequency)
                for locus2frequency in locus2frequencyList)
            data_row.append(1)
            writer.writerow(data_row)
    sys.stderr.write("\n")
def getAllInfoTags(self, inputFname=None, **keywords):
    """
    Return the set of all INFO tags (keys of vcfRecord.info_tag2value) seen
    in any record of the VCF inputFname.

    2013.07.10 not used right now.
    """
    sys.stderr.write("Extracting info tags from VCF %s ..." % (inputFname))
    vcfFile = VCFFile(inputFname=inputFname)
    info_tag_set = set()
    for vcfRecord in vcfFile:
        info_tag_set.update(vcfRecord.info_tag2value)
    vcfFile.close()
    sys.stderr.write("%s unique info tags.\n" % (len(info_tag_set)))
    return info_tag_set
def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
    """
    Group the records of the VCF inputFname by (chromosome, position).

    returnType
        1: each position maps to a list with one entry per record at that
            position, the entry being vcfRecord.data_row[1:] ([0] is the
            reference)
        anything else: each position maps to the number of records at that
            position

    Returns PassingData(snp_pos2returnData=...).

    2013.07.19 bugfix
    2013.07.11
    """
    sys.stderr.write("Finding SNPs that have same positions from %s ..." %
        (inputFname))
    reader = VCFFile(inputFname=inputFname)

    collectGenotypes = returnType == 1
    noOfRecords = 0
    noOfRepeatedPositions = 0
    snp_pos2returnData = {}
    for vcfRecord in reader:
        position = (vcfRecord.chromosome, vcfRecord.position)
        if position in snp_pos2returnData:
            noOfRepeatedPositions += 1
        elif collectGenotypes:
            snp_pos2returnData[position] = []
        else:
            snp_pos2returnData[position] = 0

        # the first record at a position is registered too
        if collectGenotypes:
            snp_pos2returnData[position].append(vcfRecord.data_row[1:])
        else:
            snp_pos2returnData[position] += 1
        noOfRecords += 1
    reader.close()

    sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
        (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions))
    return PassingData(snp_pos2returnData=snp_pos2returnData)
def setup(self, **keywords):
    """
    Prepare the per-individual probability mass before the walker runs.

    Steps:
    1. run AbstractMatrixFileWalker.setup();
    2. read the kinship adjacency list self.pedigreeKinshipFilePath into
        self.ibdData;
    3. read the alignment coverage from self.individualAlignmentCoverageFname;
    4. collect the sample IDs of all VCF files in self.inputFnameLs;
    5. restrict the pedigree (self.pedigreeFname) to those samples and treat
        each connected component as one family;
    6. score each individual as the sum of two normalized values (see the
        review note below) and store the result.

    Sets self.individualID2probabilityMass and
    self.individualID2HaplotypeData (sample ID -> None; haplotypes are not
    loaded here).

    Changes over the earlier version:
    - every input VCF is closed after its sample IDs are read;
    - families are derived with nx.connected_components() + Graph.subgraph()
      instead of nx.connected_component_subgraphs(), which no longer exists
      in NetworkX >= 2.4.

    2012.10.15 run before anything is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    #read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
        rowIDHeader=None, colIDHeader=None, \
        rowIDIndex=0, colIDIndex=1, \
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(
        path=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write(
        "Reading in all samples from %s VCF input files ... \n" %
        (len(self.inputFnameLs)))
    # only the sample IDs are collected; the values stay None
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
        vcfFile.close()
    sys.stderr.write("%s individuals total.\n" %
        (len(individualID2HaplotypeData)))

    #. read in the pedigree
    #. construct individualID2familyContext
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    #shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    undirectedGraph = pGraph.to_undirected()
    cc_subgraph_list = [undirectedGraph.subgraph(nodeSet)
        for nodeSet in nx.connected_components(undirectedGraph)]

    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:
            #assuming each family is a two-generation trio/nuclear family
            individualCoverage = float(self.getIndividualCoverage(
                individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs))
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage

            # degrees come from the directed pedigree graph
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)

            # familyCoverage is filled in below, once the family total is known
            familyContext = PassingData(familySize=familySize, in_degree=in_degree,
                out_degree=out_degree,
                individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write(
                    "Node %s already in individualID2familyContext.\n" %
                    (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        #set the family coverage for each member, used in weighing the
        # individual. better covered family => better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" %
        (len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of
    # offspring => probability mass for each individual
    sys.stderr.write(
        "Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        # NOTE(review): familySize is normalized against the container that
        # was filled with out_degree values; the comment above suggests
        # familyContext.out_degree was intended. Left unchanged because it
        # would alter the resulting weights -- confirm with the author.
        outDegreeQuotient = outDegreeContainer.normalizeValue(
            familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        individualID2probabilityMass[individualID] = importanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n" %
        (len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
    familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
    minProbForValidCall=0.9, markersFile=None):
    """
    Walk the VCF inputFname and beagleLikelihoodFile in lockstep (one locus
    each per step) and write one row per locus to the Beagle file of every
    family size.

    Singletons (familySize 1): the likelihoods of each sample are written
    as they are, after the row header [markerID, alleleA, alleleB].
    Other family sizes: the row header is ['M', markerID]; each sample gets
    its VCF genotype if that is available and
    self.checkConcordanceBetweenBeagleAndVCFCall() accepts it against the
    most likely Beagle genotype, and ['?', '?'] otherwise. The Beagle
    genotype itself is ['?', '?'] when its best likelihood is below
    minProbForValidCall.

    If markersFile is given, one row
    [markerID, <part of markerID after ':'>, alleleA, alleleB] is written
    to it per locus.

    2013.05.03
    The non-likelihood (unphased, trios, pairs) Beagle format:
        I id sample1 sample1 sample2 sample2
        A diabetes 1 1 2 2
        M Contig791:1086 C C C C
        M Contig791:1649 T C C C
        M Contig791:4084 G A A A
    """
    sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
    counter = 0
    no_of_trios = 0
    no_of_duos = 0
    no_of_singletons = 0
    totalNoOfCalls = 0
    noOfCallsMarkedMissing = 0
    missingCall = ['?', '?']

    vcfFile = VCFFile(inputFname=inputFname)
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    for vcfRecord in vcfFile:
        oneLocus = next(beagleLikelihoodFile)
        counter += 1
        familySize2CallList = {}
        # read but not used below
        genotypeLikelihoodList = oneLocus.genotypeLikelihoodList

        for familySize, sampleIDList in familySize2SampleIDList.items():
            callList = familySize2CallList.setdefault(familySize, [])
            for sampleID in sampleIDList:
                totalNoOfCalls += 1
                vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                    sampleID)
                tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                    oneLocus=oneLocus, sampleID=sampleID)

                if familySize == 1:
                    no_of_singletons += 1
                    callList.extend(tripleLikelihood)
                    continue

                if familySize == 2:
                    no_of_duos += 1
                elif familySize == 3:
                    no_of_trios += 1

                # most likely genotype according to Beagle
                tripleLikelihood = list(map(float, tripleLikelihood))
                maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                if tripleLikelihood[maxLikelihoodIndex] < minProbForValidCall:
                    noOfCallsMarkedMissing += 1
                    diploidCallFromBeagle = list(missingCall)
                elif maxLikelihoodIndex == 0:
                    diploidCallFromBeagle = [oneLocus.alleleA, oneLocus.alleleA]
                elif maxLikelihoodIndex == 1:
                    diploidCallFromBeagle = [oneLocus.alleleA, oneLocus.alleleB]
                else:
                    diploidCallFromBeagle = [oneLocus.alleleB, oneLocus.alleleB]

                # the VCF genotype is kept only if it exists and agrees with Beagle
                isConcordant = vcfGenotypeCallData and \
                    self.checkConcordanceBetweenBeagleAndVCFCall(
                        vcfGenotypeCallData['GT'], diploidCallFromBeagle)
                if isConcordant:
                    genotype = vcfGenotypeCallData['GT']
                    callList.extend([genotype[0], genotype[1]])
                else:
                    callList.extend(list(missingCall))

        for familySize, callList in familySize2CallList.items():
            if familySize == 1:
                rowHeaderList = [oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB]
            else:
                rowHeaderList = ['M', oneLocus.markerID]
            familySize2BeagleFileHandler[familySize].writerow(
                rowHeaderList + callList)

        if markersFile is not None:
            markersFile.writerow([oneLocus.markerID,
                oneLocus.markerID.split(':')[1],
                oneLocus.alleleA, oneLocus.alleleB])
    vcfFile.close()
    sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
        (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
def extractSamples(self, db_main=None, inputFname=None, outputFname=None, \
    tax_id_set=None, site_id_set=None, country_id_set=None, \
    min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
    **keywords):
    """
    Select the samples of the VCF inputFname whose alignment passes
    db_main.filterAlignments() and write them to outputFname.

    outputFormat
        1: a VCF restricted to the selected sample columns
        2: a list of the selected sample IDs, preceded by a "sampleID"
            header line
        3: the same list without header
        other values: nothing is written

    The program exits with status 3 as soon as a sample column cannot be
    resolved to an individualAlignment by
    db_main.parseAlignmentReadGroup().

    2013.07.03 added argument is_contaminated (whether to fetch contaminated
        samples or not)
    2013.04.30 added argument min_coverage, max_coverage
    2012.10.10 added argument outputFormat.
    2012.10.5
    """
    sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
        (inputFname,\
        getattr(site_id_set, '__len__', returnZeroFunc)(),\
        getattr(country_id_set, '__len__', returnZeroFunc)(),\
        getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
        outputFormat, is_contaminated ))
    vcfFile = VCFFile(inputFname=inputFname)

    oldHeader = vcfFile.header
    oldHeaderLength = len(oldHeader)
    firstSampleColumn = vcfFile.sampleStartingColumn
    #anything before the samples are same
    newHeader = oldHeader[:firstSampleColumn]

    no_of_samples = 0
    #the selected samples, keyed by their column index
    col_index2sampleID = {}
    for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
        individualAlignment = db_main.parseAlignmentReadGroup(
            individual_name).individualAlignment
        if individualAlignment is None:
            sys.stderr.write(
                "Warning: no individualAlignment for sample %s.\n" %
                (individual_name))
            sys.exit(3)

        filteredAlignmentList = db_main.filterAlignments(
            alignmentLs=[individualAlignment],
            min_coverage=min_coverage,
            max_coverage=max_coverage,
            individual_site_id=None,
            sequence_filtered=None,
            individual_site_id_set=site_id_set,
            mask_genotype_method_id=None,
            parent_individual_alignment_id=None,
            country_id_set=country_id_set,
            tax_id_set=tax_id_set,
            excludeContaminant=False,
            is_contaminated=is_contaminated,
            excludeTissueIDSet=None,
            local_realigned=None,
            reduce_reads=None,
            report=False)
        if filteredAlignmentList:    #non-empty, passed the filter
            newHeader.append(individual_name)
            no_of_samples += 1
            col_index2sampleID[col_index] = individual_name

    no_of_snps = 0
    if outputFormat == 1:
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        # selected sample columns, in increasing column order
        selectedColumnLs = [i for i in range(firstSampleColumn, oldHeaderLength)
            if i in col_index2sampleID]
        for vcfRecord in vcfFile:
            data_row = vcfRecord.row[:firstSampleColumn]
            for i in selectedColumnLs:
                data_row.append(vcfRecord.row[i])
            outVCFFile.writer.writerow(data_row)
            no_of_snps += 1
        outVCFFile.close()
    elif outputFormat in [2, 3]:
        outf = open(outputFname, 'w')
        if outputFormat == 2:
            outf.write("sampleID\n")
        for sampleID in col_index2sampleID.values():
            outf.write("%s\n" % (sampleID))
        outf.close()

    vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n" % (no_of_samples, no_of_snps))