def run(self):
    """
    Mark the genotype of self.sampleID missing at every low-quality locus.

    For each record of the input VCF, the reads overlapping the locus are
    fetched from the alignment file and passed to
    self.returnLocusLowMapQualityIndicator(). A locus is flagged if that
    indicator is non-zero and/or the total number of reads falls outside
    [alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold].
    One stat row per locus is written to self.missingStatFname and every
    record (modified or not) is written to self.outputFname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)

    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
            minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth>=minDepth and depth <=maxDepth:
            locusOutOfDepthIndicator = 0    #good
        else:
            locusOutOfDepthIndicator = 1

        # the sum is written out as the 'missingReason' column
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator>0:
            real_counter += 1
            #modify the VCF record
            #get sample ID column, then set its genotype missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
        #2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    statWriter.close()
    writer.close()
    # the alignment file was previously left open
    alignmentFile.close()

    # guard against an input VCF without any record (same convention as the sibling run() methods)
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10, **keywords):
    """
    2012.5.8
        Copy inputFname to outputFname, leaving out every locus that lies
        less than minNeighborDistance away from a neighbouring locus on the
        same chromosome.

        Records are held back by one iteration: a locus can only be written
        once its distance to the following record is known. Loci are assumed
        to be in chromosomal order.
    """
    sys.stderr.write(
        "Filtering VCF %s to get rid of SNPs that are %s distance apart ..." %
        (inputFname, minNeighborDistance))
    source = VCFFile(inputFname=inputFname)

    sink = VCFFile(outputFname=outputFname)
    sink.metaInfoLs = source.metaInfoLs
    sink.header = source.header
    sink.writeMetaAndHeader()

    pendingRecord = None
    # True if pendingRecord is too close to the record that preceded it
    pendingIsBad = False
    counter = 0
    for currentRecord in source:
        if pendingRecord is not None:
            tooClose = pendingRecord.chr == currentRecord.chr and \
                abs(currentRecord.pos - pendingRecord.pos) < minNeighborDistance
            if tooClose:
                # taints both the pending record and (next round) the current one
                pendingIsBad = True
            else:
                # far enough from the current record (or last locus of its chromosome):
                #   the pending record survives unless its predecessor already tainted it
                if not pendingIsBad:
                    sink.writeVCFRecord(pendingRecord)
                pendingIsBad = False
        pendingRecord = currentRecord
        counter += 1
    source.close()

    # the very last record has no successor to be compared against
    if pendingRecord is not None and not pendingIsBad:
        sink.writeVCFRecord(pendingRecord)
    sink.close()

    noOfLociAfterFilter = len(sink.locus_id_ls)
    delta = counter - noOfLociAfterFilter
    fraction = delta / float(counter) if counter > 0 else -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
        (delta, counter, noOfLociAfterFilter, fraction * 100))
def run(self):
    """
    Rewrite the coordinates of each VCF record according to a coordinate map.

    The map is loaded through self.readInCoordinateMap(self.coordinateMapFname)
    and looked up by (chromosome, position). Records without an entry are
    dropped silently; records with more than one new coordinate are counted
    and dropped. For a '-' strand mapping the ref/alt bases taken from the
    map are reverse-complemented before being set on the record.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
        self.coordinateMapFname)

    self.reader = VCFFile(inputFname=self.inputFname)

    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0

    for vcfRecord in self.reader:    #assuming input VCF is sorted
        counter += 1
        key = (vcfRecord.chromosome, vcfRecord.position)
        newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
        if newCoordinateDataLs is None:
            continue
        if len(newCoordinateDataLs) > 1:
            # ambiguous target, discard
            noOfRecordsWithMultiNewCoords += 1
            continue
        newCoordinateData = newCoordinateDataLs[0]
        vcfRecord.setChromosome(newCoordinateData.newChromosome)
        vcfRecord.setPosition(newCoordinateData.newStart)
        if newCoordinateData.strand == '-':
            newRefBase = Seq(
                newCoordinateData.oldRefBase).reverse_complement()
            newAltBase = Seq(
                newCoordinateData.oldAltBase).reverse_complement()
        else:
            newRefBase = newCoordinateData.oldRefBase
            newAltBase = newCoordinateData.oldAltBase

        vcfRecord.setRefAllele(newRefBase)
        vcfRecord.setAltAllele(newAltBase)
        real_counter += 1
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()

    # guard against an input VCF without any record (same convention as the sibling run() methods)
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))
def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
    """
    2013.07.11
        Index the records of a VCF file by (chromosome, position).

        returnType
            1: each key maps to the list of genotype vectors
                (data_row without its first element) of all records at that position
            anything else: each key maps to the number of records at that position

        Returns a PassingData object whose snp_pos2returnData attribute holds the dictionary.
    """
    sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
    vcfReader = VCFFile(inputFname=inputFname)
    collectVectors = (returnType==1)
    noOfRecords = 0
    noOfRepeatedEntries = 0
    snp_pos2returnData = {}
    for record in vcfReader:
        locus = (record.chromosome, record.position)
        if locus in snp_pos2returnData:
            noOfRepeatedEntries += 1
        elif collectVectors:
            snp_pos2returnData[locus] = []
        else:
            snp_pos2returnData[locus] = 0

        # the first occurrence of a position is recorded as well
        if collectVectors:
            snp_pos2returnData[locus].append(record.data_row[1:])    #[0] is reference
        else:
            snp_pos2returnData[locus] += 1
        noOfRecords += 1
    vcfReader.close()
    sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
        (len(snp_pos2returnData), noOfRecords, noOfRepeatedEntries))
    return PassingData(snp_pos2returnData=snp_pos2returnData)
def calculateSiteGap(self, inputFname, outputFname, chromosome=None, chrLength=None, minDepth=1):
    """
    2011-11-2
        Given a VCF file, output the distance from each site to the next one.

        One tab-delimited row (chromosome, position, length, distanceToNextSite)
        is written for every pair of consecutive records. "position" is the
        position of the earlier record of the pair, "length" is chrLength as
        passed in. The last record has no successor and gets no row.

        NOTE: the chromosome argument is not used and the chromosome of
            consecutive records is not compared; the chromosome column holds
            the chromosome of the later record of each pair.
    """
    sys.stderr.write("Calculate the distances between sites of %s .\n" %
        (inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    # the context manager guarantees the output is flushed and closed
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(
            ['chromosome', 'position', 'length', "distanceToNextSite"])

        previousPosition = None
        for vcfRecord in vcfFile.parseIter():
            pos = int(vcfRecord.pos)
            if previousPosition is not None:
                distanceToNextSite = pos - previousPosition
                writer.writerow([
                    vcfRecord.chr, previousPosition, chrLength,
                    distanceToNextSite
                ])
            previousPosition = pos
    vcfFile.close()
    sys.stderr.write("Done.\n")
def run(self):
    """
    Keep only the VCF records whose per-locus statistic lies within
    [self.minValue, self.maxValue] (either bound may be None, i.e. unbounded).

    The statistic is loaded by the function registered for self.runType in
    self.getLocusID2StatFunctionDict and is looked up by
    (chromosome, position, position). Records without a statistic are dropped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    readStatFunction = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = readStatFunction(self.statFname)

    vcfReader = VCFFile(inputFname=self.inputFname)

    vcfWriter = VCFFile(outputFname=self.outputFname, openMode='w')
    vcfWriter.metaInfoLs = vcfReader.metaInfoLs
    vcfWriter.header = vcfReader.header
    vcfWriter.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for record in vcfReader:    #assuming input VCF is sorted
        noOfRecords += 1
        locusID = (record.chromosome, record.position, record.position)
        stat = locusID2Stat.get(locusID)
        if stat is None:
            continue

        belowMinimum = self.minValue is not None and stat < self.minValue
        aboveMaximum = self.maxValue is not None and stat > self.maxValue
        if not (belowMinimum or aboveMaximum):
            noOfRetained += 1
            vcfWriter.writeVCFRecord(record)

    vcfReader.close()
    vcfWriter.close()

    # -1 signals "no records at all"
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def run(self):
    """
    Keep only the VCF records whose liftover map p-value is greater than
    self.minLiftOverMapPvalue.

    P-values come from self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname)
    and are looked up by (chromosome, position, position). Records without a
    p-value are dropped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
        self.liftOverLocusMapPvalueFname)

    vcfReader = VCFFile(inputFname=self.inputFname)

    vcfWriter = VCFFile(outputFname=self.outputFname, openMode='w')
    vcfWriter.metaInfoLs = vcfReader.metaInfoLs
    vcfWriter.header = vcfReader.header
    vcfWriter.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for record in vcfReader:    #assuming input VCF is sorted
        noOfRecords += 1
        locusID = (record.chromosome, record.position, record.position)
        mapPvalue = locusNewID2mapPvalue.get(locusID)
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRetained += 1
            vcfWriter.writeVCFRecord(record)

    vcfReader.close()
    vcfWriter.close()

    # -1 signals "no records at all"
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def openOneInputFile(self, inputFname=None):
    """
    2013.09.05
        Open inputFname with the reader class selected by self.inputFileFormat
        and return the reader.

        inputFileFormat
            2: YHFile (table self.h5TableName)
            3: HDF5MatrixFile
            4: VCFFile
            anything else: MatrixFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat==2:    #2012.12.20
        return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
    if fileFormat==3:    #2012.11.22
        return HDF5MatrixFile(inputFname, openMode='r')
    if fileFormat==4:
        return VCFFile(inputFname=inputFname)
    return MatrixFile(inputFname)
def setup(self, **keywords):
    """
    2012.10.15
        Run before anything else is run: open the output VCF (copying meta
        info and header from self.originalVCFFname) and load every Beagle
        input file, indexing them by sample ID in self.sampleID2BeagleFile.
    """
    #2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called,
    #    because the output file is opened differently here.
    vcfWriter = VCFFile(outputFname=self.outputFname, openMode='w')
    self.writer = vcfWriter
    vcfReader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
    self.reader = vcfReader
    vcfWriter.metaInfoLs = vcfReader.metaInfoLs
    vcfWriter.header = vcfReader.header
    vcfWriter.writeMetaAndHeader()

    # read all the Beagle files; every sample points to the file that contains it
    sampleID2BeagleFile = {}
    for beagleFname in self.inputFnameLs:
        beagleFile = BeagleGenotypeFile(inputFname=beagleFname)
        beagleFile.readInAllHaplotypes()
        sampleID2BeagleFile.update(
            (sampleID, beagleFile) for sampleID in beagleFile.sampleIDList)
    self.sampleID2BeagleFile = sampleID2BeagleFile
def run(self):
    """
    Output only the VCF records whose (chromosome, position) occurs exactly
    once in the input file.

    The input is read twice: first to count the records per position
    (readInSNPID2GenotypeVectorLs with returnType=2), then to copy the
    records that have a count of 1.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    positionData = self.readInSNPID2GenotypeVectorLs(
        self.inputFname, returnType=2)
    snp_pos2count = positionData.snp_pos2returnData

    vcfReader = VCFFile(inputFname=self.inputFname)

    vcfWriter = VCFFile(outputFname=self.outputFname, openMode='w')
    vcfWriter.metaInfoLs = vcfReader.metaInfoLs
    vcfWriter.header = vcfReader.header
    vcfWriter.writeMetaAndHeader()

    noOfRecords = 0
    noOfUniqueRecords = 0
    for record in vcfReader:    #assuming input VCF is sorted
        noOfRecords += 1
        locus = (record.chromosome, record.position)
        if snp_pos2count.get(locus) == 1:
            vcfWriter.writeVCFRecord(record)
            noOfUniqueRecords += 1

    vcfReader.close()
    vcfWriter.close()

    fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
        (noOfUniqueRecords, noOfRecords, fraction))
def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
    outputFormatType=1, alleleLength=1):
    """
    2013.09.03 added argument alleleLength
    2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
    2012.10.8
        For each locus of a VCF file, fetch the reference sequence that spans
        the locus plus flankingLength bases on either side and write it to
        outputFname.

        alleleLength
            if it evaluates to True, loci whose ref or alt allele is not
            exactly this long are skipped.
        outputFormatType
            1: fasta. Anything else: fastq, with a constant quality 'H' for every base.

        The sequence title is chr_start_stop_ref_alt_positionInFlankN, where N
        is the 1-based offset of the locus start inside the extracted
        sequence. N equals flankingLength+1, unless the left flank had to be
        truncated at the start of the chromosome.
    """
    sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
        (inputFname, refFastaFname, alleleLength, outputFormatType))
    vcfFile = VCFFile(inputFname=inputFname)
    refFastaFile = FastaFile(inputFname=refFastaFname)
    counter = 0
    real_counter = 0
    # the context manager guarantees the output is flushed and closed
    with open(outputFname, 'w') as outf:
        for vcfRecord in vcfFile:
            counter += 1
            if alleleLength and (len(vcfRecord.refBase)!=alleleLength or len(vcfRecord.altBase)!=alleleLength):
                continue

            refBase = vcfRecord.refBase
            stopPos = vcfRecord.pos + len(refBase) -1

            flankSeqStart = max(1, vcfRecord.pos-flankingLength)
            flankSeqStop = stopPos + flankingLength

            SNP_ID = '%s_%s_%s_%s_%s'%(vcfRecord.chr, vcfRecord.pos, stopPos, vcfRecord.refBase, vcfRecord.altBase)
            # positionInFlank is 1-based and derived from the actual flank start,
            #   so that it stays correct when the left flank is clamped at position 1.
            positionInFlank = vcfRecord.pos - flankSeqStart + 1
            fastaTitle = '%s_positionInFlank%s'%(SNP_ID, positionInFlank)
            flankingSequence = refFastaFile.getSequence(vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
            if not flankingSequence:
                continue

            # count only loci that are actually written out
            real_counter += 1
            if outputFormatType==1:
                outf.write(">%s\n"%(fastaTitle))
                outf.write('%s\n'%(flankingSequence))
            else:
                outf.write("@%s\n"%(fastaTitle))
                outf.write('%s\n'%(flankingSequence))
                outf.write("+\n")
                outf.write("%s\n"%('H'*len(flankingSequence)))

    vcfFile.close()
    refFastaFile.close()
    sys.stderr.write("%s loci (%s total) written out.\n"%(real_counter, counter))
def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
    """
    2012.5.8
        Parse a whole VCF file and hand its genotype call matrix, together
        with the sample and locus indices, to self.outputCallMatrix().

    #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
        need a conversion in between
    """
    parsedVCF = VCFFile(inputFname=inputFname)
    parsedVCF.parseFile()

    sampleID2columnIndex = parsedVCF.sample_id2index
    locusID2rowIndex = parsedVCF.locus_id2row_index
    callMatrix = parsedVCF.genotype_call_matrix

    self.outputCallMatrix(
        callMatrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=sampleID2columnIndex,
        locus_id2row_index=locusID2rowIndex,
        outputDelimiter=self.outputDelimiter)
def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
    defaultNullFrequency=-0, **keywords):
    """
    2012.10.5
        Output the alternative allele frequency of every locus, one column
        per input VCF file.

        inputFnameLs
            list of VCF files; each is queried through
            VCFFile.getLocus2AlternativeAlleleFrequency().
        inputHeaderLs
            list of column headers, expected to be in the same order as inputFnameLs.
        defaultNullFrequency
            the value output for a locus that is absent from an input file.

        The output is tab-delimited, sorted by locus ID. The first column is
        the locus ID (its components joined by '_'), the last column
        ('count') is the constant 1.
    """
    sys.stderr.write("Getting allele frequency from %s input ..." %
        (len(inputFnameLs)))

    #get locus2AF from inputFname
    locus2frequencyList = []
    locus_id_set = set()
    for inputFname in inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
        vcfFile.close()
        locus2frequencyList.append(locus2frequency)
        # grow the set in place, rather than copying it for every input file
        locus_id_set.update(locus2frequency.keys())
    sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

    sys.stderr.write(
        "Outputting frequency collected from all input to %s ..." %
        (outputFname))
    #output them in juxtaposition
    # the context manager guarantees the output is flushed and closed
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['locusID'] + inputHeaderLs + ['count'])
        for locus_id in sorted(locus_id_set):
            data_row = ['_'.join(map(str, locus_id))]
            for locus2frequency in locus2frequencyList:
                data_row.append(
                    locus2frequency.get(locus_id, defaultNullFrequency))
            data_row.append(1)
            writer.writerow(data_row)
    sys.stderr.write("\n")
def setup(self, **keywords):
    """
    2012.10.15
        run before anything is run

        Steps:
            1. call AbstractMatrixFileWalker.setup().
            2. read the kinship adjacency list (self.pedigreeKinshipFilePath) into self.ibdData.
            3. read the per-individual alignment coverage (self.individualAlignmentCoverageFname).
            4. collect the sample IDs of all input VCF files.
            5. restrict the pedigree (self.pedigreeFname) to those samples, and record
                family size, in/out degree and coverage for every member of each
                connected component of the pedigree graph.
            6. assign each individual a score (stored as "probability mass").

        Sets self.ibdData, self.individualID2probabilityMass and self.individualID2HaplotypeData.
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')

    #read in the IBD check result
    # headerless input: column 0 = row ID, column 1 = column ID, column 2 = data
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
        rowIDHeader=None, colIDHeader=None, \
        rowIDIndex=0, colIDIndex=1, \
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    # key is column 0, value is column 1
    #    (presumably read group => list of coverage values, going by the name; confirm against MatrixFile.constructDictionary)
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
    # read all the input VCF files, only the sample IDs are taken
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        # NOTE(review): vcfFile is not closed in this loop.
        vcfFile = VCFFile(inputFname=inputFname)
        #vcfFile.readInAllHaplotypes()
        for individualID in vcfFile.getSampleIDList():
            # placeholder; the haplotype data itself is not loaded here
            individualID2HaplotypeData[individualID] = None
            #haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
            #individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
            #    locusIDList=vcfFile.locusIDList)
        # get all haplotypes , etc.
        # get all sample IDs
    sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))

    #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
    #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    #shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

    # each connected component of the undirected pedigree graph is treated as one family
    cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
    individualID2familyContext = {}
    # containers that accumulate values across all individuals/families, used for normalization below
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize= len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:    #assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            # degrees are taken from the directed (sample-restricted) graph, not from the undirected component
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            # familyCoverage is unknown until all members have been visited; filled in after this loop
            familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                individualCoverage=individualCoverage,\
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
        familyCoverageContainer.addOneValue(familyCoverage)
        #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
    sys.stderr.write("Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.iteritems():
        # NOTE(review): familySize is normalized with outDegreeContainer, which was filled with
        #    out_degree values. familyContext.out_degree looks like the intended argument,
        #    given the "no of offspring" comment above; confirm before changing.
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
        #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        # the score is stored as is; no further normalization happens in this method
        individualID2probabilityMass[individualID] = representativeImportanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
    familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
    minProbForValidCall=0.9, markersFile=None):
    """
    2013.05.03
        Walk through a VCF file and a Beagle likelihood file in lockstep (one
        Beagle locus is consumed per VCF record) and write one row per locus
        to each of the per-family-size Beagle output handlers.

        inputFname
            the VCF file.
        beagleLikelihoodFile
            an opened Beagle likelihood file. Its next() yields a locus object with
            markerID, alleleA, alleleB.
        familySize2BeagleFileHandler
            dictionary: family size => output handler with writerow().
        pedigreeFamilyData
            its familySize2SampleIDList attribute maps family size => list of sample IDs.
        minProbForValidCall
            for non-singletons, the minimum of the largest genotype likelihood
            for the Beagle call to be regarded as valid.
        markersFile
            optional output handler; gets one row (markerID, the part of markerID after
            the first ':', alleleA, alleleB) per locus.

        Singletons (familySize=1) are output as the three genotype likelihoods,
            row header = markerID, alleleA, alleleB.
        Samples of any other family size are output as two alleles taken from the VCF GT
            (or '?', '?'), row header = 'M', markerID.

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
    """
    sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
    counter = 0
    # the three counters below count calls (one per sample per locus), not families
    no_of_trios = 0
    no_of_duos = 0
    no_of_singletons = 0
    totalNoOfCalls = 0
    noOfCallsMarkedMissing = 0
    vcfFile = VCFFile(inputFname=inputFname)
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

    for vcfRecord in vcfFile:
        # advance the Beagle file by one locus for every VCF record
        #    (presumably both files list the same loci in the same order; nothing here verifies it)
        oneLocus = beagleLikelihoodFile.next()
        counter += 1
        familySize2CallList = {}
        # NOTE: genotypeLikelihoodList is not used in this method
        genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
        for familySize, sampleIDList in familySize2SampleIDList.iteritems(
        ):
            if familySize not in familySize2CallList:
                familySize2CallList[familySize] = []
            for sampleID in sampleIDList:
                totalNoOfCalls += 1
                vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                    sampleID)
                tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                    oneLocus=oneLocus, sampleID=sampleID)
                if familySize == 1:
                    # singletons keep the raw likelihoods
                    no_of_singletons += 1
                    familySize2CallList[familySize].extend(
                        tripleLikelihood)
                else:
                    if familySize == 2:
                        no_of_duos += 1
                    elif familySize == 3:
                        no_of_trios += 1
                    # the result of map() is indexed below, which relies on map() returning a list (Python 2)
                    tripleLikelihood = map(float, tripleLikelihood)
                    maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                    maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                    if maxLikelihood >= minProbForValidCall:
                        # index 0: alleleA/alleleA, 1: alleleA/alleleB, 2: alleleB/alleleB
                        if maxLikelihoodIndex == 0:
                            diploidCallFromBeagle = [
                                oneLocus.alleleA, oneLocus.alleleA
                            ]
                        elif maxLikelihoodIndex == 1:
                            diploidCallFromBeagle = [
                                oneLocus.alleleA, oneLocus.alleleB
                            ]
                        else:
                            diploidCallFromBeagle = [
                                oneLocus.alleleB, oneLocus.alleleB
                            ]
                    else:
                        # no genotype is likely enough
                        noOfCallsMarkedMissing += 1
                        diploidCallFromBeagle = ['?', '?']
                    #if vcfGenotypeCallData is None:    #DP is zero
                    #    sys.stderr.write("vcfGenotypeCallData for sample %s at locus %s, %s is None.\n"%\
                    #        (sampleID, vcfRecord.chr, vcfRecord.pos))
                    #    import pdb
                    #    pdb.set_trace()
                    # the Beagle call is only used for the concordance check; the alleles output come from the VCF GT
                    if vcfGenotypeCallData and self.checkConcordanceBetweenBeagleAndVCFCall(
                            vcfGenotypeCallData['GT'],
                            diploidCallFromBeagle):
                        diploidCall = [
                            vcfGenotypeCallData['GT'][0],
                            vcfGenotypeCallData['GT'][1]
                        ]
                    else:
                        # either no VCF call for this sample or the two calls are not concordant
                        diploidCall = ['?', '?']
                    familySize2CallList[familySize].extend(diploidCall)

        # one output row per family size for this locus
        for familySize, callList in familySize2CallList.iteritems():
            if familySize == 1:
                rowHeaderList = [
                    oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                ]
            else:
                rowHeaderList = ['M', oneLocus.markerID]
            beagleFileHandler = familySize2BeagleFileHandler[familySize]
            beagleFileHandler.writerow(rowHeaderList + callList)
        if markersFile is not None:
            # markerID is expected to contain ':' (e.g. Contig791:1086); the second field is output on its own
            markersFile.writerow([
                oneLocus.markerID,
                oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                oneLocus.alleleB
            ])
    vcfFile.close()
    sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
        (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))