Ejemplo n.º 1
0
	def run(self):
		"""
		Set the genotype of self.sampleID to missing at low-quality loci of a VCF.
		
		For every record of self.inputFname, the reads returned by
		alignmentFile.fetch() around the locus are summarized by
		self.returnLocusLowMapQualityIndicator(). A locus counts as low-quality
		when the sum of its out-of-depth indicator (read count outside
		[minDepth, maxDepth]) and its low-mapping-quality indicator is positive;
		the genotype of self.sampleID is then replaced by "./.".
		
		Every record, modified or not, is written to self.outputFname. One row
		of statistics per locus is written to self.missingStatFname. A summary
		line goes to stderr at the end (fraction is -1 if the input is empty).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		# NOTE(review): under Python 2 this is floor division if both operands are ints - confirm that is intended.
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)	#start and end in fetch() are 0-based.
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			# the sum doubles as the 'missingReason' column: 0 means the locus is fine
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		# the alignment file used to be left open
		alignmentFile.close()
		statWriter.close()
		writer.close()
		# guard against an empty input VCF, same convention (-1) as the other run() methods
		if counter > 0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
Ejemplo n.º 2
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Copy a VCF file, leaving out loci that are too close to a neighbor.

        A record is written only if its distance to both the preceding and the
        following record on the same chromosome is at least
        minNeighborDistance. Records are compared with their immediate
        neighbors only, so the input is assumed to be in chromosomal order.

        The summary on stderr reports how many loci were dropped; the number
        of retained loci is counted at the point of writing.

        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key,
            not chr_pos; need a conversion in between.
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        # NOTE(review): other callers open the output with openMode='w';
        # confirm VCFFile opens for writing when only outputFname is given.
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        previousVCFRecord = None
        # whether the previous record is too close to the record before it
        previousVCFRecordIsBad = False
        counter = 0
        noOfLociAfterFilter = 0
        for vcfRecord in vcfFile:
            if previousVCFRecord is not None:
                # records on different chromosomes are never "too close"
                tooClose = (previousVCFRecord.chr == vcfRecord.chr
                            and abs(vcfRecord.pos - previousVCFRecord.pos) <
                            minNeighborDistance)
                if not tooClose and not previousVCFRecordIsBad:
                    # previous record is far enough from both of its neighbors
                    outVCFFile.writeVCFRecord(previousVCFRecord)
                    noOfLociAfterFilter += 1
                # the current record inherits the verdict of this comparison
                previousVCFRecordIsBad = tooClose

            previousVCFRecord = vcfRecord
            counter += 1
        vcfFile.close()

        # handle the last record: it has no following neighbor
        if previousVCFRecord is not None and not previousVCFRecordIsBad:
            outVCFFile.writeVCFRecord(previousVCFRecord)
            noOfLociAfterFilter += 1
        outVCFFile.close()

        delta = counter - noOfLociAfterFilter
        if counter > 0:
            fraction = delta / float(counter)
        else:
            fraction = 0.0
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (delta, counter, noOfLociAfterFilter, fraction * 100))
Ejemplo n.º 3
0
    def run(self):
        """
        Rewrite the coordinates of VCF records according to a coordinate map.

        The map returned by self.readInCoordinateMap() is keyed by
        (chromosome, position). Records without an entry are dropped; records
        with more than one new coordinate are dropped and counted. For the
        rest, chromosome and position are replaced, and the ref/alt alleles
        are taken from the map entry (reverse-complemented when its strand
        is '-'). A summary goes to stderr (fraction is -1 for empty input).
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous mapping: discard rather than pick one
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # NOTE(review): these are Seq objects while the '+' branch
                # passes the map's values unchanged - confirm the setters
                # accept both.
                newRefBase = Seq(
                    newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(
                    newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # guard against an empty input VCF, same convention (-1) as the other run() methods
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
Ejemplo n.º 4
0
	def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
		"""
		Index the records of a VCF file by (chromosome, position).
		
		returnType
			1: each key maps to a list holding data_row[1:] of every record
				at that position (data_row[0] is the reference)
			anything else: each key maps to the number of records at that position
		
		Returns a PassingData object whose snp_pos2returnData attribute is
		the dictionary described above.
		
		2013.07.19 bugfix
		2013.07.11
		"""
		sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
		
		collectGenotypes = returnType==1
		snp_pos2returnData = {}
		noOfRecords = 0
		noOfRepeatedPositions = 0
		reader = VCFFile(inputFname=inputFname)
		for vcfRecord in reader:
			snpPosition = (vcfRecord.chromosome, vcfRecord.position)
			if snpPosition in snp_pos2returnData:
				# this position has been seen in an earlier record
				noOfRepeatedPositions += 1
			
			if collectGenotypes:
				snp_pos2returnData.setdefault(snpPosition, []).append(vcfRecord.data_row[1:])	#[0] is reference
			else:
				snp_pos2returnData[snpPosition] = snp_pos2returnData.get(snpPosition, 0) + 1
			noOfRecords += 1
		reader.close()
		
		summary = (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions)
		sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%summary)
		return PassingData(snp_pos2returnData=snp_pos2returnData)
Ejemplo n.º 5
0
    def calculateSiteGap(self,
                         inputFname,
                         outputFname,
                         chromosome=None,
                         chrLength=None,
                         minDepth=1):
        """
        Write the distance from each site of a VCF file to the next site.

        outputFname receives a tab-delimited table with the columns
        chromosome, position, length, distanceToNextSite. One row is written
        for every pair of consecutive records on the same chromosome; the row
        describes the first record of the pair. The last record of each
        chromosome has no next site and produces no row.

        Arguments:
            inputFname: path of the VCF file.
            outputFname: path of the table to write.
            chromosome: not used; kept so existing callers keep working.
            chrLength: copied unchanged into the 'length' column.
            minDepth: passed on to VCFFile.

        2011-11-2
        """
        sys.stderr.write("Calculate the distances between sites of %s .\n" %
                         (inputFname))
        outf = open(outputFname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(
                ['chromosome', 'position', 'length', "distanceToNextSite"])
            vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
            try:
                previousChromosome = None
                previousPosition = None
                for vcfRecord in vcfFile.parseIter():
                    currentChromosome = vcfRecord.chr
                    currentPosition = int(vcfRecord.pos)
                    # a distance is only meaningful within one chromosome
                    if (previousPosition is not None
                            and currentChromosome == previousChromosome):
                        writer.writerow([
                            previousChromosome, previousPosition, chrLength,
                            currentPosition - previousPosition
                        ])
                    previousChromosome = currentChromosome
                    previousPosition = currentPosition
            finally:
                vcfFile.close()
        finally:
            # close explicitly so buffered rows are flushed to disk
            outf.close()
        sys.stderr.write("Done.\n")
Ejemplo n.º 6
0
    def run(self):
        """
        Keep the VCF records whose per-locus statistic lies within bounds.

        The statistics are loaded from self.statFname by the function that
        self.getLocusID2StatFunctionDict holds for self.runType, and are
        looked up by (chromosome, position, position). A record is written
        to self.outputFname when it has a statistic that is neither below
        self.minValue nor above self.maxValue (a bound of None is not
        applied). A summary is written to stderr; the fraction is -1 when
        the input holds no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        loadStatFunction = self.getLocusID2StatFunctionDict[self.runType]
        locusID2Stat = loadStatFunction(self.statFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetainedRecords = 0
        for vcfRecord in reader:  # assuming input VCF is sorted
            noOfRecords += 1
            stat = locusID2Stat.get((vcfRecord.chromosome, vcfRecord.position,
                                     vcfRecord.position))
            if stat is None:
                # no statistic available for this locus
                continue
            if self.minValue is not None and stat < self.minValue:
                continue
            if self.maxValue is not None and stat > self.maxValue:
                continue
            noOfRetainedRecords += 1
            writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        fraction = -1
        if noOfRecords > 0:
            fraction = noOfRetainedRecords / float(noOfRecords)
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetainedRecords, noOfRecords, fraction))
Ejemplo n.º 7
0
    def run(self):
        """
        Keep the VCF records whose lift-over map p-value is high enough.

        The p-values come from self.getLocusNewID2mapPvalue() applied to
        self.liftOverLocusMapPvalueFname and are looked up by
        (chromosome, position, position). A record is written to
        self.outputFname only when it has a p-value and that value is
        greater than self.minLiftOverMapPvalue. A summary is written to
        stderr; the fraction is -1 when the input holds no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetainedRecords = 0
        for vcfRecord in reader:  # assuming input VCF is sorted
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(locusKey)
            # loci without a p-value are dropped as well
            if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
                noOfRetainedRecords += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        fraction = -1
        if noOfRecords > 0:
            fraction = noOfRetainedRecords / float(noOfRecords)
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetainedRecords, noOfRecords, fraction))
	def openOneInputFile(self, inputFname=None):
		"""
		Open inputFname with the reader class selected by self.inputFileFormat.
		
		inputFileFormat
			2: YHFile (read mode, table self.h5TableName)
			3: HDF5MatrixFile (read mode)
			4: VCFFile
			anything else: MatrixFile
		
		Returns the opened reader object.
		
		2013.09.05 split out of fileWalker() , added VCFFile
		"""
		fileFormat = self.inputFileFormat
		if fileFormat==2:	#2012.12.20
			return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
		if fileFormat==3:	#2012.11.22
			return HDF5MatrixFile(inputFname, openMode='r')
		if fileFormat==4:
			return VCFFile(inputFname=inputFname)
		# every other format value falls back to the plain matrix reader
		return MatrixFile(inputFname)
    def setup(self, **keywords):
        """
        Open the output and original VCF files and index Beagle files by sample.

        The output VCF (self.writer) gets the meta info and header of the
        original VCF (self.reader) and both are written out immediately.
        Every file in self.inputFnameLs is then opened as a
        BeagleGenotypeFile, all its haplotypes are read in, and
        self.sampleID2BeagleFile maps each of its sample IDs to that file
        object. A sample ID present in several files maps to the last one.

        2012.10.15
            run before anything is run
        """
        # 2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not
        # called, so that the output file can be opened as a VCF here.
        vcfWriter = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer = vcfWriter
        vcfReader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.reader = vcfReader
        vcfWriter.metaInfoLs = vcfReader.metaInfoLs
        vcfWriter.header = vcfReader.header
        vcfWriter.writeMetaAndHeader()

        # read all the Beagle files
        beagleFileBySampleID = {}
        for beagleFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(inputFname=beagleFname)
            beagleFile.readInAllHaplotypes()
            beagleFileBySampleID.update(
                (sampleID, beagleFile) for sampleID in beagleFile.sampleIDList)
        self.sampleID2BeagleFile = beagleFileBySampleID
Ejemplo n.º 10
0
    def run(self):
        """
        Keep only the VCF records whose position occurs once in the input.

        The input is read twice: first by
        self.readInSNPID2GenotypeVectorLs(returnType=2) to count the records
        at each (chromosome, position), then to copy every record whose count
        is exactly 1 to self.outputFname. A summary is written to stderr; the
        fraction is 0 when the input holds no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        countData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                      returnType=2)
        snp_pos2count = countData.snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        for vcfRecord in reader:  # assuming input VCF is sorted
            noOfRecords += 1
            snpPosition = (vcfRecord.chromosome, vcfRecord.position)
            if snp_pos2count.get(snpPosition) == 1:
                writer.writeVCFRecord(vcfRecord)
                noOfUniqueRecords += 1

        reader.close()
        writer.close()

        fraction = 0
        if noOfRecords > 0:
            fraction = noOfUniqueRecords / float(noOfRecords)
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))
	def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
							outputFormatType=1, alleleLength=1):
		"""
		Write the reference sequence around each locus of a VCF file.
		
		For every record whose ref and alt alleles both have length
		alleleLength (no length check if alleleLength is false), the sequence
		from flankingLength bases before the locus (not before position 1) to
		flankingLength bases after the end of the ref allele is fetched from
		refFastaFname and written to outputFname. Loci for which no sequence
		is returned are skipped.
		
		The sequence title is chr_start_stop_ref_alt_positionInFlankN, where N
		is the 1-based position of the locus inside the written sequence. N is
		flankingLength+1 unless the left flank was cut short at position 1.
		
		outputFormatType
			1: fasta
			anything else: fastq, with 'H' as the quality of every base
		
		2013.09.03 added argument alleleLength
		2012.10.10
			added argument outputFormatType. 1: fasta, 2: fastq
		2012.10.8
		"""
		sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
						(inputFname, refFastaFname, alleleLength, outputFormatType))
		vcfFile = VCFFile(inputFname=inputFname)
		outf = open(outputFname, 'w')
		refFastaFile = FastaFile(inputFname=refFastaFname)
		
		counter = 0
		real_counter = 0
		try:
			for vcfRecord in vcfFile:
				counter += 1
				if alleleLength and (len(vcfRecord.refBase)!=alleleLength or len(vcfRecord.altBase)!=alleleLength):
					continue
				
				refBase = vcfRecord.refBase
				stopPos = vcfRecord.pos + len(refBase) -1
				flankSeqStart = max(1, vcfRecord.pos-flankingLength)
				flankSeqStop = stopPos + flankingLength
				
				SNP_ID = '%s_%s_%s_%s_%s'%(vcfRecord.chr, vcfRecord.pos, stopPos, vcfRecord.refBase, vcfRecord.altBase)
				#positionInFlank is 1-based; derive it from the real start, which is clamped near the chromosome start
				positionInFlank = vcfRecord.pos - flankSeqStart + 1
				fastaTitle = '%s_positionInFlank%s'%(SNP_ID, positionInFlank)
				flankingSequence = refFastaFile.getSequence(vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
				if not flankingSequence:
					continue
				
				#count only loci that are actually written
				real_counter += 1
				if outputFormatType==1:
					outf.write(">%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
				else:
					outf.write("@%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
					outf.write("+\n")
					outf.write("%s\n"%('H'*len(flankingSequence)))
		finally:
			#close explicitly so buffered output is flushed to disk
			outf.close()
			vcfFile.close()
			refFastaFile.close()
		sys.stderr.write("%s loci (%s total) written out.\n"%(real_counter, counter))
Ejemplo n.º 12
0
    def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
        """
        Parse a whole VCF file and hand its call matrix to outputCallMatrix().

        The genotype call matrix, the sample-to-column index and the
        locus-to-row index of the parsed file are passed on, together with
        outputFname and self.outputDelimiter.

        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key,
            not chr_pos; need a conversion in between.
        2012.5.8
        """
        parsedVCF = VCFFile(inputFname=inputFname)
        parsedVCF.parseFile()

        self.outputCallMatrix(
            parsedVCF.genotype_call_matrix,
            refFastaFname=None,
            outputFname=outputFname,
            refNameSet=None,
            read_group2col_index=parsedVCF.sample_id2index,
            locus_id2row_index=parsedVCF.locus_id2row_index,
            outputDelimiter=self.outputDelimiter)
Ejemplo n.º 13
0
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
             defaultNullFrequency=-0, **keywords):
        """
        Write the allele frequencies of several VCF files side by side.

        Each file in inputFnameLs contributes the dictionary returned by
        VCFFile.getLocus2AlternativeAlleleFrequency(). outputFname receives a
        tab-delimited table with one row per locus found in any input, sorted
        by locus ID. The columns are 'locusID' (the parts of the locus ID
        joined by '_'), one frequency column per input file titled by
        inputHeaderLs, and 'count', which is always 1.

        Arguments:
            inputFnameLs: paths of the VCF files.
            inputHeaderLs: column titles, one per input file.
            outputFname: path of the table to write.
            defaultNullFrequency: value used where a locus is absent from
                an input file.

        2012.10.5
        """
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        #get locus2AF from inputFname
        locus2frequencyList = []
        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            # grow the set in place instead of copying it for every input
            locus_id_set.update(locus2frequency.keys())
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        #output them in juxtaposition
        outf = open(outputFname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(['locusID'] + list(inputHeaderLs) + ['count'])

            for locus_id in sorted(locus_id_set):
                data_row = ['_'.join(map(str, locus_id))]
                for locus2frequency in locus2frequencyList:
                    data_row.append(
                        locus2frequency.get(locus_id, defaultNullFrequency))
                data_row.append(1)
                writer.writerow(data_row)
        finally:
            # close explicitly so buffered rows are flushed to disk
            outf.close()
        sys.stderr.write("\n")
Ejemplo n.º 14
0
	def setup(self, **keywords):
		"""
		Assign a probability mass to every individual found in the VCF input.
		
		Steps:
			1. run AbstractMatrixFileWalker.setup().
			2. read the kinship data from self.pedigreeKinshipFilePath into self.ibdData.
			3. read the per-read-group coverage from self.individualAlignmentCoverageFname.
			4. collect the sample IDs of all files in self.inputFnameLs.
			5. restrict the pedigree graph of self.pedigreeFname to these samples,
				split it into connected components (families) and record
				family size, in/out degree and coverage for each member.
			6. sum the normalized values computed in the last loop into
				self.individualID2probabilityMass.
		
		Sets self.ibdData, self.individualID2probabilityMass and
		self.individualID2HaplotypeData (sample ID -> None; haplotypes are
		not loaded here).
		
		2012.10.15
			run before anything is run
		"""
		AbstractMatrixFileWalker.setup(self, **keywords)
		#self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')
		
		#read in the IBD check result
		self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
								rowIDHeader=None, colIDHeader=None, \
								rowIDIndex=0, colIDIndex=1, \
								dataHeader=None, dataIndex=2, hasHeader=False)
		
		#. read in the alignment coverage data
		alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
		alignmentCoverageFile.constructColName2IndexFromHeader()
		alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
		alignmentCoverageFile.close()
		
		sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
		# read the sample IDs of all the VCF files
		individualID2HaplotypeData = {}
		for inputFname in self.inputFnameLs:
			vcfFile = VCFFile(inputFname=inputFname)
			#vcfFile.readInAllHaplotypes()
			for individualID in vcfFile.getSampleIDList():
				individualID2HaplotypeData[individualID] = None
				#haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
				#individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
				#													locusIDList=vcfFile.locusIDList)
			# only the sample IDs are needed; close the file so handles do not pile up
			vcfFile.close()
		sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))
		
		#. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
		#. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
		sys.stderr.write("Constructing individualID2pedigreeContext ...")
		plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
		pGraph = plinkPedigreeFile.pedigreeGraph
		#shrink the graph to only individuals with data
		pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
		
		# NOTE(review): connected_component_subgraphs() is not available in recent networkx releases - confirm the pinned version.
		cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
		individualID2familyContext = {}
		outDegreeContainer = NumberContainer(minValue=0)
		familySizeContainer = NumberContainer(minValue=0)
		individualCoverageContainer = NumberContainer(minValue=0)
		familyCoverageContainer = NumberContainer(minValue=0)
		for cc_subgraph in cc_subgraph_list:
			familySize= len(cc_subgraph)
			familySizeContainer.addOneValue(familySize)
			
			familyCoverage = 0
			for n in cc_subgraph:	#assuming each family is a two-generation trio/nuclear family
				individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
				individualCoverage = float(individualCoverage)
				individualCoverageContainer.addOneValue(individualCoverage)
				familyCoverage += individualCoverage
				# degrees are taken from the directed pedigree graph, not the undirected component
				in_degree = pGraph.in_degree(n)
				out_degree = pGraph.out_degree(n)
				outDegreeContainer.addOneValue(out_degree)
				familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
										individualCoverage=individualCoverage,\
										familyCoverage=None)
				if n not in individualID2familyContext:
					individualID2familyContext[n] = familyContext
				else:
					sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
			familyCoverageContainer.addOneValue(familyCoverage)
			#set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
			for n in cc_subgraph:
				individualID2familyContext[n].familyCoverage = familyCoverage
		plinkPedigreeFile.close()
		sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))
		
		
		# weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
		sys.stderr.write("Weighing each individual , assigning probability mass  ...")
		individualID2probabilityMass = {}
		for individualID, familyContext in individualID2familyContext.iteritems():
			# NOTE(review): familySize is normalized with the out-degree container although the
			# comment above speaks of the number of offspring (out_degree) - confirm which is intended.
			outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
			individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
			#familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
			importanceScore = outDegreeQuotient + individualCoverageQuotient
			representativeImportanceScore = importanceScore
			individualID2probabilityMass[individualID] = representativeImportanceScore
		sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))
		
		self.individualID2probabilityMass = individualID2probabilityMass
		self.individualID2HaplotypeData = individualID2HaplotypeData
Ejemplo n.º 15
0
    def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
         familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
         minProbForValidCall=0.9, markersFile=None):
        """
        Split the calls of a VCF file into one Beagle output per family size.

        The VCF file and beagleLikelihoodFile are read in lockstep: one locus
        is pulled from beagleLikelihoodFile for every VCF record, so both are
        assumed to hold the same loci in the same order (TODO confirm).

        For each locus and each family size in
        pedigreeFamilyData.familySize2SampleIDList one row is written to
        familySize2BeagleFileHandler[familySize]:
            family size 1: markerID, alleleA, alleleB, followed by the three
                genotype likelihoods of every sample, unmodified.
            other sizes: 'M', markerID, followed by two alleles per sample.
                The alleles are the VCF genotype if it is concordant (per
                checkConcordanceBetweenBeagleAndVCFCall) with the most likely
                Beagle genotype; otherwise '?', '?'. The Beagle genotype
                itself is treated as missing when its likelihood is below
                minProbForValidCall.

        If markersFile is given, it receives one row per locus: markerID, the
        part of markerID after the first ':', alleleA, alleleB.

        no_of_singletons, no_of_duos and no_of_trios count calls (sample x
        locus), not families.

        2013.05.03

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
            (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        counter = 0
        no_of_trios = 0
        no_of_duos = 0
        no_of_singletons = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        vcfFile = VCFFile(inputFname=inputFname)
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

        for vcfRecord in vcfFile:
            # advance the likelihood file by one locus for every VCF record
            oneLocus = beagleLikelihoodFile.next()
            counter += 1
            # family size -> flat list of output values for this locus
            familySize2CallList = {}
            genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
            for familySize, sampleIDList in familySize2SampleIDList.iteritems(
            ):
                if familySize not in familySize2CallList:
                    familySize2CallList[familySize] = []
                for sampleID in sampleIDList:
                    totalNoOfCalls += 1
                    vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                        sampleID)
                    tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                        oneLocus=oneLocus, sampleID=sampleID)
                    if familySize == 1:
                        # singletons keep their likelihoods; no genotype is called
                        no_of_singletons += 1
                        familySize2CallList[familySize].extend(
                            tripleLikelihood)
                    else:
                        if familySize == 2:
                            no_of_duos += 1
                        elif familySize == 3:
                            no_of_trios += 1
                        tripleLikelihood = map(float, tripleLikelihood)
                        maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                        maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                        if maxLikelihood >= minProbForValidCall:
                            # index 0, 1, 2 corresponds to genotype AA, AB, BB
                            if maxLikelihoodIndex == 0:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleA
                                ]
                            elif maxLikelihoodIndex == 1:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleB
                                ]
                            else:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleB, oneLocus.alleleB
                                ]
                        else:
                            noOfCallsMarkedMissing += 1
                            diploidCallFromBeagle = ['?', '?']
                        # vcfGenotypeCallData may be empty/None (an earlier debugging
                        # note here attributed this to DP being zero); such calls end
                        # up as missing below.
                        if vcfGenotypeCallData and self.checkConcordanceBetweenBeagleAndVCFCall(
                                vcfGenotypeCallData['GT'],
                                diploidCallFromBeagle):
                            # output the VCF genotype (first two characters of GT),
                            # not the Beagle-derived call
                            diploidCall = [
                                vcfGenotypeCallData['GT'][0],
                                vcfGenotypeCallData['GT'][1]
                            ]
                        else:
                            diploidCall = ['?', '?']
                        familySize2CallList[familySize].extend(diploidCall)

            for familySize, callList in familySize2CallList.iteritems():
                if familySize == 1:
                    # likelihood-format row header
                    rowHeaderList = [
                        oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                    ]
                else:
                    # genotype-format row header, as in the docstring example
                    rowHeaderList = ['M', oneLocus.markerID]
                beagleFileHandler = familySize2BeagleFileHandler[familySize]

                beagleFileHandler.writerow(rowHeaderList + callList)
            if markersFile is not None:
                # presumably markerID looks like 'chromosome:position' - TODO confirm
                markersFile.writerow([
                    oneLocus.markerID,
                    oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                    oneLocus.alleleB
                ])
        vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))