Exemple #1
0
	def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
		"""
		Index the records of a VCF file by their (chromosome, position) coordinate.
		
		inputFname
			path handed to VCFFile for reading.
		returnType
			1: each coordinate maps to a list with one entry per record found at that
				coordinate; the entry is vcfRecord.data_row[1:] (data_row[0] is the reference).
			any other value (callers in this file pass 2): each coordinate maps to the
				number of records found at that coordinate.
		
		Returns a PassingData object whose attribute snp_pos2returnData is that dictionary.
		
		2013.07.19 bugfix
		2013.07.11
		"""
		sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
		
		collectGenotypeVectors = (returnType==1)
		snp_pos2returnData = {}
		noOfRecords = 0
		noOfRepeatedPositions = 0	#records whose coordinate was already seen
		
		reader = VCFFile(inputFname=inputFname)
		for vcfRecord in reader:
			coordinate = (vcfRecord.chromosome, vcfRecord.position)
			if coordinate in snp_pos2returnData:
				noOfRepeatedPositions += 1
			
			if collectGenotypeVectors:
				#[0] is reference
				snp_pos2returnData.setdefault(coordinate, []).append(vcfRecord.data_row[1:])
			else:
				snp_pos2returnData[coordinate] = snp_pos2returnData.get(coordinate, 0) + 1
			noOfRecords += 1
		reader.close()
		
		summary = (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions)
		sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%summary)
		return PassingData(snp_pos2returnData=snp_pos2returnData)
Exemple #2
0
	def run(self):
		"""
		Set the genotype of self.sampleID to missing ("./.") at every locus whose
		alignment looks unreliable, and copy all records to the output VCF.
		
		For each record of self.inputFname:
			1. reads around the locus are fetched from self.alignmentFilename,
			2. self.returnLocusLowMapQualityIndicator() supplies the low-mapping-quality
				indicator, fraction of good reads, median mapQ and total number of reads,
			3. the locus is flagged as out-of-depth if the number of reads is outside
				[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold],
			4. one row is written to self.missingStatFname; its 'missingReason' column is the
				sum of the two indicators (0 means the locus is fine),
			5. if that sum is >0, the genotype of self.sampleID is set to "./.",
			6. the (possibly modified) record is written to self.outputFname.
		
		All file handles, including the alignment file, are closed even if the loop fails.
		An empty input VCF reports a fraction of -1 instead of raising ZeroDivisionError.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		#NOTE(review): if both operands are ints this is floor division under Python 2 -- confirm intended.
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		try:
			for vcfRecord in reader:
				locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
				#start and end in fetch() are 0-based.
				alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
				locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
													minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
				locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
				depth = locusLowMapQData.totalNoOfReads
				if depth>=minDepth and depth <=maxDepth:
					locusOutOfDepthIndicator = 0 	#good
				else:
					locusOutOfDepthIndicator = 1
				
				locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
				data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
							1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
							locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
				statWriter.writerow(data_row)
				if locusLowQualityIndicator>0:
					real_counter += 1
					#modify the VCF record
					#get sample ID column, then set its genotype missing
					vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
				#2014.1.4 output VCF record
				writer.writeVCFRecord(vcfRecord)
				counter += 1
		finally:
			reader.close()
			alignmentFile.close()	#previously never closed
			statWriter.close()
			writer.close()
		
		#same convention as the other run() methods: -1 when there is nothing to divide by
		if counter > 0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
Exemple #3
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Copy a VCF file, leaving out every SNP that has a neighbor on the same
        chromosome closer than minNeighborDistance.

        inputFname: VCF file to read. Loci are assumed to be in chromosomal order.
        outputFname: VCF file to write; receives the meta info and header of the input.
        minNeighborDistance: two consecutive records on the same chromosome whose
            positions differ by less than this are both dropped.
        keywords: accepted and ignored.

        A record can only be written once its successor has been seen, so the loop
        always decides the fate of the previous record, and the very last record
        is handled after the loop.

        #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
            need a conversion in between
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        # openMode='w' matches every other place in this file that creates an output VCFFile
        outVCFFile = VCFFile(outputFname=outputFname, openMode='w')
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        previousVCFRecord = None
        # whether the previous record is too close to the record before it
        previousVCFRecordIsBad = False
        counter = 0
        # counted here rather than read from outVCFFile after it has been closed
        noOfLociAfterFilter = 0
        for vcfRecord in vcfFile:
            tooCloseToPrevious = (
                previousVCFRecord is not None
                and previousVCFRecord.chr == vcfRecord.chr
                and abs(vcfRecord.pos - previousVCFRecord.pos) < minNeighborDistance)
            if tooCloseToPrevious:
                # drops the previous record and marks the current one as bad
                previousVCFRecordIsBad = True
            else:
                # the current record is far enough away (or on another chromosome):
                # the previous record survives unless it was too close to its own predecessor
                if previousVCFRecord is not None and not previousVCFRecordIsBad:
                    outVCFFile.writeVCFRecord(previousVCFRecord)
                    noOfLociAfterFilter += 1
                previousVCFRecordIsBad = False

            previousVCFRecord = vcfRecord
            counter += 1
        vcfFile.close()

        # handle the last record
        if previousVCFRecord is not None and not previousVCFRecordIsBad:
            outVCFFile.writeVCFRecord(previousVCFRecord)
            noOfLociAfterFilter += 1
        outVCFFile.close()

        delta = counter - noOfLociAfterFilter
        if counter > 0:
            fraction = delta / float(counter)
        else:
            fraction = 0.0  # -0.0 used to be printed as "-0.00%"
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (delta, counter, noOfLociAfterFilter, fraction * 100))
	def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
							outputFormatType=1, alleleLength=1):
		"""
		Write the reference sequence surrounding each locus of a VCF file to a fasta/fastq file.
		
		inputFname: VCF file with the loci.
		refFastaFname: reference fasta file from which the sequences are taken.
		outputFname: file to write to.
		flankingLength: number of bases taken on either side of the locus.
		outputFormatType: 1: fasta. anything else: fastq, with 'H' as the quality of every base.
		alleleLength: if non-zero, loci whose ref or alt allele is not of this length are skipped.
		
		The title of each sequence is chr_pos_stopPos_refBase_altBase_positionInFlankN, where N is
		the 1-based position of the locus within the written sequence. N is flankingLength+1
		unless the locus is within flankingLength bases of the chromosome start, in which case
		the flank starts at position 1 and N is simply the locus position.
		Loci for which no sequence is returned are skipped and not counted as written.
		
		2013.09.03 added argument alleleLength
		2012.10.10
			added argument outputFormatType. 1: fasta, 2: fastq
		2012.10.8
		"""
		sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
						(inputFname, refFastaFname, alleleLength, outputFormatType))
		vcfFile = VCFFile(inputFname=inputFname)
		refFastaFile = FastaFile(inputFname=refFastaFname)
		outf = open(outputFname, 'w')
		
		counter = 0
		real_counter = 0
		try:
			for vcfRecord in vcfFile:
				counter += 1
				refBase = vcfRecord.refBase
				altBase = vcfRecord.altBase
				if alleleLength and (len(refBase)!=alleleLength or len(altBase)!=alleleLength):
					continue
				
				stopPos = vcfRecord.pos + len(refBase) -1
				flankSeqStart = max(1, vcfRecord.pos-flankingLength)
				flankSeqStop = stopPos + flankingLength
				flankingSequence = refFastaFile.getSequence(vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
				if not flankingSequence:
					continue
				
				SNP_ID = '%s_%s_%s_%s_%s'%(vcfRecord.chr, vcfRecord.pos, stopPos, refBase, altBase)
				#positionInFlank is 1-based. Derived from the actual flank start because the
				#left flank is shorter than flankingLength when flankSeqStart was clamped to 1.
				positionInFlank = vcfRecord.pos - flankSeqStart + 1
				fastaTitle = '%s_positionInFlank%s'%(SNP_ID, positionInFlank)
				if outputFormatType==1:
					outf.write(">%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
				else:
					outf.write("@%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
					outf.write("+\n")
					outf.write("%s\n"%('H'*len(flankingSequence)))
				#counted only after the record has really been written
				real_counter += 1
		finally:
			#close explicitly (flushes the output) rather than relying on garbage collection
			outf.close()
			vcfFile.close()
			refFastaFile.close()
		sys.stderr.write("%s loci (%s total) written out.\n"%(real_counter, counter))
Exemple #5
0
    def run(self):
        """
        Keep only the VCF records whose per-locus statistic lies within
        [self.minValue, self.maxValue].

        The statistics come from self.statFname, loaded by the function registered
        under self.runType in self.getLocusID2StatFunctionDict, and are looked up
        with the key (chromosome, position, position).
        Records without a statistic are dropped. A bound that is None is not applied.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        readLocusID2Stat = self.getLocusID2StatFunctionDict[self.runType]
        locusID2Stat = readLocusID2Stat(self.statFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetainedRecords = 0

        # assuming input VCF is sorted
        for vcfRecord in reader:
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            stat = locusID2Stat.get(locusKey)
            if stat is None:
                continue

            belowMinimum = self.minValue is not None and stat < self.minValue
            aboveMaximum = self.maxValue is not None and stat > self.maxValue
            if not (belowMinimum or aboveMaximum):
                noOfRetainedRecords += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        # -1 signals that the input had no records at all
        if noOfRecords > 0:
            fraction = noOfRetainedRecords / float(noOfRecords)
        else:
            fraction = -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetainedRecords, noOfRecords, fraction))
Exemple #6
0
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
             defaultNullFrequency=-0, **keywords):
        """
        Write the alternative-allele frequencies of several VCF files side by side.

        inputFnameLs: VCF files; the frequencies of each are obtained through
            VCFFile.getLocus2AlternativeAlleleFrequency().
        inputHeaderLs: list of column names, one per input file, in the same order.
        outputFname: tab-delimited output file.
        defaultNullFrequency: value written for a locus that is absent from an input file.
        keywords: accepted and ignored.

        The output has one row per locus found in any input, sorted by locus ID:
        the locus ID with its components joined by '_', one frequency per input
        file, and a final 'count' column that is always 1.
        The output file is closed explicitly, even if writing fails.

        2012.10.5
        """
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        # one locus -> frequency dictionary per input file, in input order
        locus2frequencyList = []
        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            locus_id_set.update(locus2frequency.keys())
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        # output them in juxtaposition
        outf = open(outputFname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(['locusID'] + inputHeaderLs + ['count'])

            for locus_id in sorted(locus_id_set):
                data_row = ['_'.join([str(part) for part in locus_id])]
                for locus2frequency in locus2frequencyList:
                    data_row.append(
                        locus2frequency.get(locus_id, defaultNullFrequency))
                data_row.append(1)
                writer.writerow(data_row)
        finally:
            # the handle used to be anonymous and was never closed
            outf.close()
        sys.stderr.write("\n")
Exemple #7
0
    def run(self):
        """
        Keep only the VCF records whose liftover map p-value is greater than
        self.minLiftOverMapPvalue.

        The p-values come from self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname)
        and are looked up with the key (chromosome, position, position).
        Records without a p-value are dropped.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetainedRecords = 0

        # assuming input VCF is sorted
        for vcfRecord in reader:
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(locusKey)
            if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
                noOfRetainedRecords += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        # -1 signals that the input had no records at all
        if noOfRecords > 0:
            fraction = noOfRetainedRecords / float(noOfRecords)
        else:
            fraction = -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetainedRecords, noOfRecords, fraction))
Exemple #8
0
    def run(self):
        """
        Copy to the output VCF only the records whose (chromosome, position)
        occurs exactly once in the input VCF.

        The input is read twice: first through readInSNPID2GenotypeVectorLs()
        with returnType=2 to count the records at each coordinate, then again
        to write out the records whose count is 1.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        countData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                      returnType=2)
        snp_pos2count = countData.snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        # assuming input VCF is sorted
        for vcfRecord in reader:
            noOfRecords += 1
            coordinate = (vcfRecord.chromosome, vcfRecord.position)
            if snp_pos2count.get(coordinate) == 1:
                writer.writeVCFRecord(vcfRecord)
                noOfUniqueRecords += 1

        reader.close()
        writer.close()

        if noOfRecords > 0:
            fraction = noOfUniqueRecords / float(noOfRecords)
        else:
            fraction = 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))
Exemple #9
0
class LiftOverVCFBasedOnCoordinateMap(parentClass):
    # Rewrites the chromosome, position and alleles of VCF records according to
    # a map from old to new coordinates.
    # The class doc is taken from the module doc, so no class docstring is added here
    # (a docstring would change what the assignment below picks up).
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
         ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],\

         })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to parentClass.
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the coordinate map into a dictionary.

        Returns a dictionary whose key is (queryChromosome, queryStart), with
        queryStart converted to int, and whose value is a list with one
        PassingData object per map row of that key. Each object carries
        strand, oldRefBase, oldAltBase, newChromosome, newStart, newStop and
        newRefBase (newStart and newStop as int).

        2013.07.11
            querySNPID      queryStrand     queryChromosome queryStart      queryStop       queryRefBase    queryAltBase    queryAlignmentSpan
            queryAlignmentStart     queryAlignmentStop      newChr  newRefStart     newRefStop      newRefBase      targetAlignmentSpan
            targetAlignmentStart    targetAlignmentStop
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(
                strand=row[strandIndex],
                oldRefBase=row[oldRefBaseIndex],
                oldAltBase=row[oldAltBaseIndex],
                newChromosome=row[newChromosomeIndex],
                newStart=int(row[newStartIndex]),
                newStop=int(row[newStopIndex]),
                newRefBase=row[newRefBaseIndex])
            oldCoordinate2newCoordinateDataLs.setdefault(
                key, []).append(newCoordinateData)
            counter += 1
        # close explicitly instead of relying on "del reader"
        reader.close()
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Write the records of self.inputFname to self.outputFname with their new coordinates.

        Records without an entry in the coordinate map are dropped. Records
        with more than one new coordinate are dropped as well and counted
        separately. For the rest, chromosome and position are replaced, and the
        ref/alt alleles are set to the old alleles from the map, reverse
        complemented if the strand is '-'.
        An empty input VCF reports a fraction of -1 instead of raising ZeroDivisionError.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous target, discard
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # str() so that both strands hand plain strings to the
                # setters, not Seq objects
                newRefBase = str(
                    Seq(newCoordinateData.oldRefBase).reverse_complement())
                newAltBase = str(
                    Seq(newCoordinateData.oldAltBase).reverse_complement())
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
Exemple #10
0
    def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
         familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
         minProbForValidCall=0.9, markersFile=None):
        """
        Walk through a VCF file and a Beagle likelihood file in parallel (one
        likelihood locus is consumed per VCF record) and write one row per
        locus to each Beagle file of familySize2BeagleFileHandler.

        inputFname: VCF file.
        beagleLikelihoodFile: opened Beagle likelihood file; advanced with .next().
        familySize2BeagleFileHandler: family size -> writer with a writerow() method.
        pedigreeFamilyData: its familySize2SampleIDList attribute maps a family size
            to the sample IDs that are written to the file of that family size.
        minProbForValidCall: minimum of the largest likelihood for the Beagle
            call to be used; below it the Beagle call is missing.
        markersFile: optional writer; gets markerID, the part of markerID after
            the first ':', alleleA and alleleB for each locus.

        Family size 1: the row is markerID, alleleA, alleleB followed by the
            likelihoods of every sample, unchanged.
        Other family sizes: the row is 'M', markerID followed by two alleles per sample.
            The alleles are the first two elements of the VCF 'GT' value if the VCF call
            exists and self.checkConcordanceBetweenBeagleAndVCFCall() accepts it against
            the Beagle call, otherwise '?' '?'.

        2013.05.03

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
            (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        noOfLoci = 0
        noOfTrioCalls = 0
        noOfDuoCalls = 0
        noOfSingletonCalls = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        vcfFile = VCFFile(inputFname=inputFname)
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

        for vcfRecord in vcfFile:
            oneLocus = beagleLikelihoodFile.next()
            noOfLoci += 1
            familySize2CallList = {}
            genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
            for familySize, sampleIDList in familySize2SampleIDList.iteritems(
            ):
                callList = familySize2CallList.setdefault(familySize, [])
                for sampleID in sampleIDList:
                    totalNoOfCalls += 1
                    vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                        sampleID)
                    tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                        oneLocus=oneLocus, sampleID=sampleID)
                    if familySize == 1:
                        # singletons keep their likelihoods as they are
                        noOfSingletonCalls += 1
                        callList.extend(tripleLikelihood)
                        continue

                    if familySize == 2:
                        noOfDuoCalls += 1
                    elif familySize == 3:
                        noOfTrioCalls += 1

                    # most likely genotype according to Beagle
                    likelihoodLs = [float(value) for value in tripleLikelihood]
                    bestIndex = numpy.argmax(likelihoodLs)
                    if likelihoodLs[bestIndex] >= minProbForValidCall:
                        if bestIndex == 0:
                            firstAllele = oneLocus.alleleA
                            secondAllele = oneLocus.alleleA
                        elif bestIndex == 1:
                            firstAllele = oneLocus.alleleA
                            secondAllele = oneLocus.alleleB
                        else:
                            firstAllele = oneLocus.alleleB
                            secondAllele = oneLocus.alleleB
                        diploidCallFromBeagle = [firstAllele, secondAllele]
                    else:
                        noOfCallsMarkedMissing += 1
                        diploidCallFromBeagle = ['?', '?']

                    # the VCF call is used only if it exists (it can be empty)
                    # and agrees with Beagle
                    useVCFCall = vcfGenotypeCallData and self.checkConcordanceBetweenBeagleAndVCFCall(
                        vcfGenotypeCallData['GT'], diploidCallFromBeagle)
                    if useVCFCall:
                        vcfGenotype = vcfGenotypeCallData['GT']
                        callList.extend([vcfGenotype[0], vcfGenotype[1]])
                    else:
                        callList.extend(['?', '?'])

            for familySize, callList in familySize2CallList.iteritems():
                if familySize == 1:
                    rowHeaderList = [
                        oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                    ]
                else:
                    rowHeaderList = ['M', oneLocus.markerID]
                familySize2BeagleFileHandler[familySize].writerow(
                    rowHeaderList + callList)

            if markersFile is not None:
                markerRow = [
                    oneLocus.markerID,
                    oneLocus.markerID.split(':')[1],
                    oneLocus.alleleA,
                    oneLocus.alleleB,
                ]
                markersFile.writerow(markerRow)
        vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (noOfLoci, totalNoOfCalls, noOfSingletonCalls, noOfDuoCalls, noOfTrioCalls, noOfCallsMarkedMissing))