Ejemplo n.º 1
0
	def run(self):
		"""
		Mark genotypes of self.sampleID as missing at low-quality loci.

		For every record in self.inputFname, the reads overlapping the locus
		are fetched from self.alignmentFilename and summarised by
		self.returnLocusLowMapQualityIndicator(). The locus is flagged when
		that mapping-quality indicator is non-zero and/or the number of reads
		lies outside
		[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold].
		At flagged loci the genotype of self.sampleID is set to "./.".
		Every record (modified or not) is written to self.outputFname and one
		row of per-locus statistics is written to self.missingStatFname.
		A summary line is written to stderr at the end.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		#create the output folder if it does not exist yet
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)

		reader = VCFFile(inputFname=self.inputFname)

		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

		#output VCF carries the same meta-info and header as the input
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()

		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)

		counter = 0	#number of VCF records processed
		real_counter = 0	#number of records whose genotype was set to missing
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,
					minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1

			#sum of the two indicators; anything >0 means the locus is bad
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		statWriter.close()
		writer.close()
		#the alignment file handle used to be left open; release it too
		alignmentFile.close()

		#an empty input VCF leaves counter at 0; avoid ZeroDivisionError in the summary
		if counter > 0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter,
												fraction))
Ejemplo n.º 2
0
    def run(self):
        """
        Copy to self.outputFname the VCF records whose locus statistic is in range.

        The locus -> statistic map is built from self.statFname by the loader
        that self.getLocusID2StatFunctionDict holds for self.runType. A record
        is dropped when it has no entry in that map, when its statistic is
        below self.minValue, or when it is above self.maxValue (a bound that
        is None is not enforced). A summary line is written to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)
        loadStatMap = self.getLocusID2StatFunctionDict[self.runType]
        locusKey2Stat = loadStatMap(self.statFname)

        vcfIn = VCFFile(inputFname=self.inputFname)
        vcfOut = VCFFile(outputFname=self.outputFname, openMode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetained = 0
        # assuming input VCF is sorted
        for noOfRecords, record in enumerate(vcfIn, 1):
            # the map is looked up with (chromosome, position, position)
            locusKey = (record.chromosome, record.position, record.position)
            statValue = locusKey2Stat.get(locusKey)
            if statValue is None:
                continue

            # both bounds are always evaluated, each only if it is set
            belowMin = self.minValue is not None and statValue < self.minValue
            aboveMax = self.maxValue is not None and statValue > self.maxValue
            if belowMin or aboveMax:
                continue

            noOfRetained += 1
            vcfOut.writeVCFRecord(record)
        vcfIn.close()
        vcfOut.close()

        # -1 signals "no records at all" in the summary
        fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetained, noOfRecords, fraction))
Ejemplo n.º 3
0
    def run(self):
        """
        Copy to self.outputFname the VCF records with a high enough map p-value.

        The locus -> p-value map comes from self.getLocusNewID2mapPvalue()
        applied to self.liftOverLocusMapPvalueFname. A record is kept only if
        it has an entry in that map and the entry is strictly greater than
        self.minLiftOverMapPvalue. A summary line is written to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)
        locusKey2Pvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        vcfIn = VCFFile(inputFname=self.inputFname)

        vcfOut = VCFFile(outputFname=self.outputFname, openMode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetained = 0
        # assuming input VCF is sorted
        for noOfRecords, record in enumerate(vcfIn, 1):
            # the map is looked up with (chromosome, position, position)
            locusKey = (record.chromosome, record.position, record.position)
            pvalue = locusKey2Pvalue.get(locusKey)
            if pvalue is not None and pvalue > self.minLiftOverMapPvalue:
                noOfRetained += 1
                vcfOut.writeVCFRecord(record)
        vcfIn.close()
        vcfOut.close()

        # -1 signals "no records at all" in the summary
        fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetained, noOfRecords, fraction))
Ejemplo n.º 4
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Drop every locus that has a same-chromosome neighbour closer than
        minNeighborDistance and write the remaining records to outputFname.

        Records are examined one step behind: a record is only written once
        the following record (or the end of its chromosome / of the file) has
        shown that it is far enough from both of its neighbours. Loci are
        assumed to be in chromosomal order.

        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key,
            not chr_pos; need a conversion in between.
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        sourceVCF = VCFFile(inputFname=inputFname)

        filteredVCF = VCFFile(outputFname=outputFname)
        filteredVCF.metaInfoLs = sourceVCF.metaInfoLs
        filteredVCF.header = sourceVCF.header
        filteredVCF.writeMetaAndHeader()

        pending = None  # the record read in the previous iteration, not yet judged
        # True when "pending" is too close to the record that preceded it
        pendingIsTooClose = False
        noOfInputLoci = 0
        for current in sourceVCF:
            if pending is not None:
                if (pending.chr == current.chr and
                        abs(current.pos - pending.pos) < minNeighborDistance):
                    # pending and current crowd each other: neither survives
                    pendingIsTooClose = True
                else:
                    # current is far enough away (or on another chromosome);
                    # pending survives unless its own predecessor was too close
                    if not pendingIsTooClose:
                        filteredVCF.writeVCFRecord(pending)
                    pendingIsTooClose = False  # reset for current

            pending = current
            noOfInputLoci += 1
        sourceVCF.close()

        # the very last record has no successor to trigger its output
        if not (pending is None or pendingIsTooClose):
            filteredVCF.writeVCFRecord(pending)
        filteredVCF.close()

        noOfLociAfterFilter = len(filteredVCF.locus_id_ls)
        noOfLociRemoved = noOfInputLoci - noOfLociAfterFilter
        fraction = noOfLociRemoved / float(noOfInputLoci) if noOfInputLoci > 0 else -0.0
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (noOfLociRemoved, noOfInputLoci, noOfLociAfterFilter,
                          fraction * 100))
Ejemplo n.º 5
0
    def run(self):
        """
        Copy to self.outputFname the VCF records whose position occurs once.

        self.readInSNPID2GenotypeVectorLs(..., returnType=2) is used to obtain
        a (chromosome, position) -> count map for self.inputFname; only
        records whose count equals 1 are written out. A summary line is
        written to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)

        genotypeData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                         returnType=2)
        position2Count = genotypeData.snp_pos2returnData

        vcfIn = VCFFile(inputFname=self.inputFname)

        vcfOut = VCFFile(outputFname=self.outputFname, openMode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecords = 0
        noOfUnique = 0
        # assuming input VCF is sorted
        for noOfRecords, record in enumerate(vcfIn, 1):
            occurrence = position2Count.get(
                (record.chromosome, record.position))
            isUnique = occurrence == 1
            if not isUnique:
                continue
            vcfOut.writeVCFRecord(record)
            noOfUnique += 1

        vcfIn.close()
        vcfOut.close()

        fraction = noOfUnique / float(noOfRecords) if noOfRecords > 0 else 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUnique, noOfRecords, fraction))
Ejemplo n.º 6
0
class LiftOverVCFBasedOnCoordinateMap(parentClass):
    # re-use the module docstring as the class docstring
    __doc__ = __doc__
    # copy so that the extra option below does not leak into parentClass
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
        ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments through to parentClass.__init__().
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the old -> new coordinate map.

        2013.07.11
        Columns of the input file (per the original note):
            querySNPID, queryStrand, queryChromosome, queryStart, queryStop,
            queryRefBase, queryAltBase, queryAlignmentSpan,
            queryAlignmentStart, queryAlignmentStop, newChr, newRefStart,
            newRefStop, newRefBase, targetAlignmentSpan,
            targetAlignmentStart, targetAlignmentStop
        Only queryChromosome, queryStart, queryStrand, queryRefBase,
        queryAltBase, newChr, newRefStart, newRefStop and newRefBase are used.

        Returns a dictionary keyed by (oldChromosome, oldStart) whose value is
        a list of PassingData objects (strand, oldRefBase, oldAltBase,
        newChromosome, newStart, newStop, newRefBase), one per input row that
        has that old coordinate.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0  # number of rows, i.e. number of new coordinates
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(
                strand=row[strandIndex],
                oldRefBase=row[oldRefBaseIndex],
                oldAltBase=row[oldAltBaseIndex],
                newChromosome=row[newChromosomeIndex],
                newStart=int(row[newStartIndex]),
                newStop=int(row[newStopIndex]),
                newRefBase=row[newRefBaseIndex])
            # one old coordinate may map to several new coordinates
            oldCoordinate2newCoordinateDataLs.setdefault(
                key, []).append(newCoordinateData)
            counter += 1
        del reader
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Rewrite chromosome, position and alleles of each VCF record according
        to the coordinate map in self.coordinateMapFname.

        Records without an entry in the map, and records with more than one
        new coordinate, are discarded. For records mapped on the '-' strand,
        the old ref/alt bases are reverse-complemented before being set as the
        new alleles. Output goes to self.outputFname; a summary line is
        written to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # create the output folder if it does not exist yet
        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0  # records read
        real_counter = 0  # records lifted over and written out
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous liftover: drop the record
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # NOTE(review): this passes Seq objects (not str) to the
                # setters below - confirm that is what they expect.
                newRefBase = Seq(
                    newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(
                    newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # an empty input VCF leaves counter at 0; avoid ZeroDivisionError
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
    # re-use the module docstring as the class docstring
    __doc__ = __doc__

    # Work on a copy: updating the parent's dictionary in place would add
    # these options to AbstractMatrixFileWalker (and every other subclass
    # sharing that dictionary) as well.
    option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
    option_default_dict.update({
        ('replicateIndividualTag', 0, ): ['copy', '', 1, 'the tag that separates the true ID and its replicate count'],
        ('originalVCFFname', 1, ): ['', '', 1,
            'original VCF file on which both Beagle phased output and output VCF will be based. \n'
            '\tThe output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments through to AbstractMatrixFileWalker.__init__() and
        initialise the sample -> Beagle file map.
        """
        AbstractMatrixFileWalker.__init__(self,
                                          inputFnameLs=inputFnameLs,
                                          **keywords)
        #a map from one sample to specific beagle file
        self.sampleID2BeagleFile = None

    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Opens the output VCF (self.outputFname) and the original VCF
        (self.originalVCFFname), copies meta-info and header from the latter
        to the former, then reads the haplotypes of every Beagle file in
        self.inputFnameLs and records which file each sample ID comes from.
        """
        #2013.05.30 comment out AbstractMatrixFileWalker.setup() to open the output file differently
        #AbstractMatrixFileWalker.setup(self, **keywords)
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.reader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for inputFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(inputFname=inputFname)
            beagleFile.readInAllHaplotypes()
            # if a sample appears in several files, the last file wins
            for individualID in beagleFile.sampleIDList:
                sampleID2BeagleFile[individualID] = beagleFile
        self.sampleID2BeagleFile = sampleID2BeagleFile

    def reduce(self, **keywords):
        """
        2012.10.15
            run after all files have been walked through

        Walks through the original VCF and, for every sample of every record,
        replaces the genotype call with the phased genotype
        ("allele1|allele2") obtained from the sample's Beagle file. Each
        record is then written to the output VCF. A summary line is written
        to stderr before AbstractMatrixFileWalker.reduce() is invoked.
        """
        counter = 0  # number of genotypes replaced
        no_of_loci = 0
        for vcfRecord in self.reader:
            # .items() works on both Python 2 and 3 (iteritems() is 2-only)
            for sampleID, sample_index in vcfRecord.sample_id2index.items():
                # NOTE(review): a sample without a Beagle file yields None
                # here and fails with AttributeError on the next call.
                beagleFile = self.sampleID2BeagleFile.get(sampleID)
                # NOTE(review): locusID=None is passed for every record -
                # confirm that getGenotypeOfOneSampleOneLocus() then returns
                # the genotype of the locus matching the current vcfRecord.
                beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(
                    sampleID=sampleID, locusID=None)
                vcfRecord.setGenotypeCallForOneSample(
                    sampleID=sampleID,
                    genotype='%s|%s' % (beagleGenotype[0], beagleGenotype[1]))
                counter += 1
            self.writer.writeVCFRecord(vcfRecord)
            no_of_loci += 1
        # the original VCF is fully consumed; release its handle
        self.reader.close()
        sys.stderr.write("%s genotypes, %s loci.\n" % (counter, no_of_loci))

        #close the self.invariantPData.writer and self.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)