Esempio n. 1
0
    def run(self):
        """
        Compare two VCF inputs.

        Opens self.inputFname and self.jnputFname (both with self.minDepth),
        computes their overlapping sites and, when
        self.perSampleConcordanceOutputFname is set, also computes the
        per-sample mismatch fraction over the overlapping samples.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # NOTE(review): `jnputFname` (with a "j") is presumably the option
        # holding the second input file; confirm it is not a typo.
        firstVCF = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth)
        secondVCF = VCFFile(inputFname=self.jnputFname, minDepth=self.minDepth)

        overlapData = self.calculateOverlappingSites(
            vcfFile1=firstVCF,
            vcfFile2=secondVCF,
            outputFname=self.outputFname,
            overlappingSitesOutputFname=self.overlappingSitesOutputFname,
            chromosome=self.chromosome,
            chrLength=self.chrLength)

        # the per-sample report is optional
        if not self.perSampleConcordanceOutputFname:
            return
        self.calculatePerSampleMismatchFraction(
            vcfFile1=firstVCF,
            vcfFile2=secondVCF,
            outputFname=self.perSampleConcordanceOutputFname,
            overlapping_sample_id_set=overlapData.overlapping_sample_id_set)
Esempio n. 2
0
    def run(self):
        """
        Copy every record of self.inputFname into self.outputFname, extending
        the output meta-information with the description lines stored in
        self.knownInfoTag2DescriptionLine.

        The output directory is created if it does not exist. The reader and
        writer are kept on self (self.reader / self.writer) and closed at the
        end.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        # Copy the list: appending to a shared list would also modify the
        # reader's meta-information as a side effect.
        self.writer.metaInfoLs = list(self.reader.metaInfoLs)
        self.writer.metaInfoLs.extend(
            self.knownInfoTag2DescriptionLine.values())
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        for vcfRecord in self.reader:
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
    def run(self):
        """
        Copy the input VCF to the output only if its switch density is
        acceptable.

        The switch density is read from self.switchPointFname. When it is no
        greater than self.maxSwitchDensity every record is written; otherwise
        the output contains only the meta-information and header.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetDir = os.path.split(self.outputFname)[0]
        if targetDir and not os.path.isdir(targetDir):
            os.makedirs(targetDir)

        switchData = self.readInSwitchDensity(inputFname=self.switchPointFname)
        switchDensity = switchData.switchDensity

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecordsRead = 0
        noOfRecordsWritten = 0
        densityIsAcceptable = switchDensity <= self.maxSwitchDensity
        if densityIsAcceptable:
            # the input is only iterated when it passes the density check,
            # so both counters stay at 0 otherwise
            for record in reader:  #assuming input VCF is sorted
                noOfRecordsRead += 1
                noOfRecordsWritten += 1
                writer.writeVCFRecord(record)

        reader.close()
        writer.close()
        sys.stderr.write("%s (out of %s) records outputted.\n" %
                         (noOfRecordsWritten, noOfRecordsRead))
	def run(self):
		"""
		Set the genotype of self.sampleID to missing ("./.") at every locus whose
		alignment evidence is poor, and write the (possibly modified) records to
		self.outputFname.
		
		A locus is flagged when its read depth falls outside
		[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold]
		and/or when self.returnLocusLowMapQualityIndicator() reports low mapping quality.
		One statistics row per locus is written to self.missingStatFname; its
		'missingReason' column is the sum of the two indicators.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		# NOTE(review): under Python 2 with integer attributes this is floor division -- confirm intended.
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record (modified or not)
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		#the alignment file was previously left open
		alignmentFile.close()
		statWriter.close()
		writer.close()
		#guard against an empty input VCF (division by zero); -1 marks "undefined"
		if counter > 0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
Esempio n. 5
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Write to outputFname only those records of inputFname that are at
        least minNeighborDistance away from both neighbouring records on the
        same chromosome.

        A record is held back until the next record has been seen, because
        whether it is kept depends on the distance to both neighbours.
        Records are compared via their .chr and .pos attributes; loci are
        assumed to be in chromosomal order.

        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        heldRecord = None  # the record awaiting a decision
        heldRecordIsBad = False  # True if heldRecord is too close to the record before it
        noOfRecords = 0
        for currentRecord in vcfFile:
            noOfRecords += 1
            if heldRecord is not None:
                tooClose = (heldRecord.chr == currentRecord.chr
                            and abs(currentRecord.pos - heldRecord.pos) <
                            minNeighborDistance)
                if tooClose:
                    # held record is dropped, and the current one inherits the flag
                    heldRecordIsBad = True
                else:
                    # far enough from the current record (or last record of
                    # its chromosome): keep it unless it was already flagged
                    if not heldRecordIsBad:
                        outVCFFile.writeVCFRecord(heldRecord)
                    heldRecordIsBad = False
            heldRecord = currentRecord
        vcfFile.close()

        # the very last record has no successor to trigger its output
        if not (heldRecord is None or heldRecordIsBad):
            outVCFFile.writeVCFRecord(heldRecord)
        outVCFFile.close()

        noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
        noOfLociRemoved = noOfRecords - noOfLociAfterFilter
        fraction = noOfLociRemoved / float(noOfRecords) if noOfRecords > 0 else -0.0
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (noOfLociRemoved, noOfRecords, noOfLociAfterFilter,
                          fraction * 100))
    def run(self):
        """
        Translate every record of self.inputFname to new coordinates using the
        map read from self.coordinateMapFname, and write the translated
        records to self.outputFname.

        Records with no entry in the map are skipped. Records with more than
        one new coordinate are counted and discarded. When the mapped strand
        is '-', the old reference/alternative bases are reverse-complemented.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous mapping: discard
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                newRefBase = Seq(
                    newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(
                    newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # guard against an empty input VCF (division by zero); -1 marks "undefined"
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
Esempio n. 7
0
	def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000,
			noOfSitesPerUnit=5000, noOfTotalSites=None):
		"""
		Split one VCF into several files named <outputFnamePrefix>_unit<N>.vcf.
		
		The unit a record belongs to is derived from its running index via
		utils.getNoOfUnitsNeededToCoverN() and capped at the number of units computed
		from noOfTotalSites (minus one, minimum 1). The trailing records of every
		unit but the last are remembered and written again at the start of the next unit.
		
		2012.8.25
		"""
		sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(
			inputFname, noOfSitesPerUnit, noOfOverlappingSites))
		
		sourceVCF = VCFFile(inputFname=inputFname)
		
		#make it 1 less than total so the last unit is >=s
		maxUnitNumber = max(1, utils.getNoOfUnitsNeededToCoverN(
			N=noOfTotalSites, s=noOfSitesPerUnit, o=noOfOverlappingSites)-1)
		sys.stderr.write(" will be split into %s units ... "%(maxUnitNumber))
		
		unitNumber2OutVCFFile = {}
		carryOverRecordLs = []	#records of the current unit that the next unit must repeat
		noOfRecords = 0
		for vcfRecord in sourceVCF:
			noOfRecords += 1
			#never go beyond the maximum: maxUnitNumber.
			rawUnitNumber = utils.getNoOfUnitsNeededToCoverN(
				N=noOfRecords, s=noOfSitesPerUnit, o=noOfOverlappingSites)
			unitNumber = min(maxUnitNumber, max(1, rawUnitNumber))
			
			unitVCF = unitNumber2OutVCFFile.get(unitNumber)
			if unitVCF is None:
				#first record of a new unit: open its file
				unitVCF = VCFFile(outputFname='%s_unit%s.vcf'%(outputFnamePrefix, unitNumber))
				unitVCF.metaInfoLs = sourceVCF.metaInfoLs
				unitVCF.header = sourceVCF.header
				unitVCF.writeMetaAndHeader()
				unitVCF.noOfLoci = 0
				#start it with the overlapping records of the previous unit
				for carriedRecord in carryOverRecordLs:
					unitVCF.writeVCFRecord(carriedRecord)
					unitVCF.noOfLoci += 1
				carryOverRecordLs = []
				unitNumber2OutVCFFile[unitNumber] = unitVCF
			
			unitVCF.writeVCFRecord(vcfRecord)
			unitVCF.noOfLoci += 1
			#the last unit has no successor, so nothing needs to be carried over from it
			if unitNumber<maxUnitNumber and unitVCF.noOfLoci>(noOfSitesPerUnit-noOfOverlappingSites):
				carryOverRecordLs.append(vcfRecord)
		
		sourceVCF.close()
		for unitVCF in unitNumber2OutVCFFile.values():
			unitVCF.close()
		
		sys.stderr.write("%s loci split into %s files.\n"%(noOfRecords, len(unitNumber2OutVCFFile)))
    def discoverFromVCFWithoutFilter(self,
                                     inputFname=None,
                                     outputFname=None,
                                     **keywords):
        """
        Parse a whole VCF (with self.minDepth) and output its genotype call
        matrix through self.outputCallMatrix().

        Locus IDs are converted from the VCFFile key to the string
        "<first>_<second>" of its first two elements before being handed on.

        2012.9.11 read minDepth from self.minDepth
        2012.9.5 add minDepth=0 to VCFFile
        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
        2012.5.8
        """
        parsedVCF = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
        parsedVCF.parseFile()

        sampleColumnIndex = parsedVCF.sample_id2index
        # re-key: (chr, pos) -> "chr_pos"
        chrPos2rowIndex = {
            '%s_%s' % (locusKey[0], locusKey[1]): rowIndex
            for locusKey, rowIndex in parsedVCF.locus_id2row_index.items()
        }
        callMatrix = parsedVCF.genotype_call_matrix

        self.outputCallMatrix(callMatrix,
                              refFastaFname=None,
                              outputFname=outputFname,
                              refNameSet=None,
                              read_group2col_index=sampleColumnIndex,
                              locus_id2row_index=chrPos2rowIndex,
                              outputDelimiter=self.outputDelimiter)
 def run(self):
     """
     Keep only the VCF records whose per-locus statistic lies within
     [self.minValue, self.maxValue].

     The statistics are loaded from self.statFname by the function that
     self.getLocusID2StatFunctionDict holds for self.runType, and looked up
     by (chromosome, position, position). Records without a statistic are
     dropped. A bound that is None is not applied.
     """
     if self.debug:
         import pdb
         pdb.set_trace()

     targetDir = os.path.split(self.outputFname)[0]
     if targetDir and not os.path.isdir(targetDir):
         os.makedirs(targetDir)
     loadStat = self.getLocusID2StatFunctionDict[self.runType]
     locusID2Stat = loadStat(self.statFname)

     reader = VCFFile(inputFname=self.inputFname)
     writer = VCFFile(outputFname=self.outputFname, mode='w')
     writer.metaInfoLs = reader.metaInfoLs
     writer.header = reader.header
     writer.writeMetaAndHeader()

     noOfRecords = 0
     noOfRecordsKept = 0
     for vcfRecord in reader:	#assuming input VCF is sorted
         noOfRecords += 1
         locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
         stat = locusID2Stat.get(locusKey)
         if stat is None:
             continue

         belowMinimum = self.minValue is not None and stat < self.minValue
         aboveMaximum = self.maxValue is not None and stat > self.maxValue
         if belowMinimum or aboveMaximum:
             continue

         noOfRecordsKept += 1
         writer.writeVCFRecord(vcfRecord)
     reader.close()
     writer.close()

     # -1 signals "no records at all"
     fraction = noOfRecordsKept/float(noOfRecords) if noOfRecords>0 else -1
     sys.stderr.write("%s out of %s records, or %s, retained.\n"%(
         noOfRecordsKept, noOfRecords, fraction))
Esempio n. 10
0
    def run(self):
        """
        Keep only the VCF records whose liftover map p-value is strictly
        greater than self.minLiftOverMapPvalue.

        The p-values are read from self.liftOverLocusMapPvalueFname and
        looked up by (chromosome, position, position). Records without a
        p-value are dropped.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetDir = os.path.split(self.outputFname)[0]
        if targetDir and not os.path.isdir(targetDir):
            os.makedirs(targetDir)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRecordsKept = 0
        for vcfRecord in reader:  #assuming input VCF is sorted
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(locusKey)
            if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
                noOfRecordsKept += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        # -1 signals "no records at all"
        fraction = noOfRecordsKept / float(noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRecordsKept, noOfRecords, fraction))
 def openOneInputFile(self, inputFname=None):
     """
     Open inputFname with the reader class selected by self.inputFileFormat
     and return it.

     2: YHFile (table self.h5TableName), 3: HDF5MatrixFile, 4: VCFFile,
     anything else: MatrixFile.

     2013.09.05 split out of fileWalker() , added VCFFile
     """
     fileFormat = self.inputFileFormat
     if fileFormat==2:
         return YHFile(inputFname, mode='r', tableName=self.h5TableName)
     if fileFormat==3:
         return HDF5MatrixFile(inputFname, mode='r')
     if fileFormat==4:
         return VCFFile(inputFname=inputFname)
     # fallback for every other format code
     return MatrixFile(inputFname)
	def setup(self, **keywords):
		"""
		Run before anything else is run.
		
		Opens the output VCF (self.writer) and the original VCF (self.reader), copies
		the meta-information and header from the latter to the former, then loads
		every Beagle file of self.inputFnameLs and records, for each of its sample IDs,
		which Beagle file holds it (self.sampleID2BeagleFile).
		
		2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called, so that
			the output file can be opened differently.
		2012.10.15
		"""
		self.writer = VCFFile(outputFname=self.outputFname, mode='w')
		self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
		self.writer.metaInfoLs = self.reader.metaInfoLs
		self.writer.header = self.reader.header
		self.writer.writeMetaAndHeader()
		
		# read all the Beagle files and index them by sample ID
		self.sampleID2BeagleFile = {}
		for beagleFname in self.inputFnameLs:
			beagleFile = BeagleGenotypeFile(path=beagleFname)
			beagleFile.readInAllHaplotypes()
			self.sampleID2BeagleFile.update(
				(sampleID, beagleFile) for sampleID in beagleFile.sampleIDList)
    def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
          outputFormatType=1, alleleLength=1):
        """
        Write the reference sequence flanking each locus of a VCF file.

        For every record whose reference and alternative alleles both have
        length alleleLength (no length filter if alleleLength is false-y),
        the reference sequence from flankingLength bases before the locus to
        flankingLength bases after its last reference base is fetched from
        refFastaFname and written to outputFname. Loci for which no sequence
        is returned are counted but not written.

        inputFname: input VCF file.
        refFastaFname: reference fasta file.
        outputFname: output file, overwritten.
        flankingLength: number of bases taken on each side of the locus.
        outputFormatType: 1 = fasta; anything else = fastq (every base gets
            quality 'H').
        alleleLength: required length of both ref and alt allele.

        The title of each sequence is
        <chr>_<pos>_<stop>_<ref>_<alt>_positionInFlank<p>, with p the 1-based
        position of the locus inside the written sequence.

        2013.09.03 added argument alleleLength
        2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
        2012.10.8
        """
        sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
            (inputFname, refFastaFname, alleleLength, outputFormatType))
        vcfFile = VCFFile(inputFname=inputFname)
        refFastaFile = FastaFile(inputFname=refFastaFname)

        counter = 0
        real_counter = 0
        # "with" guarantees the output is flushed and closed; the previous
        # "del outf" relied on garbage collection to do that.
        with open(outputFname, 'w') as outf:
            for vcfRecord in vcfFile:
                counter += 1
                if alleleLength and (len(vcfRecord.refBase) != alleleLength
                                     or len(vcfRecord.altBase) != alleleLength):
                    continue

                real_counter += 1
                refBase = vcfRecord.refBase
                stopPos = vcfRecord.pos + len(refBase) - 1

                SNP_ID = '%s_%s_%s_%s_%s' % (vcfRecord.chr, vcfRecord.pos,
                                             stopPos, vcfRecord.refBase,
                                             vcfRecord.altBase)
                flankSeqStart = max(1, vcfRecord.pos - flankingLength)
                flankSeqStop = stopPos + flankingLength
                # 1-based position of the locus within the flank. It equals
                # flankingLength+1 unless the start was clipped to 1, in
                # which case the fixed value used previously was wrong.
                positionInFlank = vcfRecord.pos - flankSeqStart + 1
                fastaTitle = '%s_positionInFlank%s' % (SNP_ID, positionInFlank)
                flankingSequence = refFastaFile.getSequence(
                    vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
                if not flankingSequence:
                    continue
                if outputFormatType == 1:
                    outf.write(">%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                else:
                    outf.write("@%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                    outf.write("+\n")
                    outf.write("%s\n" % ('H' * len(flankingSequence)))

        vcfFile.close()
        refFastaFile.close()
        sys.stderr.write("%s loci (%s total) written out.\n" %
                         (real_counter, counter))
    def run(self):
        """
        Write to self.outputFname only the records of self.inputFname whose
        (chromosome, position) occurs exactly once in the input.

        The input is read twice: once to count the occurrences of every
        position and once to copy the unique records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetDir = os.path.split(self.outputFname)[0]
        if targetDir and not os.path.isdir(targetDir):
            os.makedirs(targetDir)

        # first pass: occurrence count per position (returnType=2)
        countData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                      returnType=2)
        snp_pos2count = countData.snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        # second pass: copy the records seen only once
        for vcfRecord in reader:  #assuming input VCF is sorted
            noOfRecords += 1
            position = (vcfRecord.chromosome, vcfRecord.position)
            if snp_pos2count.get(position) == 1:
                writer.writeVCFRecord(vcfRecord)
                noOfUniqueRecords += 1

        reader.close()
        writer.close()
        fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))
    def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
        """
        Parse a whole VCF and output its genotype call matrix through
        self.outputCallMatrix().

        The sample-to-column and locus-to-row indices of the parsed VCFFile
        are passed on unchanged.

        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
            need a conversion in between
        2012.5.8
        """
        parsedVCF = VCFFile(inputFname=inputFname)
        parsedVCF.parseFile()

        # NOTE(review): the 2012.8.20 note asks for a (chr, pos) -> chr_pos
        # conversion, but none is applied here -- confirm whether one is needed.
        self.outputCallMatrix(parsedVCF.genotype_call_matrix,
                              refFastaFname=None,
                              outputFname=outputFname,
                              refNameSet=None,
                              read_group2col_index=parsedVCF.sample_id2index,
                              locus_id2row_index=parsedVCF.locus_id2row_index,
                              outputDelimiter=self.outputDelimiter)
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None,
     inputHeaderLs=None, outputFname=None, \
     defaultNullFrequency=-0, **keywords):
        """
        Put the alternative-allele frequencies of several VCF files side by
        side in one tab-delimited file.

        inputFnameLs: list of VCF files to read.
        inputHeaderLs: list of column headers, one per input file.
        outputFname: output file, overwritten.
        defaultNullFrequency: value written for a locus that is absent from
            an input file.

        The output has one row per locus found in any input, sorted by locus
        ID: the locus ID (its elements joined by '_'), one frequency column
        per input file, and a final 'count' column which is always 1.

        2012.10.5
        """
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        #get locus2AF from each input file
        locus2frequencyList = []
        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            locus_id_set.update(locus2frequency.keys())
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        #output them in juxtaposition
        # "with" guarantees the file is flushed and closed; the previous
        # "del writer" left that to garbage collection.
        with open(outputFname, 'w') as outf:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(['locusID'] + inputHeaderLs + ['count'])

            for locus_id in sorted(locus_id_set):
                data_row = ['_'.join(map(str, locus_id))]
                data_row.extend(
                    locus2frequency.get(locus_id, defaultNullFrequency)
                    for locus2frequency in locus2frequencyList)
                data_row.append(1)
                writer.writerow(data_row)
        sys.stderr.write("\n")
Esempio n. 17
0
    def getAllInfoTags(self, inputFname=None, **keywords):
        """
        Return the set of all info tags (keys of each record's
        info_tag2value) found in a VCF file.

        2013.07.10 not used right now.
        """
        sys.stderr.write("Extracting info tags from  VCF %s ..." %
                         (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)

        uniqueInfoTags = set()
        for vcfRecord in vcfFile:
            # iterating the mapping yields its keys, i.e. the tags
            uniqueInfoTags.update(vcfRecord.info_tag2value)
        vcfFile.close()

        sys.stderr.write("%s unique info tags.\n" % (len(uniqueInfoTags)))
        return uniqueInfoTags
Esempio n. 18
0
    def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
        """
        Group the records of a VCF file by (chromosome, position).

        returnType
            1: snp_pos2returnData maps each position to the list of genotype
               vectors (data_row[1:]) of the records found there
            anything else (callers use 2): snp_pos2returnData maps each
               position to the number of records found there

        Returns a PassingData object with attribute snp_pos2returnData.

        2013.07.19 bugfix
        2013.07.11
        """
        sys.stderr.write("Finding SNPs that have same positions from %s ..." %
                         (inputFname))

        collectGenotypes = returnType == 1
        snp_pos2returnData = {}
        noOfRecords = 0
        noOfRepeatedPositions = 0

        reader = VCFFile(inputFname=inputFname)
        for vcfRecord in reader:
            noOfRecords += 1
            position = (vcfRecord.chromosome, vcfRecord.position)
            if position in snp_pos2returnData:
                noOfRepeatedPositions += 1
            else:
                # first sighting: start with an empty list or a zero count
                snp_pos2returnData[position] = [] if collectGenotypes else 0

            if collectGenotypes:
                #[0] is reference
                snp_pos2returnData[position].append(vcfRecord.data_row[1:])
            else:
                snp_pos2returnData[position] += 1
        reader.close()

        sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
                        (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions))
        return PassingData(snp_pos2returnData=snp_pos2returnData)
Esempio n. 19
0
    def setup(self, **keywords):
        """
        Run before anything else is run.

        Steps, in order:
          1. call AbstractMatrixFileWalker.setup();
          2. read the pairwise kinship data into self.ibdData;
          3. read the per-read-group alignment coverage;
          4. collect the sample IDs of all VCF files in self.inputFnameLs;
          5. restrict the pedigree graph to those samples, split it into
             connected components (families) and record a family context
             (family size, in/out degree, individual and family coverage)
             for every individual;
          6. assign each individual a probability mass.

        Sets self.ibdData, self.individualID2probabilityMass and
        self.individualID2HaplotypeData (sample ID -> None; haplotypes are
        not loaded here).

        2012.10.15
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        #read in the IBD check result
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
            rowIDHeader=None, colIDHeader=None, \
            rowIDIndex=0, colIDIndex=1, \
            dataHeader=None, dataIndex=2, hasHeader=False)

        #. read in the alignment coverage data
        alignmentCoverageFile = MatrixFile(
            path=self.individualAlignmentCoverageFname)
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
        alignmentCoverageFile.close()

        sys.stderr.write(
            "Reading in all samples from %s VCF input files ... \n" %
            (len(self.inputFnameLs)))
        # collect the sample IDs of all input VCF files
        individualID2HaplotypeData = {}
        for inputFname in self.inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            for individualID in vcfFile.getSampleIDList():
                # placeholder only: haplotype data is not read at this point
                individualID2HaplotypeData[individualID] = None
            # each input was previously left open; with many input files
            # that accumulates open file handles
            vcfFile.close()
        sys.stderr.write("%s individuals total.\n" %
                         (len(individualID2HaplotypeData)))

        #. read in the pedigree
        #. construct individualID2familyContext
        sys.stderr.write("Constructing individualID2pedigreeContext ...")
        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        pGraph = plinkPedigreeFile.pedigreeGraph
        #shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        # NOTE(review): connected_component_subgraphs() is not available in
        # recent networkx releases -- confirm the networkx version in use.
        cc_subgraph_list = nx.connected_component_subgraphs(
            pGraph.to_undirected())
        individualID2familyContext = {}
        outDegreeContainer = NumberContainer(minValue=0)
        familySizeContainer = NumberContainer(minValue=0)
        individualCoverageContainer = NumberContainer(minValue=0)
        familyCoverageContainer = NumberContainer(minValue=0)
        for cc_subgraph in cc_subgraph_list:
            familySize = len(cc_subgraph)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            for n in cc_subgraph:  #assuming each family is a two-generation trio/nuclear family
                individualCoverage = self.getIndividualCoverage(
                    individualID=n,
                    alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs
                )
                individualCoverage = float(individualCoverage)
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)
                # familyCoverage is filled in below, once the whole family
                # has been summed up
                familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                      individualCoverage=individualCoverage,\
                      familyCoverage=None)
                if n not in individualID2familyContext:
                    individualID2familyContext[n] = familyContext
                else:
                    sys.stderr.write(
                        "Node %s already in individualID2familyContext.\n" %
                        (n))
            familyCoverageContainer.addOneValue(familyCoverage)
            #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
            for n in cc_subgraph:
                individualID2familyContext[n].familyCoverage = familyCoverage
        plinkPedigreeFile.close()
        sys.stderr.write("%s individuals.\n" %
                         (len(individualID2familyContext)))

        # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
        sys.stderr.write(
            "Weighing each individual , assigning probability mass  ...")
        individualID2probabilityMass = {}
        for individualID, familyContext in individualID2familyContext.items():
            # NOTE(review): this normalizes familySize with the out-degree
            # container; familyContext.out_degree may have been intended.
            # Left unchanged -- confirm before altering the weights.
            outDegreeQuotient = outDegreeContainer.normalizeValue(
                familyContext.familySize)
            individualCoverageQuotient = individualCoverageContainer.normalizeValue(
                familyContext.individualCoverage)
            importanceScore = outDegreeQuotient + individualCoverageQuotient
            individualID2probabilityMass[individualID] = importanceScore
        sys.stderr.write(" %s IDs with probability mass assigned.\n" %
                         (len(individualID2probabilityMass)))

        self.individualID2probabilityMass = individualID2probabilityMass
        self.individualID2HaplotypeData = individualID2HaplotypeData
    def splitVCFIntoBeagleInputs(self, inputFname=None,
        beagleLikelihoodFile=None, \
        familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
        minProbForValidCall=0.9, markersFile=None):
        """
        2013.05.03

        Read a VCF file and a Beagle likelihood file in lockstep (one Beagle
        locus is consumed for every VCF record) and write one row per locus
        to the Beagle output handler of every family size.

        Arguments:
            inputFname: path passed to VCFFile() for reading.
            beagleLikelihoodFile: iterator over Beagle loci. Must also expose
                .inputFname (used in the progress message) and
                getLikelihoodListOfOneGenotypeOneSample(oneLocus=, sampleID=).
            familySize2BeagleFileHandler: maps family size to an object with
                a writerow() method.
            pedigreeFamilyData: object whose .familySize2SampleIDList maps
                family size to the list of sample IDs of that family size.
            minProbForValidCall: the largest likelihood of a sample must
                reach this value for a Beagle genotype to be derived;
                otherwise the Beagle call is ['?', '?'].
            markersFile: optional object with writerow(); receives
                [markerID, 2nd ':'-separated field of markerID, alleleA,
                alleleB] for every locus.

        Output rows:
            family size 1: [markerID, alleleA, alleleB] followed by the
                likelihood values of each sample, exactly as returned by
                the likelihood file.
            any other family size: ['M', markerID] followed by two alleles
                per sample. The VCF genotype is written when the sample has
                a VCF call and checkConcordanceBetweenBeagleAndVCFCall()
                accepts it against the Beagle call; otherwise '?', '?'.

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A

        Returns None. Summary counters (per-call, not per-family) are
        reported on stderr.
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
                        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        counter = 0
        no_of_trios = 0
        no_of_duos = 0
        no_of_singletons = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        # read the pedigree data before opening the VCF, so that a bad
        # argument can not leave an open file handle behind
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        vcfFile = VCFFile(inputFname=inputFname)

        # try/finally: the VCF handle is released even if the loop aborts
        try:
            for vcfRecord in vcfFile:
                # the two files are assumed to list the same loci in the same order
                oneLocus = next(beagleLikelihoodFile)
                counter += 1
                familySize2CallList = {}
                for familySize, sampleIDList in familySize2SampleIDList.items():
                    callList = familySize2CallList.setdefault(familySize, [])
                    for sampleID in sampleIDList:
                        totalNoOfCalls += 1
                        vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                            sampleID)
                        tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                            oneLocus=oneLocus, sampleID=sampleID)
                        if familySize == 1:
                            # singletons stay in likelihood format, values untouched
                            no_of_singletons += 1
                            callList.extend(tripleLikelihood)
                            continue

                        if familySize == 2:
                            no_of_duos += 1
                        elif familySize == 3:
                            no_of_trios += 1
                        tripleLikelihood = list(map(float, tripleLikelihood))
                        maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                        maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                        if maxLikelihood >= minProbForValidCall:
                            # likelihood order: AA, AB, BB
                            if maxLikelihoodIndex == 0:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleA
                                ]
                            elif maxLikelihoodIndex == 1:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleB
                                ]
                            else:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleB, oneLocus.alleleB
                                ]
                        else:
                            noOfCallsMarkedMissing += 1
                            diploidCallFromBeagle = ['?', '?']
                        # vcfGenotypeCallData may be None (original author's
                        # note: happens when DP is zero); such calls are
                        # written as missing.
                        if vcfGenotypeCallData and \
                            self.checkConcordanceBetweenBeagleAndVCFCall(vcfGenotypeCallData['GT'], diploidCallFromBeagle):
                            vcfGenotype = vcfGenotypeCallData['GT']
                            diploidCall = [vcfGenotype[0], vcfGenotype[1]]
                        else:
                            diploidCall = ['?', '?']
                        callList.extend(diploidCall)

                for familySize, callList in familySize2CallList.items():
                    if familySize == 1:
                        rowHeaderList = [
                            oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                        ]
                    else:
                        rowHeaderList = ['M', oneLocus.markerID]
                    beagleFileHandler = familySize2BeagleFileHandler[familySize]
                    beagleFileHandler.writerow(rowHeaderList + callList)
                if markersFile is not None:
                    # markerID is expected to look like "chromosome:position"
                    markersFile.writerow([
                        oneLocus.markerID,
                        oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                        oneLocus.alleleB
                    ])
        finally:
            vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
Esempio n. 21
0
    def extractSamples(self, db_main=None, inputFname=None, outputFname=None, \
        tax_id_set=None, site_id_set=None, country_id_set=None, \
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
        **keywords):
        """
        2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
        2013.04.30 added argument min_coverage, max_coverage
        2012.10.10
            added argument outputFormat.
        2012.10.5

        Select the samples of a VCF file whose alignment passes
        db_main.filterAlignments() and write them to outputFname.

        Arguments:
            db_main: database object providing parseAlignmentReadGroup()
                and filterAlignments().
            inputFname: path of the input VCF file.
            outputFname: path of the output file.
            tax_id_set, site_id_set, country_id_set, min_coverage,
            max_coverage, is_contaminated: passed through to
                db_main.filterAlignments(); a sample is kept when that call
                returns a non-empty list.
            outputFormat:
                1: a VCF file holding only the selected sample columns.
                2: a text file of sample IDs, one per line, preceded by a
                   "sampleID" header line.
                3: same as 2, without the header line.
                Any other value writes no output.
            keywords: accepted and ignored.

        Calls sys.exit(3) when a sample has no individualAlignment.
        Returns None. Sample and SNP counts go to stderr; SNPs are counted
        only for outputFormat 1.
        """
        sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
                            (inputFname,\
                            getattr(site_id_set, '__len__', returnZeroFunc)(),\
                            getattr(country_id_set, '__len__', returnZeroFunc)(),\
                            getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
                            outputFormat, is_contaminated ))
        no_of_samples = 0
        no_of_snps = 0
        vcfFile = VCFFile(inputFname=inputFname)
        # try/finally: the input handle is released on errors and on sys.exit()
        try:
            oldHeader = vcfFile.header
            sampleStartingColumn = vcfFile.sampleStartingColumn
            # anything before the samples is the same in the new header
            newHeader = oldHeader[:sampleStartingColumn]
            # this structure stores the selected samples and their column index
            col_index2sampleID = {}
            for col_index, individual_name in vcfFile.get_col_index_individual_name_ls(
            ):
                individualAlignment = db_main.parseAlignmentReadGroup(
                    individual_name).individualAlignment
                if individualAlignment is None:
                    sys.stderr.write(
                        "Warning: no individualAlignment for sample %s.\n" %
                        (individual_name))
                    sys.exit(3)
                filteredAlignmentList = db_main.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
                        max_coverage=max_coverage, individual_site_id=None, \
                        sequence_filtered=None, individual_site_id_set=site_id_set, \
                        mask_genotype_method_id=None, parent_individual_alignment_id=None,\
                        country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
                        is_contaminated=is_contaminated, excludeTissueIDSet=None,\
                        local_realigned=None, reduce_reads=None, report=False)
                if filteredAlignmentList:  #non-empty, passed the filter
                    newHeader.append(individual_name)
                    no_of_samples += 1
                    col_index2sampleID[col_index] = individual_name

            if outputFormat == 1:
                # the selected columns are the same for every record:
                # compute them once, in ascending column order
                selectedColumnIndexList = [
                    i for i in range(sampleStartingColumn, len(oldHeader))
                    if i in col_index2sampleID
                ]
                outVCFFile = VCFFile(outputFname=outputFname)
                try:
                    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                    outVCFFile.header = newHeader
                    outVCFFile.writeMetaAndHeader()

                    for vcfRecord in vcfFile:
                        row = vcfRecord.row
                        data_row = row[:sampleStartingColumn]
                        data_row.extend(
                            row[i] for i in selectedColumnIndexList)
                        outVCFFile.writer.writerow(data_row)
                        no_of_snps += 1
                finally:
                    outVCFFile.close()
            elif outputFormat in [2, 3]:
                with open(outputFname, 'w') as outf:
                    if outputFormat == 2:
                        outf.write("sampleID\n")
                    for sampleID in col_index2sampleID.values():
                        outf.write("%s\n" % (sampleID))
        finally:
            vcfFile.close()
        sys.stderr.write("%s samples X %s SNPs.\n" %
                         (no_of_samples, no_of_snps))