def run(self):
        """
        Copy the input VCF to the output if the switch density is low enough.

        The switch density is read via self.readInSwitchDensity() from
        self.switchPointFname. Meta info and header are always written to
        self.outputFname; the records are copied only when the density does
        not exceed self.maxSwitchDensity. A short summary goes to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)

        switchDensityData = self.readInSwitchDensity(
            inputFname=self.switchPointFname)
        switchDensity = switchDensityData.switchDensity

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecordsRead = 0
        noOfRecordsWritten = 0
        passedFilter = switchDensity <= self.maxSwitchDensity
        if passedFilter:
            # assuming input VCF is sorted
            for vcfRecord in reader:
                noOfRecordsRead += 1
                noOfRecordsWritten += 1
                writer.writeVCFRecord(vcfRecord)

        reader.close()
        writer.close()
        summary = "%s (out of %s) records outputted.\n" % (
            noOfRecordsWritten, noOfRecordsRead)
        sys.stderr.write(summary)
	def run(self):
		"""
		Mark genotypes of self.sampleID as missing at low-quality loci and output the VCF.
		
		For every record of the input VCF, reads around the locus are fetched
		from the alignment file and passed to self.returnLocusLowMapQualityIndicator().
		A locus is flagged when the sum of these two indicators is positive:
			- out-of-depth indicator (1 if the number of reads is outside
				[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold], else 0),
			- the low-mapping-quality indicator returned by the helper.
		At flagged loci the genotype of self.sampleID is set to "./.".
		
		Output:
			self.outputFname: every input record (modified where flagged).
			self.missingStatFname: tab-delimited file with one row per locus.
		
		The fraction reported on stderr is -1 when the input VCF has no records
		(previously this case raised ZeroDivisionError).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		#acceptable depth window around the median depth
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,
								minMapQGoodRead=self.minMapQGoodRead,
								minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record: set the genotype of this sample to missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record (flagged or not)
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		#release the alignment file handle, which was previously left open
		alignmentFile.close()
		statWriter.close()
		writer.close()
		#guard against an empty input VCF
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter,
												fraction))
Esempio n. 3
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Drop loci that are closer than minNeighborDistance to a neighbor.

        Records are read from inputFname (assumed to be in chromosomal
        order). A record is written to outputFname only if its distance to
        both the preceding and the following record on the same chromosome
        is >= minNeighborDistance. Records on different chromosomes never
        count as neighbors. A summary is written to stderr.

        2012.8.20 note kept from the original: locus_id2row_index from
            VCFFile is using (chr, pos) as key, not chr_pos.
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        pendingRecord = None  #the record whose fate is decided by the next one
        pendingRecordTooClose = False  #True if pendingRecord is too close to the record before it
        noOfLociBeforeFilter = 0
        for vcfRecord in vcfFile:
            if pendingRecord is not None:
                onSameChromosome = pendingRecord.chr == vcfRecord.chr
                if onSameChromosome and abs(
                        vcfRecord.pos -
                        pendingRecord.pos) < minNeighborDistance:
                    #both the pending and the current record are too close to each other
                    pendingRecordTooClose = True
                else:
                    #far enough from the current record (or last one of its chromosome):
                    # keep the pending record unless it was too close to its predecessor
                    if not pendingRecordTooClose:
                        outVCFFile.writeVCFRecord(pendingRecord)
                    pendingRecordTooClose = False  #reset for the current record

            pendingRecord = vcfRecord
            noOfLociBeforeFilter += 1
        vcfFile.close()

        #the very last record has no successor; only its predecessor matters
        if not (pendingRecord is None or pendingRecordTooClose):
            outVCFFile.writeVCFRecord(pendingRecord)
        outVCFFile.close()

        # NOTE(review): this relies on outVCFFile.locus_id_ls reflecting the
        #  written records -- confirm VCFFile maintains it in output mode.
        noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
        noOfLociRemoved = noOfLociBeforeFilter - noOfLociAfterFilter
        fraction = -0.0
        if noOfLociBeforeFilter > 0:
            fraction = noOfLociRemoved / float(noOfLociBeforeFilter)
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (noOfLociRemoved, noOfLociBeforeFilter,
                          noOfLociAfterFilter, fraction * 100))
Esempio n. 4
0
	def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,\
				noOfTotalSites=None):
		"""
		Split one VCF into several unit files with overlapping sites.
		
		Records of inputFname are distributed, in input order, over files named
		"<outputFnamePrefix>_unit<unitNumber>.vcf". The unit of a record is derived
		from utils.getNoOfUnitsNeededToCoverN() applied to the running record count,
		capped by the number of units computed from noOfTotalSites (minus one, so
		that the last unit absorbs the remainder). Records stored as overlap from
		one unit are written again at the start of the next unit.
		
		2012.8.25
		"""
		startMessage = "Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(
			inputFname, noOfSitesPerUnit, noOfOverlappingSites)
		sys.stderr.write(startMessage)
		
		vcfFile = VCFFile(inputFname=inputFname)
		
		unitNumber2OutVCFFile = {}
		noOfLociRead = 0
		#make it 1 less than total so the last unit is >=s
		noOfUnitsForAllSites = utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, s=noOfSitesPerUnit, \
			o=noOfOverlappingSites)
		maxUnitNumber = max(1, noOfUnitsForAllSites-1)
		sys.stderr.write(" will be split into %s units ... "%(maxUnitNumber))
		carryOverRecordLs = []	#records to be repeated at the start of the next unit
		for vcfRecord in vcfFile:
			noOfLociRead += 1
			#unit number is bounded within [1, maxUnitNumber]
			unitNumberByCount = utils.getNoOfUnitsNeededToCoverN(N=noOfLociRead, s=noOfSitesPerUnit, \
				o=noOfOverlappingSites)
			unitNumber = min(maxUnitNumber, max(1, unitNumberByCount))
			outVCFFile = unitNumber2OutVCFFile.get(unitNumber)
			if outVCFFile is None:
				#first record of a new unit: open its file and write the header
				outVCFFile = VCFFile(outputFname='%s_unit%s.vcf'%(outputFnamePrefix, unitNumber))
				outVCFFile.metaInfoLs = vcfFile.metaInfoLs
				outVCFFile.header = vcfFile.header
				outVCFFile.writeMetaAndHeader()
				outVCFFile.noOfLoci =0
				#start the new unit with the overlapping records from the previous unit
				for overlappingVCFRecord in carryOverRecordLs:
					outVCFFile.writeVCFRecord(overlappingVCFRecord)
					outVCFFile.noOfLoci += 1
				carryOverRecordLs = []	#reset it
				unitNumber2OutVCFFile[unitNumber] = outVCFFile
			outVCFFile.writeVCFRecord(vcfRecord)
			outVCFFile.noOfLoci += 1
			#the tail of every unit but the last is stored as overlap for the next unit
			if unitNumber<maxUnitNumber and outVCFFile.noOfLoci>(noOfSitesPerUnit-noOfOverlappingSites):
				carryOverRecordLs.append(vcfRecord)
		
		vcfFile.close()
		#close all output files
		for outVCFFile in unitNumber2OutVCFFile.values():
			outVCFFile.close()
		
		sys.stderr.write("%s loci split into %s files.\n"%(noOfLociRead, len(unitNumber2OutVCFFile)))
 def run(self):
     """
     Keep only VCF records whose per-locus statistic lies within bounds.

     The statistic dictionary is loaded from self.statFname by the function
     registered under self.runType in self.getLocusID2StatFunctionDict. It is
     looked up with the key (chromosome, position, position). Records without
     a statistic are dropped. Records with a statistic below self.minValue or
     above self.maxValue are dropped as well (a bound of None is ignored).
     """
     if self.debug:
         import pdb
         pdb.set_trace()
     
     # make sure the folder of the output file exists
     outputFolder = os.path.dirname(self.outputFname)
     if outputFolder and not os.path.isdir(outputFolder):
         os.makedirs(outputFolder)
     getLocusID2Stat = self.getLocusID2StatFunctionDict[self.runType]
     locusID2Stat = getLocusID2Stat(self.statFname)
     
     reader = VCFFile(inputFname=self.inputFname)
     writer = VCFFile(outputFname=self.outputFname, mode='w')
     writer.metaInfoLs = reader.metaInfoLs
     writer.header = reader.header
     writer.writeMetaAndHeader()
     
     noOfRecords = 0
     noOfRecordsRetained = 0
     
     # assuming input VCF is sorted
     for vcfRecord in reader:
         noOfRecords += 1
         locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
         stat = locusID2Stat.get(locusKey)
         if stat is None:
             # no statistic available for this locus
             continue
         
         belowMinimum = self.minValue is not None and stat < self.minValue
         aboveMaximum = self.maxValue is not None and stat > self.maxValue
         if belowMinimum or aboveMaximum:
             continue
         
         noOfRecordsRetained += 1
         writer.writeVCFRecord(vcfRecord)
     reader.close()
     writer.close()
     # -1 signals that there was no record at all
     fraction = noOfRecordsRetained/float(noOfRecords) if noOfRecords>0 else -1
     sys.stderr.write("%s out of %s records, or %s, retained.\n"%(
         noOfRecordsRetained, noOfRecords, fraction))
Esempio n. 6
0
    def run(self):
        """
        Keep only VCF records whose liftover map p-value is high enough.

        The p-values are loaded via self.getLocusNewID2mapPvalue() from
        self.liftOverLocusMapPvalueFname and looked up with the key
        (chromosome, position, position). A record is written to
        self.outputFname only if it has a p-value and that p-value is
        strictly greater than self.minLiftOverMapPvalue.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRecordsRetained = 0

        # assuming input VCF is sorted
        for vcfRecord in reader:
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(locusKey)
            # loci without a p-value are dropped, as are those at or below the threshold
            if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
                noOfRecordsRetained += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()
        # -1 signals that there was no record at all
        fraction = -1
        if noOfRecords > 0:
            fraction = noOfRecordsRetained / float(noOfRecords)
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRecordsRetained, noOfRecords, fraction))
    def run(self):
        """
        Output only the VCF records whose (chromosome, position) occurs once.

        Per-position data is first collected from self.inputFname via
        self.readInSNPID2GenotypeVectorLs(returnType=2). The input is then
        read a second time and a record is written to self.outputFname only
        when the value stored for its (chromosome, position) equals 1.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # make sure the folder of the output file exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)

        snpData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                    returnType=2)
        snp_pos2count = snpData.snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        # assuming input VCF is sorted
        for vcfRecord in reader:
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position)
            if snp_pos2count.get(locusKey) == 1:
                writer.writeVCFRecord(vcfRecord)
                noOfUniqueRecords += 1

        reader.close()
        writer.close()
        fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))
    def extractSamples(self, db_main=None, inputFname=None, outputFname=None, \
        tax_id_set=None, site_id_set=None, country_id_set=None, \
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
        **keywords):
        """
        Select samples of a VCF through database filters and output them.

        Every sample column of inputFname is resolved to an alignment via
        db_main.parseAlignmentReadGroup(); the sample is kept if
        db_main.filterAlignments() returns a non-empty list for it. If a
        sample has no alignment, a warning is written and the program exits
        with status 3.

        Arguments:
            db_main: object providing parseAlignmentReadGroup() and
                filterAlignments().
            inputFname: input VCF.
            outputFname: output file.
            tax_id_set, site_id_set, country_id_set, min_coverage,
                max_coverage, is_contaminated: passed on to
                db_main.filterAlignments().
            outputFormat:
                1: VCF restricted to the selected sample columns.
                2: list of selected sample IDs, with a "sampleID" header line.
                3: list of selected sample IDs, without header.
                Any other value produces no output file.

        2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
        2013.04.30 added argument min_coverage, max_coverage
        2012.10.10 added argument outputFormat.
        2012.10.5
        """
        sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
                            (inputFname,\
                            getattr(site_id_set, '__len__', returnZeroFunc)(),\
                            getattr(country_id_set, '__len__', returnZeroFunc)(),\
                            getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
                            outputFormat, is_contaminated ))
        vcfFile = VCFFile(inputFname=inputFname)

        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        #anything before the samples are same
        newHeader = oldHeader[:vcfFile.sampleStartingColumn]
        no_of_samples = 0
        #this structure stores the selected samples and their column index
        col_index2sampleID = {}
        for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
            individualAlignment = db_main.parseAlignmentReadGroup(
                individual_name).individualAlignment
            if individualAlignment is not None:
                filteredAlignmentList = db_main.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
                        max_coverage=max_coverage, individual_site_id=None, \
                        sequence_filtered=None, individual_site_id_set=site_id_set, \
                        mask_genotype_method_id=None, parent_individual_alignment_id=None,\
                        country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
                        is_contaminated=is_contaminated, excludeTissueIDSet=None,\
                        local_realigned=None, reduce_reads=None, report=False)
                if filteredAlignmentList:  #non-empty, passed the filter
                    newHeader.append(individual_name)
                    no_of_samples += 1
                    col_index2sampleID[col_index] = individual_name
            else:
                sys.stderr.write(
                    "Warning: no individualAlignment for sample %s.\n" %
                    (individual_name))
                sys.exit(3)

        no_of_snps = 0
        if outputFormat == 1:
            outVCFFile = VCFFile(outputFname=outputFname)
            outVCFFile.metaInfoLs = vcfFile.metaInfoLs
            outVCFFile.header = newHeader
            outVCFFile.writeMetaAndHeader()

            #the selected columns are the same for every record: compute them once,
            # in ascending column order
            selectedColIndexLs = [
                i for i in range(vcfFile.sampleStartingColumn, oldHeaderLength)
                if i in col_index2sampleID
            ]
            for vcfRecord in vcfFile:
                data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
                data_row.extend(vcfRecord.row[i] for i in selectedColIndexLs)
                outVCFFile.writer.writerow(data_row)
                no_of_snps += 1
            outVCFFile.close()
        elif outputFormat in [2, 3]:
            #context manager closes the file even if a write fails
            with open(outputFname, 'w') as outf:
                if outputFormat == 2:
                    outf.write("sampleID\n")
                for sampleID in col_index2sampleID.values():
                    outf.write("%s\n" % (sampleID))
        vcfFile.close()
        sys.stderr.write("%s samples X %s SNPs.\n" %
                         (no_of_samples, no_of_snps))