def run(self):
		"""
		Mark genotypes of self.sampleID missing at loci with poor alignment support.

		For every record of the input VCF, reads overlapping the locus are fetched
		from the alignment file and summarized by returnLocusLowMapQualityIndicator().
		A locus is considered low quality when its read count falls outside
		[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold]
		and/or when the low-mapping-quality indicator is non-zero.
		One row per locus is written to self.missingStatFname; every VCF record
		(modified or not) is written to self.outputFname.

		Changes relative to the earlier version:
			- the alignment file is closed as well, and all handles are released
				even if processing fails halfway;
			- the final summary no longer raises ZeroDivisionError on an empty VCF.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)

		reader = VCFFile(inputFname=self.inputFname)

		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()

		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)

		counter = 0	#number of VCF records processed
		real_counter = 0	#number of records whose genotype was set to missing
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

		try:
			for vcfRecord in reader:
				locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
				#start and end in fetch() are 0-based.
				#NOTE(review): if the end coordinate is exclusive, this window spans two bases
				#	(the locus and the next one) -- confirm that is intended.
				alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
				locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
													minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
				locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
				depth = locusLowMapQData.totalNoOfReads
				if depth>=minDepth and depth <=maxDepth:
					locusOutOfDepthIndicator = 0 	#good
				else:
					locusOutOfDepthIndicator = 1

				#sum of the two indicators; anything above zero marks the locus as bad
				locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
				data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
							1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
							locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
				statWriter.writerow(data_row)
				if locusLowQualityIndicator>0:
					real_counter += 1
					#modify the VCF record
					#get sample ID column, then set its genotype missing
					vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
				#2014.1.4 output VCF record
				writer.writeVCFRecord(vcfRecord)
				counter += 1
		finally:
			#release every handle, including the alignment file, even upon failure
			reader.close()
			alignmentFile.close()
			statWriter.close()
			writer.close()

		#an empty input VCF leaves counter at 0; report a zero fraction instead of crashing
		if counter > 0:
			missingFraction = real_counter/float(counter)
		else:
			missingFraction = 0.0
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												missingFraction))
Esempio n. 2
0
    def run(self):
        """
        Report pairwise genotype concordance among entries sharing a SNP position.

        The input is loaded through readInSNPID2GenotypeVectorLs(); its
        snp_pos2returnData attribute maps a key (whose first two items are
        chromosome and position) to a list of genotype vectors. For every key
        with more than one vector, each unordered pair of vectors is compared
        element by element on the 'GT' field, skipping elements where either
        call is 'NA'. One output row is written per pair; concordance is -1
        when no element was comparable.

        Changes relative to the earlier version:
            - the summary no longer raises ZeroDivisionError when the input
              holds no SNPs;
            - the output file is closed even if a comparison fails.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
            self.inputFname).snp_pos2returnData

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = [
            'chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'
        ]
        writer.writeHeader(header)

        counter = 0  # number of distinct SNP positions
        real_counter = 0  # positions with more than one genotype vector
        no_of_pairs = 0  # total number of vector pairs compared
        try:
            for key in sorted(snp_pos2genotypeVectorLs):
                counter += 1
                chromosome, position = key[:2]
                genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
                noOfVectors = len(genotypeVectorLs)
                if noOfVectors <= 1:
                    continue
                real_counter += 1
                # every unordered pair (k < l) of vectors at this position
                for k in range(noOfVectors - 1):
                    genotypeVector0 = genotypeVectorLs[k]
                    for l in range(k + 1, noOfVectors):
                        genotypeVector1 = genotypeVectorLs[l]
                        no_of_pairs += 1
                        noOfMatches = 0
                        noOfTotal = 0
                        for j, entry0 in enumerate(genotypeVector0):
                            call1 = entry0['GT']
                            call2 = genotypeVector1[j]['GT']
                            if call1 == 'NA' or call2 == 'NA':
                                continue
                            noOfTotal += 1
                            if SNP.nt2number[call1] == SNP.nt2number[call2]:
                                noOfMatches += 1
                        if noOfTotal > 0:
                            concordance = float(noOfMatches) / float(noOfTotal)
                        else:
                            # nothing comparable between the two vectors
                            concordance = -1
                        writer.writerow([
                            chromosome, position, noOfMatches, noOfTotal,
                            concordance
                        ])
        finally:
            writer.close()

        # an empty input leaves counter at 0; report a zero fraction instead of crashing
        if counter > 0:
            multiEntryFraction = real_counter / float(counter)
        else:
            multiEntryFraction = 0.0
        sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
                                                multiEntryFraction, no_of_pairs))
 def outputFinalData(self,
                     outputFname,
                     key2dataLs=None,
                     delimiter=None,
                     header=None):
     """
     Write an optional header row, then one row per key of key2dataLs.

     The output file is always opened (and closed), but rows are written
     only when delimiter is truthy. Keys are visited in sorted order and
     each row is the key (expanded via list()) followed by its data list.
     The header is written regardless of whether key2dataLs has content.
     """
     matrixWriter = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')

     if header and delimiter:
         matrixWriter.writerow(header)

     if key2dataLs and delimiter:
         for rowKey in sorted(key2dataLs):
             rowValues = key2dataLs.get(rowKey)
             matrixWriter.writerow(list(rowKey) + rowValues)

     matrixWriter.close()
    def run(self):
        """
        Output, per switch-frequency entry, how much span and how many loci remain.

        Entries from readInStats() are sorted in descending order (switch
        frequency first). Walking down that list, each entry's region span
        and locus count are subtracted from the running totals, and the
        remainder (absolute and as a fraction of the grand total) is written
        alongside the entry's switch frequency.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetDir = os.path.split(self.outputFname)[0]
        if targetDir and not os.path.isdir(targetDir):
            os.makedirs(targetDir)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, mode='w')
        writer.writeHeader([
            "maxSwitchFrequency",
            "genomeCovered",
            'genomeCoveredFraction',
            "noOfLoci",
            'noOfLociFraction',
        ])

        entryLs = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        # highest switch frequency first (sorts the list in place)
        entryLs.sort(reverse=True)

        # running remainders, starting from the grand totals
        remainingSpan = totalSpan
        remainingNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in entryLs:
            remainingSpan = remainingSpan - regionSpan
            remainingNoOfLoci = remainingNoOfLoci - noOfLoci
            spanFraction = remainingSpan / float(totalSpan)
            lociFraction = remainingNoOfLoci / float(totalNoOfLoci)
            writer.writerow([
                switchFrequency, remainingSpan, spanFraction,
                remainingNoOfLoci, lociFraction
            ])
        writer.close()
        sys.stderr.write(".\n")
    def readInStats(self, inputFname=None):
        """
        Load switch-frequency, region-span and locus-count columns from a matrix file.

        Rows in which any of the three cells is empty are skipped. Returns a
        PassingData with:
            data_matrix: list of [switchFrequency (float), regionSpan (int), noOfLoci (int)]
            totalSpan: sum of regionSpan over the kept rows
            totalNoOfLoci: sum of noOfLoci over the kept rows
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        statReader = MatrixFile(inputFname)
        statReader.constructColName2IndexFromHeader()
        frequencyColumn = statReader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        spanColumn = statReader.getColIndexGivenColHeader("regionSpan")
        lociColumn = statReader.getColIndexGivenColHeader("#sitesInInput2")

        validEntryLs = []
        spanSum = 0
        lociSum = 0
        noOfRows = 0
        for row in statReader:
            noOfRows += 1
            rawFrequency = row[frequencyColumn]
            rawSpan = row[spanColumn]
            rawLoci = row[lociColumn]
            # skip rows with any empty cell
            if not (rawFrequency and rawSpan and rawLoci):
                continue
            frequency = float(rawFrequency)
            # go through float() first so values like "12.0" are accepted
            span = int(float(rawSpan))
            loci = int(float(rawLoci))
            validEntryLs.append([frequency, span, loci])
            spanSum += span
            lociSum += loci
        statReader.close()

        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
            (len(validEntryLs), noOfRows, spanSum, lociSum))
        return PassingData(data_matrix=validEntryLs,
                           totalSpan=spanSum,
                           totalNoOfLoci=lociSum)
Esempio n. 6
0
    def setup(self, **keywords):
        """
        Run before anything else: load auxiliary data and weigh each individual.

        Steps:
            1. read the kinship adjacency list into self.ibdData;
            2. read per-read-group alignment coverage;
            3. collect the sample IDs present in all input VCF files;
            4. restrict the pedigree graph to those samples and, for each
               connected component (family), record size, degrees and coverage;
            5. assign every individual an importance score, stored in
               self.individualID2probabilityMass.

        Also sets self.individualID2HaplotypeData (sample ID -> None placeholder).

        Changes relative to the earlier version:
            - each input VCF file is closed once its sample IDs have been read;
            - families are enumerated with nx.connected_components(), since
              nx.connected_component_subgraphs() is unavailable in recent
              networkx releases; only the size and the members of each
              component are needed here.
        """
        AbstractMatrixFileWalker.setup(self, **keywords)
        #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

        #read in the IBD check result
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
            rowIDHeader=None, colIDHeader=None, \
            rowIDIndex=0, colIDIndex=1, \
            dataHeader=None, dataIndex=2, hasHeader=False)

        #. read in the alignment coverage data
        alignmentCoverageFile = MatrixFile(
            path=self.individualAlignmentCoverageFname)
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
        alignmentCoverageFile.close()

        sys.stderr.write(
            "Reading in all samples from %s VCF input files ... \n" %
            (len(self.inputFnameLs)))
        # collect the sample IDs of every input VCF file
        individualID2HaplotypeData = {}
        for inputFname in self.inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            try:
                for individualID in vcfFile.getSampleIDList():
                    # placeholder only; haplotypes are not loaded here
                    individualID2HaplotypeData[individualID] = None
            finally:
                # do not keep one open handle per input file
                vcfFile.close()
        sys.stderr.write("%s individuals total.\n" %
                         (len(individualID2HaplotypeData)))

        #. read in the pedigree
        #. construct individualID2familyContext: family size, in/out degree, coverage
        sys.stderr.write("Constructing individualID2pedigreeContext ...")
        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        pGraph = plinkPedigreeFile.pedigreeGraph
        #shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        individualID2familyContext = {}
        outDegreeContainer = NumberContainer(minValue=0)
        familySizeContainer = NumberContainer(minValue=0)
        individualCoverageContainer = NumberContainer(minValue=0)
        familyCoverageContainer = NumberContainer(minValue=0)
        # each connected component of the undirected pedigree is one family
        for familyMembers in nx.connected_components(pGraph.to_undirected()):
            familyMemberLs = list(familyMembers)
            familySize = len(familyMemberLs)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            for n in familyMemberLs:  #assuming each family is a two-generation trio/nuclear family
                individualCoverage = self.getIndividualCoverage(
                    individualID=n,
                    alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs
                )
                individualCoverage = float(individualCoverage)
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage
                # degrees are taken from the directed pedigree graph
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)
                familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                      individualCoverage=individualCoverage,\
                      familyCoverage=None)
                if n not in individualID2familyContext:
                    individualID2familyContext[n] = familyContext
                else:
                    sys.stderr.write(
                        "Node %s already in individualID2familyContext.\n" %
                        (n))
            familyCoverageContainer.addOneValue(familyCoverage)
            #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
            for n in familyMemberLs:
                individualID2familyContext[n].familyCoverage = familyCoverage
        plinkPedigreeFile.close()
        sys.stderr.write("%s individuals.\n" %
                         (len(individualID2familyContext)))

        # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
        sys.stderr.write(
            "Weighing each individual , assigning probability mass  ...")
        individualID2probabilityMass = {}
        for individualID, familyContext in individualID2familyContext.items():
            # NOTE(review): the out-degree container is fed familySize here, not
            # familyContext.out_degree, although the comment above speaks of the
            # number of offspring -- confirm which one is intended.
            outDegreeQuotient = outDegreeContainer.normalizeValue(
                familyContext.familySize)
            individualCoverageQuotient = individualCoverageContainer.normalizeValue(
                familyContext.individualCoverage)
            #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
            importanceScore = outDegreeQuotient + individualCoverageQuotient
            representativeImportanceScore = importanceScore
            individualID2probabilityMass[
                individualID] = representativeImportanceScore
        sys.stderr.write(" %s IDs with probability mass assigned.\n" %
                         (len(individualID2probabilityMass)))

        self.individualID2probabilityMass = individualID2probabilityMass
        self.individualID2HaplotypeData = individualID2HaplotypeData
Esempio n. 7
0
    def run(self):
        """
        Combine Mendel-error statistics with family-structure counts.

        For each input row, the mean and summed Mendel errors are divided by
        the number of two-parent (nuclear) families from the pedigree file;
        both quotients are -1 when there is no such family. Each output row
        also carries the family/parent/kid/individual counts for two-parent,
        single-parent and zero-parent families.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(path=self.inputFname)
        reader.constructColName2IndexFromHeader()

        meanErrorColumn = reader.getColIndexGivenColHeader("meanMendelError")
        lociColumn = reader.getColIndexGivenColHeader("sampled_base_count")
        sumErrorColumn = reader.getColIndexGivenColHeader("sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()
        noOfParents2FamilyData = familyStructureData.noOfParents2FamilyData

        twoParentCounts = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=2)
        singleParentCounts = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=1)
        zeroParentCounts = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=0)

        def countColumns(countData):
            # families, parents, kids, individuals -- the order used in the header
            return [
                countData.noOfFamilies, countData.noOfParents,
                countData.noOfKids, countData.noOfIndividuals
            ]

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        writer.writeHeader([
            "ID",
            "noOfTotalLoci",
            "noOfTwoParentFamilies",
            "noOfParentsInTwoParentFamilies",
            "noOfKidsInTwoParentFamilies",
            "noOfIndividualsInTwoParentFamilies",
            "noOfSingleParentFamilies",
            "noOfParentsInSingleParentFamilies",
            "noOfKidsInSingleParentFamilies",
            "noOfIndividualsInSingleParentFamilies",
            "noOfZeroParentFamilies",
            "noOfParentsInZeroParentFamilies",
            "noOfKidsInZeroParentFamilies",
            "noOfIndividualsInZeroParentFamilies",
            "noOfTotalMendelErrors",
            "noOfMendelErrorsPerLocusPerNuclearFamily",
            "noOfMendelErrorsPerNuclearFamily",
        ])

        for row in reader:
            meanMendelError = float(row[meanErrorColumn])
            noOfLoci = int(row[lociColumn])
            sumOfMendelError = int(row[sumErrorColumn])

            # nuclear families are the two-parent families
            noOfNuclearFamilies = twoParentCounts.noOfFamilies
            if noOfNuclearFamilies > 0:
                familyDivisor = float(noOfNuclearFamilies)
                errorsPerLocusPerFamily = meanMendelError / familyDivisor
                errorsPerFamily = sumOfMendelError / familyDivisor
            else:
                # -1 flags "not computable"
                errorsPerLocusPerFamily = -1
                errorsPerFamily = -1

            outputRow = [row[0], noOfLoci]
            outputRow.extend(countColumns(twoParentCounts))
            outputRow.extend(countColumns(singleParentCounts))
            outputRow.extend(countColumns(zeroParentCounts))
            outputRow.extend(
                [sumOfMendelError, errorsPerLocusPerFamily, errorsPerFamily])
            writer.writerow(outputRow)

        plinkPedigreeFile.close()
        reader.close()
        writer.close()