Exemple #1
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        HMM hits are parsed once in this process; the bin ids are then
        distributed over ``self.totalThreads`` worker processes running
        ``self.__createMSA`` while a separate process running
        ``self.__reportBinProgress`` consumes the writer queue.

        Parameters:
          outDir, hmmTableFile -- forwarded to ResultsParser.parseBinHits
          markerFile -- forwarded to the alignment workers
          binIdToModels -- mapping whose keys are the bin ids to process
          binIdToBinMarkerSets -- forwarded to the alignment workers
          bIgnoreThresholds, evalueThreshold, lengthThreshold --
              forwarded to ResultsParser.parseBinHits
          alignOutputDir -- directory created if missing and handed to
              the alignment workers

        Raises:
          Any exception (including KeyboardInterrupt) that interrupts the
          parallel section is re-raised after the child processes that are
          still alive have been terminated.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one None sentinel per worker follows the bin ids
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # Build the process objects before entering the try block so the
        # cleanup handler can never hit an unbound name.
        calcProc = [mp.Process(target=self.__createMSA, args=(resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # all workers are done; queue the final None for the progress process
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; only processes that
            # were actually started (and are still running) can be terminated
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # propagate the failure instead of pretending the alignment succeeded
            raise
Exemple #2
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        HMM hits are parsed once in this process; the bin ids are then
        distributed over ``self.totalThreads`` worker processes running
        ``self.__createMSA`` while a separate process running
        ``self.__reportBinProgress`` consumes the writer queue.

        Parameters:
          outDir, hmmTableFile -- forwarded to ResultsParser.parseBinHits
          markerFile -- forwarded to the alignment workers
          binIdToModels -- mapping whose keys are the bin ids to process
          binIdToBinMarkerSets -- forwarded to the alignment workers
          bIgnoreThresholds, evalueThreshold, lengthThreshold --
              forwarded to ResultsParser.parseBinHits
          alignOutputDir -- directory created if missing and handed to
              the alignment workers

        Raises:
          Any exception (including KeyboardInterrupt) that interrupts the
          parallel section is re-raised after the child processes that are
          still alive have been terminated.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one None sentinel per worker follows the bin ids
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # Build the process objects before entering the try block so the
        # cleanup handler can never hit an unbound name.
        calcProc = [mp.Process(target=self.__createMSA, args=(resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # all workers are done; queue the final None for the progress process
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; only processes that
            # were actually started (and are still running) can be terminated
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # propagate the failure instead of pretending the alignment succeeded
            raise
Exemple #3
0
    def makeAlignmentToPhyloMarkers(self,
                                    outDir,
                                    hmmModelFile,
                                    hmmTableFile,
                                    binIdToModels,
                                    bIgnoreThresholds,
                                    evalueThreshold,
                                    lengthThreshold,
                                    bReportHitStats,
                                    alignOutputDir,
                                    bKeepUnmaskedAlign=False):
        """Align hits to a set of common marker genes.

        Parameters:
          outDir, hmmTableFile -- forwarded to ResultsParser.parseBinHits;
              outDir is also used when extracting the marker sequences
          hmmModelFile -- HMM file handed to __makeAlignmentModels
          binIdToModels -- mapping of bin id to models; only the models of
              the first bin id are used to build the alignment HMMs
          bIgnoreThresholds, evalueThreshold, lengthThreshold --
              forwarded to ResultsParser.parseBinHits
          bReportHitStats, bKeepUnmaskedAlign -- forwarded to
              __alignMarkerGenes
          alignOutputDir -- directory created if missing; passed to
              __alignMarkerGenes

        Returns:
          The ResultsParser instance holding the parsed HMM hits.
        """

        self.logger.info("Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False,
                                   bIgnoreThresholds, evalueThreshold,
                                   lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(
            outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence alignments
        binId = list(binIdToModels.keys())[0]
        hmmModelFiles = {}
        try:
            # hmmModelFiles is handed over empty; its values are later
            # treated as paths of temporary files to delete
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId],
                                       hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats,
                                    hmmModelFiles, alignOutputDir,
                                    bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files even when alignment fails
            for fileName in hmmModelFiles:
                os.remove(hmmModelFiles[fileName])

        return resultsParser
Exemple #4
0
    def makeAlignmentToPhyloMarkers(self,
                                       outDir,
                                       hmmModelFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       bReportHitStats,
                                       alignOutputDir,
                                       bKeepUnmaskedAlign=False
                                       ):
        """Align hits to a set of common marker genes.

        Parameters:
          outDir, hmmTableFile -- forwarded to ResultsParser.parseBinHits;
              outDir is also used when extracting the marker sequences
          hmmModelFile -- HMM file handed to __makeAlignmentModels
          binIdToModels -- mapping of bin id to models; only the models of
              the first bin id are used to build the alignment HMMs
          bIgnoreThresholds, evalueThreshold, lengthThreshold --
              forwarded to ResultsParser.parseBinHits
          bReportHitStats, bKeepUnmaskedAlign -- forwarded to
              __alignMarkerGenes
          alignOutputDir -- directory created if missing; passed to
              __alignMarkerGenes

        Returns:
          The ResultsParser instance holding the parsed HMM hits.
        """

        self.logger.info("  Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence alignments
        # (list(...) rather than keys()[0]: dict views cannot be indexed on Python 3)
        binId = list(binIdToModels)[0]
        hmmModelFiles = {}
        try:
            # hmmModelFiles is handed over empty; its values are later
            # treated as paths of temporary files to delete
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats, hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files even when alignment fails
            for fileName in hmmModelFiles:
                os.remove(hmmModelFiles[fileName])

        return resultsParser
Exemple #5
0
    def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
            binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
            maxMergedCont):
        """Report pairs of bins whose merger improves completeness.

        For every pair of bins the marker hits are pooled and the
        completeness/contamination of the pooled hits is compared with the
        values of the individual bins. Pairs passing all four thresholds
        are written as rows of ``merger.tsv`` in outDir.

        Parameters:
          binFiles -- not used by this method
          outDir -- existing directory holding the HMM results; the output
              table is written here
          hmmTableFile -- forwarded to ResultsParser.parseBinHits
          binIdToModels -- forwarded to ResultsParser
          binIdToBinMarkerSets -- mapping of bin id to an object providing
              mostSpecificMarkerSet()
          minDeltaComp -- minimum gain in completeness over the better bin
          maxDeltaCont -- exclusive upper bound on the gain in contamination
          minMergedComp -- minimum completeness of the merged bin
          maxMergedCont -- exclusive upper bound on merged contamination

        Returns:
          Path of the written ``merger.tsv`` file.

        Exits the program with a non-zero status if the bins do not all
        share the same marker genes.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set
        # (nothing to compare when no marker sets were supplied)
        if binIdToBinMarkerSets:
            firstBinId = next(iter(binIdToBinMarkerSets))
            markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
            for binIdJ in binIdToBinMarkerSets:
                if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                    self.logger.error(
                        '  [Error] All bins must use the same marker set to assess potential mergers.'
                    )
                    # an error must not be reported as a successful exit
                    sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id 1\tBin Id 2')
            fout.write('\tBin 1 completeness\tBin 1 contamination')
            fout.write('\tBin 2 completeness\tBin 2 contamination')
            fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
            fout.write('\tMerged completeness\tMerged contamination\n')

            for i, binIdI in enumerate(binIds):
                hitsI = binMarkerHits[binIdI]

                # elements 6 and 7 of the geneCounts result are used as
                # completeness and contamination
                geneCountsI = hitsI.geneCounts(
                    binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                    hitsI.markerHits, True)
                completenessI, contaminationI = geneCountsI[6:8]

                for binIdJ in binIds[i + 1:]:
                    hitsJ = binMarkerHits[binIdJ]

                    geneCountsJ = hitsJ.geneCounts(
                        binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                        hitsJ.markerHits, True)
                    completenessJ, contaminationJ = geneCountsJ[6:8]

                    # merge together hits from both bins and calculate completeness and contamination
                    # (lists are copied so the per-bin hit lists are never modified)
                    mergedHits = {markerId: list(hits)
                                  for markerId, hits in hitsI.markerHits.items()}
                    for markerId, hits in hitsJ.markerHits.items():
                        mergedHits.setdefault(markerId, []).extend(hits)

                    geneCountsMerged = hitsI.geneCounts(
                        binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                        mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                    if not (completenessMerged >= minMergedComp
                            and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI,
                                                         completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI,
                                                          contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write(
                            '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
                            % (binIdI, binIdJ, completenessI, contaminationI,
                               completenessJ, contaminationJ, deltaComp, deltaCont,
                               delta, completenessMerged, contaminationMerged))

        return outputFile
Exemple #6
0
    def run(self, binFiles, outDir, hmmTableFile,
                binIdToModels, binIdToBinMarkerSets,
                minDeltaComp, maxDeltaCont,
                minMergedComp, maxMergedCont):
        """Report pairs of bins whose merger improves completeness.

        For every pair of bins the marker hits are pooled and the
        completeness/contamination of the pooled hits is compared with the
        values of the individual bins. Pairs passing all four thresholds
        are written as rows of ``merger.tsv`` in outDir.

        Parameters:
          binFiles -- not used by this method
          outDir -- existing directory holding the HMM results; the output
              table is written here
          hmmTableFile -- forwarded to ResultsParser.parseBinHits
          binIdToModels -- forwarded to ResultsParser
          binIdToBinMarkerSets -- mapping of bin id to an object providing
              mostSpecificMarkerSet()
          minDeltaComp -- minimum gain in completeness over the better bin
          maxDeltaCont -- exclusive upper bound on the gain in contamination
          minMergedComp -- minimum completeness of the merged bin
          maxMergedCont -- exclusive upper bound on merged contamination

        Returns:
          Path of the written ``merger.tsv`` file.

        Exits the program with a non-zero status if the bins do not all
        share the same marker genes.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set
        # (nothing to compare when no marker sets were supplied)
        if binIdToBinMarkerSets:
            # next(iter(...)) works on Python 2 and 3; keys()[0] fails on Python 3
            firstBinId = next(iter(binIdToBinMarkerSets))
            markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
            for binIdJ in binIdToBinMarkerSets:
                if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                    self.logger.error('  [Error] All bins must use the same marker set to assess potential mergers.')
                    # an error must not be reported as a successful exit
                    sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id 1\tBin Id 2')
            fout.write('\tBin 1 completeness\tBin 1 contamination')
            fout.write('\tBin 2 completeness\tBin 2 contamination')
            fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
            fout.write('\tMerged completeness\tMerged contamination\n')

            for i, binIdI in enumerate(binIds):
                hitsI = binMarkerHits[binIdI]

                # elements 6 and 7 of the geneCounts result are used as
                # completeness and contamination
                geneCountsI = hitsI.geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(), hitsI.markerHits, True)
                completenessI, contaminationI = geneCountsI[6:8]

                for binIdJ in binIds[i + 1:]:
                    hitsJ = binMarkerHits[binIdJ]

                    geneCountsJ = hitsJ.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), hitsJ.markerHits, True)
                    completenessJ, contaminationJ = geneCountsJ[6:8]

                    # merge together hits from both bins and calculate completeness and contamination
                    # (lists are copied so the per-bin hit lists are never modified)
                    mergedHits = {markerId: list(hits) for markerId, hits in hitsI.markerHits.items()}
                    for markerId, hits in hitsJ.markerHits.items():
                        mergedHits.setdefault(markerId, []).extend(hits)

                    geneCountsMerged = hitsI.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                    if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI, completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
                                   (binIdI, binIdJ,
                                    completenessI, contaminationI,
                                    completenessJ, contaminationJ,
                                    deltaComp, deltaCont, delta,
                                    completenessMerged, contaminationMerged))

        return outputFile