Example #1
0
    def treeQA(self, options):
        """Report the phylogenetic marker genes identified in each bin.

        Reads the HMM model information stored below options.tree_dir,
        parses the phylogenetic marker results for every bin and passes
        them to TreeParser.printSummary using the output settings found
        on options (out_format, bTabTable, file).
        """
        self.logger.info(
            '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.'
        )

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # look up the HMM models recorded for each bin
        modelParser = MarkerSetParser()
        modelInfoPath = os.path.join(treeDir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        modelsByBin = modelParser.loadBinModels(modelInfoPath)

        # gather marker gene statistics for every bin
        parser = ResultsParser(modelsByBin)
        statsByBin = parser.analyseResults(
            treeDir,
            DefaultValues.BIN_STATS_PHYLO_OUT,
            DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # summarise the taxonomy of each bin
        TreeParser().printSummary(options.out_format, treeDir, parser,
                                  options.bTabTable, options.file, statsByBin)

        # an empty file name means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Example #2
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        HMM hits are parsed for every bin in binIdToModels. The bin ids are
        then queued and consumed by self.totalThreads worker processes
        running self.__createMSA, while one extra process runs
        self.__reportBinProgress on the writer queue.

        Parameters
        ----------
        outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
            Forwarded to ResultsParser.parseBinHits.
        markerFile, binIdToBinMarkerSets
            Forwarded unchanged to the worker processes.
        binIdToModels
            Mapping whose keys are the bin ids to process.
        alignOutputDir
            Directory for the alignments; created if it does not exist.

        Raises
        ------
        BaseException
            Any exception raised while the processes are being started or
            joined (including KeyboardInterrupt) is re-raised after all live
            processes have been terminated, so that an aborted run is not
            mistaken for a completed one.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one sentinel per worker so that every worker sees the end of the queue
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # build the process objects before entering the try block so the
        # cleanup code below can never hit an unbound name
        workerArgs = (resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)
        calcProc = [mp.Process(target=self.__createMSA, args=workerArgs) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # all workers are done: tell the progress reporter to stop
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; processes that never
            # started (or already finished) are skipped because terminate()
            # is only valid on a started process
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # propagate the failure instead of silently returning
            raise
Example #3
0
    def makeAlignmentsOfMultipleHits(self,
                                       outDir,
                                       markerFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       binIdToBinMarkerSets,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       alignOutputDir,
                                       ):
        """Align markers with multiple hits within a bin.

        HMM hits are parsed for every bin in binIdToModels. The bin ids are
        then queued and consumed by self.totalThreads worker processes
        running self.__createMSA, while one extra process runs
        self.__reportBinProgress on the writer queue.

        Parameters
        ----------
        outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
            Forwarded to ResultsParser.parseBinHits.
        markerFile, binIdToBinMarkerSets
            Forwarded unchanged to the worker processes.
        binIdToModels
            Mapping whose keys are the bin ids to process.
        alignOutputDir
            Directory for the alignments; created if it does not exist.

        Raises
        ------
        BaseException
            Any exception raised while the processes are being started or
            joined (including KeyboardInterrupt) is re-raised after all live
            processes have been terminated, so that an aborted run is not
            mistaken for a completed one.
        """

        makeSurePathExists(alignOutputDir)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # align any markers with multiple hits in a bin
        self.logger.info('  Aligning marker genes with multiple hits in a single bin:')

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binId in binIdToModels:
            workerQueue.put(binId)

        # one sentinel per worker so that every worker sees the end of the queue
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # build the process objects before entering the try block so the
        # cleanup code below can never hit an unbound name
        workerArgs = (resultsParser, binIdToBinMarkerSets, markerFile, outDir, alignOutputDir, workerQueue, writerQueue)
        calcProc = [mp.Process(target=self.__createMSA, args=workerArgs) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportBinProgress, args=(len(binIdToModels), writerQueue))

        try:
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # all workers are done: tell the progress reporter to stop
            writerQueue.put(None)
            writeProc.join()
        except BaseException:
            # make sure all processes are terminated; processes that never
            # started (or already finished) are skipped because terminate()
            # is only valid on a started process
            for p in calcProc + [writeProc]:
                if p.is_alive():
                    p.terminate()

            # propagate the failure instead of silently returning
            raise
Example #4
0
    def makeAlignmentToPhyloMarkers(self,
                                    outDir,
                                    hmmModelFile,
                                    hmmTableFile,
                                    binIdToModels,
                                    bIgnoreThresholds,
                                    evalueThreshold,
                                    lengthThreshold,
                                    bReportHitStats,
                                    alignOutputDir,
                                    bKeepUnmaskedAlign=False):
        """Align hits to a set of common marker genes.

        Parameters
        ----------
        outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
            Forwarded to ResultsParser.parseBinHits.
        hmmModelFile
            HMM file from which the individual alignment models are made.
        binIdToModels
            Mapping from bin id to its models; must not be empty.
        bReportHitStats, bKeepUnmaskedAlign
            Forwarded to the marker gene alignment step.
        alignOutputDir
            Directory for the alignments; created if it does not exist.

        Returns
        -------
        The ResultsParser holding the parsed HMM hits.

        The temporary per-marker HMM files are removed even when creating the
        models or aligning the marker genes raises an exception.
        """

        self.logger.info("Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False,
                                   bIgnoreThresholds, evalueThreshold,
                                   lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(
            outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence alignments
        # NOTE(review): only the first bin's models are used; presumably all
        # bins share the same models here - confirm against the caller
        binId = list(binIdToModels.keys())[0]
        hmmModelFiles = {}
        try:
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId],
                                       hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats,
                                    hmmModelFiles, alignOutputDir,
                                    bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files, also on failure
            for modelPath in hmmModelFiles.values():
                os.remove(modelPath)

        return resultsParser
Example #5
0
    def qa(self, options):
        """Tabulate quality statistics for the bins of an earlier analysis.

        Works on the contents of options.analyze_dir: runs the amino acid
        identity calculation, loads the HMM models and marker sets of each
        bin, parses the marker hits and finally prints and caches the
        resulting summary.
        """
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        # the exclusion file is optional, but must exist when it is given
        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # calculate AAI between markers with multiple hits in a single bin
        aaiCalculator = AminoAcidIdentity()
        aaiCalculator.run(options.aai_strain, analyzeDir,
                          options.alignment_file)

        # HMM models and marker sets assigned to each bin
        setParser = MarkerSetParser(options.threads)
        modelInfoPath = os.path.join(analyzeDir, 'storage',
                                     DefaultValues.CHECKM_HMM_MODEL_INFO)
        modelsByBin = setParser.loadBinModels(modelInfoPath)

        binIds = getBinIdsFromOutDir(analyzeDir)
        markerSetsByBin = setParser.getMarkerSets(analyzeDir, binIds,
                                                  options.marker_file,
                                                  options.exclude_markers)

        # parse the results of each bin with the user supplied settings
        parseSettings = {
            'bIgnoreThresholds': options.bIgnoreThresholds,
            'evalueThreshold': options.e_value,
            'lengthThreshold': options.length,
            'bSkipPseudoGeneCorrection': options.bSkipPseudoGeneCorrection,
            'bSkipAdjCorrection': options.bSkipAdjCorrection,
        }
        parser = ResultsParser(modelsByBin)
        parser.analyseResults(analyzeDir,
                              DefaultValues.BIN_STATS_OUT,
                              DefaultValues.HMMER_TABLE_OUT,
                              **parseSettings)

        # report the results, then cache them in the analysis directory
        parser.printSummary(options.out_format,
                            aaiCalculator,
                            markerSetsByBin,
                            options.bIndividualMarkers,
                            options.coverage_file,
                            options.bTabTable,
                            options.file,
                            anaFolder=options.analyze_dir)
        parser.cacheResults(analyzeDir, markerSetsByBin,
                            options.bIndividualMarkers)

        # an empty file name means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Example #6
0
    def markerPlot(self, options):
        """Create a marker gene position plot for every bin.

        Bins are taken from options.bin_dir, the marker gene and bin
        statistics from options.results_dir. One image per plotted bin is
        saved in options.output_dir (created when missing).
        """

        self.logger.info(
            '[CheckM - marker_plot] Creating marker gene position plot.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        # collect the bins to plot
        binFiles = self.binFiles(options.bin_dir, options.extension)

        # statistics written by an earlier run
        statsReader = ResultsParser(None)
        geneStatsByBin = statsReader.parseMarkerGeneStats(options.results_dir)
        binStatsByBin = statsReader.parseBinStatsExt(options.results_dir)

        plotter = MarkerGenePosPlot(options)
        for position, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            self.logger.info(
                'Plotting marker gene position plot for %s (%d of %d)' %
                (binId, position, len(binFiles)))

            # bins lacking either kind of statistics have no marker genes
            if not (binId in geneStatsByBin and binId in binStatsByBin):
                continue

            plotted = plotter.plot(binFile, geneStatsByBin[binId],
                                   binStatsByBin[binId])
            if not plotted:
                self.logger.info('No marker genes found in bin.')
                continue

            imagePath = os.path.join(
                options.output_dir,
                binId) + '.marker_pos_plot.' + options.image_type
            plotter.savePlot(imagePath, dpi=options.dpi)
            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Example #7
0
    def makeAlignmentToPhyloMarkers(self,
                                       outDir,
                                       hmmModelFile,
                                       hmmTableFile,
                                       binIdToModels,
                                       bIgnoreThresholds,
                                       evalueThreshold,
                                       lengthThreshold,
                                       bReportHitStats,
                                       alignOutputDir,
                                       bKeepUnmaskedAlign=False
                                       ):
        """Align hits to a set of common marker genes.

        Parameters
        ----------
        outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
            Forwarded to ResultsParser.parseBinHits.
        hmmModelFile
            HMM file from which the individual alignment models are made.
        binIdToModels
            Mapping from bin id to its models; must not be empty.
        bReportHitStats, bKeepUnmaskedAlign
            Forwarded to the marker gene alignment step.
        alignOutputDir
            Directory for the alignments; created if it does not exist.

        Returns
        -------
        The ResultsParser holding the parsed HMM hits.

        The temporary per-marker HMM files are removed even when creating the
        models or aligning the marker genes raises an exception.
        """

        self.logger.info("  Extracting marker genes to align.")

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

        # extract the ORFs to align
        markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

        # generate individual HMMs required to create multiple sequence alignments
        # list(...) is required because dict views cannot be indexed on Python 3
        # NOTE(review): only the first bin's models are used; presumably all
        # bins share the same models here - confirm against the caller
        binId = list(binIdToModels.keys())[0]
        hmmModelFiles = {}
        try:
            self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

            # align each of the marker genes
            makeSurePathExists(alignOutputDir)
            self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats, hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)
        finally:
            # remove the temporary HMM files, also on failure
            for modelPath in hmmModelFiles.values():
                os.remove(modelPath)

        return resultsParser
Example #8
0
    def binQAPlot(self, options):
        """Create a bar plot summarising the quality of all bins.

        Bins come from options.bin_dir and their extended statistics from
        options.results_dir. Unless options.bIgnoreHetero is set, the amino
        acid identity calculation is run first and its aaiHetero result is
        handed to the plot. The image is saved in options.output_dir.
        """

        self.logger.info(
            '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # extended bin statistics written by an earlier run
        statsByBin = ResultsParser(None).parseBinStatsExt(options.results_dir)

        qaPlot = BinQAPlot(options)

        # heterogeneity information is only computed when it is requested
        heteroInfo = None
        if not options.bIgnoreHetero:
            aaiCalculator = AminoAcidIdentity()
            aaiCalculator.run(options.aai_strain, options.results_dir, None)
            heteroInfo = aaiCalculator.aaiHetero

        plotCreated = qaPlot.plot(binFiles, statsByBin, options.bIgnoreHetero,
                                  heteroInfo)

        if plotCreated:
            imagePath = os.path.join(options.output_dir,
                                     'bin_qa_plot.' + options.image_type)
            qaPlot.savePlot(imagePath, dpi=options.dpi)

            self.logger.info('Plot written to: ' + imagePath)

        self.timeKeeper.printTimeStamp()
Example #9
0
    def lineageSet(self, options, db=None):
        """Infer lineage-specific marker sets for the bins in options.tree_dir.

        The phylogenetic marker results are parsed and passed to
        TreeParser.getBinMarkerSets, which is given options.marker_file as
        its marker file argument. Three attributes of options
        (num_genomes_markers, bootstrap, bRequireTaxonomy) are overwritten
        with fixed values. The db argument is not used.
        """
        self.logger.info(
            '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # look up the HMM models recorded for each bin
        modelParser = MarkerSetParser()
        modelInfoPath = os.path.join(treeDir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        modelsByBin = modelParser.loadBinModels(modelInfoPath)

        # gather marker gene statistics for every bin
        parser = ResultsParser(modelsByBin)
        parser.analyseResults(treeDir,
                              DefaultValues.BIN_STATS_PHYLO_OUT,
                              DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # These settings do not fit the way the lineage-specific marker set
        # is selected, so fixed values are forced onto the options object.
        options.num_genomes_markers = 2
        options.bootstrap = 0
        options.bRequireTaxonomy = False

        TreeParser().getBinMarkerSets(
            options.tree_dir,
            options.marker_file,
            options.num_genomes_markers,
            options.bootstrap,
            options.bNoLineageSpecificRefinement,
            options.bForceDomain,
            options.bRequireTaxonomy,
            parser,
            options.unique,
            options.multi)

        self.logger.info('Marker set written to: ' + options.marker_file)

        self.timeKeeper.printTimeStamp()
Example #10
0
    def verifyAnalyze(self, outdir):
        """Check the bin statistics written by the analyze command.

        The statistics of bin '637000110' found in outdir are compared
        against independently verified ground truth values; the first
        mismatch raises an AssertionError with a descriptive message.
        """

        statsByBin = ResultsParser(None).parseBinStats(
            outdir, DefaultValues.BIN_STATS_OUT)

        refBin = '637000110'
        genomeLength = 4646332

        # floating point statistics are compared approximately
        np.testing.assert_almost_equal(statsByBin[refBin]['GC'], 0.508,
                                       decimal=3, err_msg="Failed GC test")
        np.testing.assert_almost_equal(statsByBin[refBin]['GC std'], 0.0,
                                       err_msg="Failed GC std test")

        # 'Coding density' and '# predicted genes' are deliberately left out:
        # they depend on the exact version of prodigal
        exactChecks = (
            ('# contigs', 1, "Failed # contigs test"),
            ('# scaffolds', 1, "Failed # scaffolds test"),
            ('Longest contig', genomeLength, "Failed longest contig test"),
            ('Longest scaffold', genomeLength, "Failed longest scaffold test"),
            ('N50 (contigs)', genomeLength, "Failed N50 (contigs) test"),
            ('N50 (scaffolds)', genomeLength, "Failed N50 (scaffolds) test"),
            ('Genome size', genomeLength, "Failed genome size test"),
        )
        for field, expected, failureMsg in exactChecks:
            np.testing.assert_equal(statsByBin[refBin][field], expected,
                                    err_msg=failureMsg)
Example #11
0
    def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
            binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
            maxMergedCont):
        """Identify pairs of bins that are candidates for merging.

        For every pair of bins the marker hits are pooled and completeness
        and contamination are recalculated. A pair is written to
        'merger.tsv' in outDir when the merged bin satisfies
        completeness >= minMergedComp and contamination < maxMergedCont, and
        the change relative to the better of the two bins satisfies
        delta completeness >= minDeltaComp and
        delta contamination < maxDeltaCont.

        Parameters
        ----------
        binFiles
            Not used by this method.
        outDir
            Existing directory holding the HMM results; also receives the
            output table.
        hmmTableFile
            Forwarded to ResultsParser.parseBinHits.
        binIdToModels
            Passed to the ResultsParser constructor.
        binIdToBinMarkerSets
            Mapping from bin id to its marker sets; all bins must share the
            same most specific marker set, otherwise the program exits with
            a non-zero status.

        Returns
        -------
        Path of the written table.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set
        # (list(...) is required: dict views cannot be indexed on Python 3)
        firstBinId = list(binIdToBinMarkerSets.keys())[0]
        markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
        for binIdJ in binIdToBinMarkerSets:
            if markerGenesI != binIdToBinMarkerSets[
                    binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                self.logger.error(
                    '  [Error] All bins must use the same marker set to assess potential mergers.'
                )
                # an error must not be reported to the shell as success
                sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id 1\tBin Id 2')
            fout.write('\tBin 1 completeness\tBin 1 contamination')
            fout.write('\tBin 2 completeness\tBin 2 contamination')
            fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
            fout.write('\tMerged completeness\tMerged contamination\n')

            for i, binIdI in enumerate(binIds):
                hitsI = binMarkerHits[binIdI]

                # elements 6 and 7 of the gene counts are used as
                # completeness and contamination
                geneCountsI = hitsI.geneCounts(
                    binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                    hitsI.markerHits, True)
                completenessI, contaminationI = geneCountsI[6:8]

                for binIdJ in binIds[i + 1:]:
                    hitsJ = binMarkerHits[binIdJ]

                    geneCountsJ = hitsJ.geneCounts(
                        binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                        hitsJ.markerHits, True)
                    completenessJ, contaminationJ = geneCountsJ[6:8]

                    # merge together hits from both bins and calculate
                    # completeness and contamination; the hit lists are
                    # copied so the per-bin results are never modified
                    mergedHits = {}
                    for markerId, hits in hitsI.markerHits.items():
                        mergedHits[markerId] = list(hits)

                    for markerId, hits in hitsJ.markerHits.items():
                        mergedHits.setdefault(markerId, []).extend(hits)

                    geneCountsMerged = hitsI.geneCounts(
                        binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                        mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                    if not (completenessMerged >= minMergedComp
                            and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI,
                                                         completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI,
                                                          contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write(
                            '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
                            % (binIdI, binIdJ, completenessI, contaminationI,
                               completenessJ, contaminationJ, deltaComp,
                               deltaCont, delta, completenessMerged,
                               contaminationMerged))

        return outputFile
Example #12
0
    def run(self, binFiles, outDir, hmmTableFile,
                binIdToModels, binIdToBinMarkerSets,
                minDeltaComp, maxDeltaCont,
                minMergedComp, maxMergedCont):
        """Identify pairs of bins that are candidates for merging.

        For every pair of bins the marker hits are pooled and completeness
        and contamination are recalculated. A pair is written to
        'merger.tsv' in outDir when the merged bin satisfies
        completeness >= minMergedComp and contamination < maxMergedCont, and
        the change relative to the better of the two bins satisfies
        delta completeness >= minDeltaComp and
        delta contamination < maxDeltaCont.

        Parameters
        ----------
        binFiles
            Not used by this method.
        outDir
            Existing directory holding the HMM results; also receives the
            output table.
        hmmTableFile
            Forwarded to ResultsParser.parseBinHits.
        binIdToModels
            Passed to the ResultsParser constructor.
        binIdToBinMarkerSets
            Mapping from bin id to its marker sets; all bins must share the
            same most specific marker set, otherwise the program exits with
            a non-zero status.

        Returns
        -------
        Path of the written table.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set
        # (list(...) is required: dict views cannot be indexed on Python 3)
        firstBinId = list(binIdToBinMarkerSets.keys())[0]
        markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
        for binIdJ in binIdToBinMarkerSets:
            if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                self.logger.error('  [Error] All bins must use the same marker set to assess potential mergers.')
                # an error must not be reported to the shell as success
                sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('Bin Id 1\tBin Id 2')
            fout.write('\tBin 1 completeness\tBin 1 contamination')
            fout.write('\tBin 2 completeness\tBin 2 contamination')
            fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
            fout.write('\tMerged completeness\tMerged contamination\n')

            for i, binIdI in enumerate(binIds):
                hitsI = binMarkerHits[binIdI]

                # elements 6 and 7 of the gene counts are used as
                # completeness and contamination
                geneCountsI = hitsI.geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(), hitsI.markerHits, True)
                completenessI, contaminationI = geneCountsI[6:8]

                for binIdJ in binIds[i + 1:]:
                    hitsJ = binMarkerHits[binIdJ]

                    geneCountsJ = hitsJ.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), hitsJ.markerHits, True)
                    completenessJ, contaminationJ = geneCountsJ[6:8]

                    # merge together hits from both bins and calculate
                    # completeness and contamination; the hit lists are
                    # copied so the per-bin results are never modified
                    mergedHits = {}
                    for markerId, hits in hitsI.markerHits.items():
                        mergedHits[markerId] = list(hits)

                    for markerId, hits in hitsJ.markerHits.items():
                        mergedHits.setdefault(markerId, []).extend(hits)

                    geneCountsMerged = hitsI.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                    if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI, completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
                                   (binIdI, binIdJ,
                                    completenessI, contaminationI,
                                    completenessJ, contaminationJ,
                                    deltaComp, deltaCont, delta,
                                    completenessMerged, contaminationMerged))

        return outputFile