Beispiel #1
0
    def treeQA(self, options):
        """Run the tree_qa command.

        Verifies that ``options.tree_dir`` exists, loads the per-bin HMM
        model information stored beneath it, parses the phylogenetic marker
        results and has TreeParser print a summary.

        Attributes read from ``options``: ``tree_dir``, ``out_format``,
        ``bTabTable`` and ``file``.
        """
        self.logger.info(
            '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.'
        )

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # load the HMM models recorded for each bin
        modelInfoPath = os.path.join(treeDir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        modelsPerBin = MarkerSetParser().loadBinModels(modelInfoPath)

        # parse bin statistics and marker gene hits
        resultsParser = ResultsParser(modelsPerBin)
        phyloBinStats = resultsParser.analyseResults(
            treeDir,
            DefaultValues.BIN_STATS_PHYLO_OUT,
            DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # report the summary in the requested format
        TreeParser().printSummary(options.out_format, treeDir, resultsParser,
                                  options.bTabTable, options.file,
                                  phyloBinStats)

        # an empty string for options.file means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Beispiel #2
0
    def tree(self, options):
        """Run the tree command.

        Validates the input bins, searches them for the phylogenetic marker
        HMMs, records which models were used for each bin, computes bin
        statistics, aligns the identified markers and finally hands the
        alignments to pplacer.

        Attributes read from ``options``: ``bin_dir``, ``extension``,
        ``bCalledGenes``, ``output_dir``, ``threads``, ``bKeepAlignment``,
        ``bNucORFs``, ``pplacer_threads`` and ``bReducedTree``.

        Returns early (None) when the sequence check on the bins fails.
        """
        self.logger.info(
            '[CheckM - tree] Placing bins in reference genome tree.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are checked as protein, everything else as nucleotide
        seqCheck = (checkProteinSeqs
                    if options.bCalledGenes else checkNuclotideSeqs)
        if not seqCheck(binFiles):
            return

        # output layout: <output_dir>/bins and <output_dir>/storage
        outDir = options.output_dir
        storageDir = os.path.join(outDir, 'storage')
        makeSurePathExists(os.path.join(outDir, 'bins'))
        makeSurePathExists(storageDir)

        # search each bin for the phylogenetic marker HMMs
        markerFinder = MarkerGeneFinder(options.threads)
        binIdToModels = markerFinder.find(
            binFiles, outDir,
            DefaultValues.HMMER_TABLE_PHYLO_OUT,
            DefaultValues.HMMER_PHYLO_OUT,
            DefaultValues.PHYLO_HMM_MODELS,
            options.bKeepAlignment, options.bNucORFs, options.bCalledGenes)

        # persist the models used for each bin
        modelInfoPath = os.path.join(storageDir,
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        MarkerSetParser(options.threads).writeBinModels(binIdToModels,
                                                        modelInfoPath)

        # per-bin statistics
        BinStatistics(options.threads).calculate(
            binFiles, outDir, DefaultValues.BIN_STATS_PHYLO_OUT)

        # align the marker genes that were found
        aligner = HmmerAligner(options.threads)
        resultsParser = aligner.makeAlignmentToPhyloMarkers(
            outDir, DefaultValues.PHYLO_HMM_MODELS,
            DefaultValues.HMMER_TABLE_PHYLO_OUT, binIdToModels, False,
            DefaultValues.E_VAL, DefaultValues.LENGTH, False,
            os.path.join(storageDir, 'tree'))

        # pplacer takes its own thread count, separate from options.threads.
        # NOTE(review): the earlier comment claimed this was fixed at one
        # thread to limit memory use - confirm the intended default.
        placer = PplacerRunner(threads=options.pplacer_threads)
        placer.run(binFiles, resultsParser, outDir, options.bReducedTree)

        self.timeKeeper.printTimeStamp()
Beispiel #3
0
    def qa(self, options):
        """Run the qa command.

        Checks the analyze directory (and the optional exclude-markers
        file), runs the amino acid identity step, loads the per-bin HMM
        models and marker sets, parses the HMMER results with the
        user-supplied thresholds, then prints and caches the summary.

        Attributes read from ``options``: ``analyze_dir``,
        ``exclude_markers``, ``aai_strain``, ``alignment_file``, ``threads``,
        ``marker_file``, ``bIgnoreThresholds``, ``e_value``, ``length``,
        ``bSkipPseudoGeneCorrection``, ``bSkipAdjCorrection``,
        ``out_format``, ``bIndividualMarkers``, ``coverage_file``,
        ``bTabTable`` and ``file``.
        """
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        # the exclusion list is optional; only validate it when supplied
        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # AAI between markers with multiple hits in a single bin
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, analyzeDir, options.alignment_file)

        # HMM models and marker sets for each bin
        parser = MarkerSetParser(options.threads)
        modelInfoPath = os.path.join(analyzeDir, 'storage',
                                     DefaultValues.CHECKM_HMM_MODEL_INFO)
        binIdToModels = parser.loadBinModels(modelInfoPath)
        binIdToBinMarkerSets = parser.getMarkerSets(
            analyzeDir, getBinIdsFromOutDir(analyzeDir),
            options.marker_file, options.exclude_markers)

        # parse the HMMER hits for every bin
        resultsParser = ResultsParser(binIdToModels)
        resultsParser.analyseResults(
            analyzeDir,
            DefaultValues.BIN_STATS_OUT,
            DefaultValues.HMMER_TABLE_OUT,
            bIgnoreThresholds=options.bIgnoreThresholds,
            evalueThreshold=options.e_value,
            lengthThreshold=options.length,
            bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
            bSkipAdjCorrection=options.bSkipAdjCorrection)

        # report, then cache the results in the analyze directory
        resultsParser.printSummary(
            options.out_format, aai, binIdToBinMarkerSets,
            options.bIndividualMarkers, options.coverage_file,
            options.bTabTable, options.file,
            anaFolder=analyzeDir)
        resultsParser.cacheResults(analyzeDir, binIdToBinMarkerSets,
                                   options.bIndividualMarkers)

        # an empty string for options.file means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Beispiel #4
0
    def __init__(self):
        """Set up the logger and load the bacterial and archaeal marker sets.

        The most specific marker set parsed from CHECKM_BAC_MS and from
        CHECKM_AR_MS is kept on ``self.bac_ms`` and ``self.ar_ms``. The two
        ``*_markers_on_contigs`` attributes are initialised to None.
        """
        self.logger = logging.getLogger('timestamp')

        marker_set_parser = MarkerSetParser()

        # bacterial marker set
        self.bac_ms = marker_set_parser.parseTaxonomicMarkerSetFile(
            CHECKM_BAC_MS).mostSpecificMarkerSet()

        # archaeal marker set
        self.ar_ms = marker_set_parser.parseTaxonomicMarkerSetFile(
            CHECKM_AR_MS).mostSpecificMarkerSet()

        self.bac_markers_on_contigs = None
        self.ar_markers_on_contigs = None
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile,
                     bKeepAlignment, bNucORFs, bCalledGenes, queueIn,
                     queueOut):
        """Worker loop: search each queued bin file against its marker HMMs.

        Bin files are taken from ``queueIn`` until a ``None`` sentinel is
        received. For every bin, ``(binId, hmmModelFile)`` is put on
        ``queueOut`` once the HMMER search has been issued.

        Parameters:
            outDir: root output directory; per-bin results are placed in
                ``<outDir>/bins/<binId>``.
            tableOut: file name of the HMMER table output inside the bin
                directory.
            hmmerOut: file name of the HMMER output inside the bin
                directory.
            markerFile: marker file handed to
                ``MarkerSetParser.createHmmModelFile``.
            bKeepAlignment: when false, ``--noali`` is added to the HMMER
                options; the flag is also forwarded to ``hmmer.search``.
            bNucORFs: forwarded to the Prodigal runner.
            bCalledGenes: when true, Prodigal is skipped and the bin file
                itself is used (and copied into the bin directory) as the
                gene file.
            queueIn: queue supplying bin file paths, terminated by None.
            queueOut: queue receiving ``(binId, hmmModelFile)`` tuples.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        # The HMMER command-line options do not depend on the bin, so build
        # them once rather than on every iteration.
        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmerOptions = ('--cpu ' + str(self.threadsPerSearch) +
                        ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            # None is the shutdown sentinel; identity is the correct test
            if binFile is None:
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal unless genes were already called
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                # the bin file is used directly as the gene file; a copy is
                # stored under the name DefaultValues.PRODIGAL_AA
                aaGeneFile = binFile
                shutil.copyfile(
                    aaGeneFile, os.path.join(binDir,
                                             DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(
                binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath,
                         hmmerOutPath, hmmerOptions, bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
        """Consume bin files from a queue and run the HMMER search on each.

        The loop ends when ``None`` is read from ``queueIn``. After each
        search, the tuple ``(binId, hmmModelFile)`` is placed on
        ``queueOut``.

        Parameters:
            outDir: root output directory; each bin gets
                ``<outDir>/bins/<binId>``.
            tableOut: name of the HMMER table file within the bin directory.
            hmmerOut: name of the HMMER output file within the bin
                directory.
            markerFile: marker file passed to
                ``MarkerSetParser.createHmmModelFile``.
            bKeepAlignment: if false, ``--noali`` is appended to the HMMER
                options; also passed through to ``hmmer.search``.
            bNucORFs: passed through to the Prodigal runner.
            bCalledGenes: if true, gene calling is skipped and the bin file
                is copied into the bin directory and searched directly.
            queueIn: input queue of bin file paths with a None terminator.
            queueOut: output queue of ``(binId, hmmModelFile)`` tuples.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        # loop-invariant HMMER options, assembled once up front
        keepAlignStr = '' if bKeepAlignment else '--noali'
        searchOptions = '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            # compare the sentinel by identity rather than equality
            if binFile is None:
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal only when genes have not been called already
            if bCalledGenes:
                aaGeneFile = binFile
                shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))
            else:
                prodigal = ProdigalRunner(binDir)
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                         searchOptions, bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
Beispiel #7
0
    def merge(self, options):
        """Run the merge command.

        Validates the bin directory and its sequences, rejects marker files
        of type ``BinMarkerSets.TREE_MARKER_SET``, searches every bin for
        marker genes and passes the results to Merger.

        Attributes read from ``options``: ``bin_dir``, ``extension``,
        ``bCalledGenes``, ``marker_file``, ``output_dir``, ``threads``,
        ``delta_comp``, ``delta_cont``, ``merged_comp`` and ``merged_cont``.

        Returns early (None) when the sequence check fails or the marker
        file is a tree marker set.
        """

        self.logger.info(
            '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
        )

        checkDirExists(options.bin_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are checked as protein, everything else as nucleotide
        seqCheck = (checkProteinSeqs
                    if options.bCalledGenes else checkNuclotideSeqs)
        if not seqCheck(binFiles):
            return

        markerSetParser = MarkerSetParser()
        markerType = markerSetParser.markerFileType(options.marker_file)
        if markerType == BinMarkerSets.TREE_MARKER_SET:
            self.logger.error(
                'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
            )
            return

        # output layout: bins/, storage/ and storage/hmms/ under output_dir
        outDir = options.output_dir
        storageDir = os.path.join(outDir, 'storage')
        makeSurePathExists(outDir)
        makeSurePathExists(os.path.join(outDir, 'bins'))
        makeSurePathExists(storageDir)
        makeSurePathExists(os.path.join(storageDir, 'hmms'))

        binIds = [binIdFromFilename(binFile) for binFile in binFiles]

        # search every bin for marker genes
        markerFinder = MarkerGeneFinder(options.threads)
        binIdToModels = markerFinder.find(binFiles, outDir,
                                          "merger.table.txt",
                                          "merger.hmmer3",
                                          options.marker_file, False, False,
                                          options.bCalledGenes)

        # marker sets for each bin (a fresh parser is created here, as in
        # the original flow)
        markerSetParser = MarkerSetParser()
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            outDir, binIds, options.marker_file)

        # compare the markers found in each bin
        outputFile = Merger().run(binFiles, outDir, "merger.table.txt",
                                  binIdToModels, binIdToBinMarkerSets,
                                  options.delta_comp, options.delta_cont,
                                  options.merged_comp, options.merged_cont)

        self.logger.info('Merger information written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Beispiel #8
0
    def lineageSet(self, options, db=None):
        """Run the lineage_set command.

        Loads the per-bin HMM models stored under ``options.tree_dir``,
        parses the phylogenetic marker results and asks TreeParser to
        produce the bin marker sets for ``options.marker_file``.

        Side effect: ``options.num_genomes_markers``, ``options.bootstrap``
        and ``options.bRequireTaxonomy`` are overwritten with fixed values
        before they are used.

        Attributes read from ``options``: ``tree_dir``, ``marker_file``,
        ``bNoLineageSpecificRefinement``, ``bForceDomain``, ``unique`` and
        ``multi``. The ``db`` parameter is not used in this method.
        """
        self.logger.info(
            '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # load the HMM models recorded for each bin
        modelInfoPath = os.path.join(treeDir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        binIdToModels = MarkerSetParser().loadBinModels(modelInfoPath)

        # parse bin statistics and marker gene hits
        resultsParser = ResultsParser(binIdToModels)
        resultsParser.analyseResults(treeDir,
                                     DefaultValues.BIN_STATS_PHYLO_OUT,
                                     DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # These options are incompatible with how the lineage-specific
        # marker set is selected, so their values are hard-coded here.
        for optionName, fixedValue in (('num_genomes_markers', 2),
                                       ('bootstrap', 0),
                                       ('bRequireTaxonomy', False)):
            setattr(options, optionName, fixedValue)

        TreeParser().getBinMarkerSets(
            treeDir, options.marker_file, options.num_genomes_markers,
            options.bootstrap, options.bNoLineageSpecificRefinement,
            options.bForceDomain, options.bRequireTaxonomy, resultsParser,
            options.unique, options.multi)

        self.logger.info('Marker set written to: ' + options.marker_file)

        self.timeKeeper.printTimeStamp()
Beispiel #9
0
    def analyze(self, options, db=None):
        """Run the analyze command.

        Validates the input bins, searches them for marker genes, stores the
        HMM models used for each bin, aligns markers with multiple hits
        within a bin, computes bin statistics and, when
        ``options.bAlignTopHit`` is set, aligns the top hit to each marker
        and writes ``alignment_info.tsv`` into
        ``<output_dir>/storage/alignments``.

        Attributes read from ``options``: ``bin_dir``, ``extension``,
        ``bCalledGenes``, ``output_dir``, ``threads``, ``marker_file``,
        ``bKeepAlignment``, ``bNucORFs`` and ``bAlignTopHit``. The ``db``
        parameter is not used in this method.

        Returns early (None) when the sequence check on the bins fails.
        """
        self.logger.info(
            '[CheckM - analyze] Identifying marker genes in bins.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        if not options.bCalledGenes:
            if not checkNuclotideSeqs(binFiles):
                return
        else:
            if not checkProteinSeqs(binFiles):
                return

        # setup directory structure
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
        makeSurePathExists(
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        # find marker genes in genome bins
        mgf = MarkerGeneFinder(options.threads)
        binIdToModels = mgf.find(binFiles, options.output_dir,
                                 DefaultValues.HMMER_TABLE_OUT,
                                 DefaultValues.HMMER_OUT, options.marker_file,
                                 options.bKeepAlignment, options.bNucORFs,
                                 options.bCalledGenes)

        markerSetParser = MarkerSetParser(options.threads)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, getBinIdsFromOutDir(options.output_dir),
            options.marker_file)

        # record which HMM models were used for each bin
        hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

        self.timeKeeper.printTimeStamp()

        # HMM model file: use the user's file only when it is itself a set
        # of HMM models, otherwise fall back to the default models
        if markerSetParser.markerFileType(
                options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
            markerFile = options.marker_file
        else:
            markerFile = DefaultValues.HMM_MODELS

        # align marker genes with multiple hits within a bin
        HA = HmmerAligner(options.threads)
        HA.makeAlignmentsOfMultipleHits(
            options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
            binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            os.path.join(options.output_dir, 'storage', 'aai_qa'))

        self.timeKeeper.printTimeStamp()

        # calculate statistics for each genome bin
        binStats = BinStatistics(options.threads)
        binStats.calculate(binFiles, options.output_dir,
                           DefaultValues.BIN_STATS_OUT)

        self.timeKeeper.printTimeStamp()

        # align top hit to each marker if requested
        if options.bAlignTopHit:
            alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                                 'alignments')
            makeSurePathExists(alignmentOutputFolder)

            # NOTE(review): this passes options.marker_file whereas the
            # multiple-hit alignment above passes the resolved markerFile -
            # confirm that the difference is intentional.
            HA = HmmerAligner(options.threads)
            resultsParser = HA.makeAlignmentTopHit(
                options.output_dir, options.marker_file,
                DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
                DefaultValues.E_VAL, DefaultValues.LENGTH, True,
                alignmentOutputFolder)

            # Marker lengths are reported from the models of the first bin
            # only. Look that bin up once; with no bins at all, only the
            # header line is written instead of raising IndexError.
            firstBinId = next(iter(resultsParser.models.keys()), None)
            if firstBinId is None:
                markerModels = {}
            else:
                markerModels = resultsParser.models[firstBinId]

            # report marker gene data; the context manager closes the file
            # even if a write fails
            alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                             'alignment_info.tsv')
            with open(alignmentInfoFile, 'w') as fout:
                fout.write('Marker Id\tLength (bp)\n')
                for markerId, model in markerModels.items():
                    fout.write('%s\t%d\n' % (markerId, model.leng))

            self.logger.info('Alignments to top hits stored in: ' +
                             alignmentOutputFolder)

            self.timeKeeper.printTimeStamp()