Example #1
0
    def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
        """Calculate AAI between multi-copy marker genes within each bin.

        For every bin directory under <outDir>/storage/aai_qa, each
        '*.masked.faa' file is read and the AAI of every pair of sequences
        in that file is computed with self.aai().

        Parameters
        ----------
        aaiStrainThreshold
            Threshold forwarded unchanged to self.strainHetero().
        outDir
            Output directory used to look up bin ids and AAI alignments.
        alignmentOutputFile
            Optional path; when truthy, each compared pair of sequences and
            its AAI is written to this file.

        Side effects
        ------------
        Appends to self.aaiRawScores[binId][markerId] and sets
        self.aaiHetero and self.aaiMeanBinHetero. Terminates the program
        with exit status 1 if two sequences in the same file carry
        different bin id prefixes.
        """

        self.logger.info('Calculating AAI between multi-copy marker genes.')

        # the alignment report is optional; None means "do not write it"
        fout = open(alignmentOutputFile, 'w') if alignmentOutputFile else None

        try:
            # calculate AAI for duplicate marker genes
            binIds = getBinIdsFromOutDir(outDir)
            aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
            for binId in binIds:
                binPath = os.path.join(aaiOutputDir, binId)
                if not os.path.exists(binPath):
                    continue

                for f in os.listdir(binPath):
                    if not f.endswith('.masked.faa'):
                        continue

                    # marker id is the file name up to the first '.'
                    markerId = f[0:f.find('.')]

                    seqs = readFasta(os.path.join(binPath, f))

                    # materialise the key order once instead of once per pair
                    seqIds = list(seqs.keys())

                    # calculate AAI between all pairs of seqs
                    for i, seqIdI in enumerate(seqIds):
                        binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]
                        seqI = seqs[seqIdI]

                        for seqIdJ in seqIds[i + 1:]:
                            binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]
                            seqJ = seqs[seqIdJ]

                            if binIdI != binIdJ:
                                # something is wrong as the bin Ids should always be the same
                                self.logger.error('  [Error] Bin ids do not match.')
                                sys.exit(1)

                            aai = self.aai(seqI, seqJ)

                            if fout is not None:
                                fout.write(binId + ',' + markerId + '\n')
                                fout.write(seqIdI + '\t' + seqI + '\n')
                                fout.write(seqIdJ + '\t' + seqJ + '\n')
                                fout.write('AAI: %.3f\n' % aai)
                                fout.write('\n')

                            if binIdI not in self.aaiRawScores:
                                self.aaiRawScores[binIdI] = defaultdict(list)
                            self.aaiRawScores[binIdI][markerId].append(aai)
        finally:
            # close the report even when sys.exit() or an exception unwinds
            if fout is not None:
                fout.close()

        # calculate strain heterogeneity for each marker gene in each bin
        self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
Example #2
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile,
                          binStats, bLineageStatistics):
        """Write a taxonomy table for every bin found in outDir.

        When bLineageStatistics is false a simple summary table is printed;
        otherwise the full table is printed, which additionally uses the
        sister-lineage taxonomy, insertion branch ids and lineage metadata
        of each bin.
        """
        # both the output directory and its tree storage directory must exist
        checkDirExists(outDir)
        treeDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(treeDir)

        # taxonomy is looked up for all bins present in the output directory
        binIds = getBinIdsFromOutDir(outDir)
        binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

        if bLineageStatistics:
            # the full table needs sister taxonomy, branch ids and metadata
            sisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
            branchIds = self.getInsertionBranchId(outDir, binIds)
            lineageStats = self.readLineageMetadata(outDir, binIds)

            self.__printFullTable(branchIds, binIdToTaxonomy, sisterTaxonomy,
                                  lineageStats, resultsParser, binStats,
                                  bTabTable, outFile)
            return

        self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser,
                                       bTabTable, outFile)
Example #3
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
        """Report the taxonomy of each bin in outDir as a table.

        Prints the simple summary table unless bLineageStatistics is set,
        in which case the full table (with sister-lineage taxonomy and
        lineage metadata) is printed instead.
        """
        # verify the output directory and its tree sub-directory are present
        checkDirExists(outDir)
        treeStorageDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(treeStorageDir)

        # collect the bins and their taxonomy
        allBinIds = getBinIdsFromOutDir(outDir)
        binIdToTaxonomy = self.getBinTaxonomy(outDir, allBinIds)

        # simple report requested: nothing further needs to be loaded
        if not bLineageStatistics:
            self.__printSimpleSummaryTable(
                binIdToTaxonomy,
                resultsParser,
                bTabTable,
                outFile)
            return

        # full report: also gather sister-lineage taxonomy and metadata
        binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, allBinIds)
        binIdToLineageStatistics = self.readLineageMetadata(outDir, allBinIds)
        self.__printFullTable(
            binIdToTaxonomy,
            binIdToSisterTaxonomy,
            binIdToLineageStatistics,
            resultsParser,
            binStats,
            bTabTable,
            outFile)
Example #4
0
    def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
        """Calculate AAI between multi-copy marker genes within each bin.

        For every bin directory under <outDir>/storage/aai_qa, each
        '*.masked.faa' file is read and the AAI of every pair of sequences
        in that file is computed with self.aai().

        Parameters
        ----------
        aaiStrainThreshold
            Threshold forwarded unchanged to self.strainHetero().
        outDir
            Output directory used to look up bin ids and AAI alignments.
        alignmentOutputFile
            Optional path; when truthy, each compared pair of sequences and
            its AAI is written to this file.

        Side effects
        ------------
        Appends to self.aaiRawScores[binId][markerId] and sets
        self.aaiHetero and self.aaiMeanBinHetero. Terminates the program
        with a non-zero exit status if two sequences in the same file carry
        different bin id prefixes.
        """

        self.logger.info('  Calculating AAI between multi-copy marker genes.')

        # the alignment report is optional; None means "do not write it"
        fout = open(alignmentOutputFile, 'w') if alignmentOutputFile else None

        try:
            # calculate AAI for duplicate marker genes
            binIds = getBinIdsFromOutDir(outDir)
            aaiOutputDir = os.path.join(outDir, 'storage', 'aai_qa')
            for binId in binIds:
                binPath = os.path.join(aaiOutputDir, binId)
                if not os.path.exists(binPath):
                    continue

                for f in os.listdir(binPath):
                    if not f.endswith('.masked.faa'):
                        continue

                    # marker id is the file name up to the first '.'
                    markerId = f[0:f.find('.')]

                    seqs = readFasta(os.path.join(binPath, f))

                    # list(...) works on both Python 2 and 3 (dict views are
                    # not indexable on Python 3) and is built only once
                    seqIds = list(seqs.keys())

                    # calculate AAI between all pairs of seqs
                    for i in range(0, len(seqIds)):
                        seqIdI = seqIds[i]
                        binIdI = seqIdI[0:seqIdI.find(DefaultValues.SEQ_CONCAT_CHAR)]
                        seqI = seqs[seqIdI]

                        for j in range(i + 1, len(seqIds)):
                            seqIdJ = seqIds[j]
                            binIdJ = seqIdJ[0:seqIdJ.find(DefaultValues.SEQ_CONCAT_CHAR)]
                            seqJ = seqs[seqIdJ]

                            if binIdI == binIdJ:
                                aai = self.aai(seqI, seqJ)

                                if fout is not None:
                                    fout.write(binId + ',' + markerId + '\n')
                                    fout.write(seqIdI + '\t' + seqI + '\n')
                                    fout.write(seqIdJ + '\t' + seqJ + '\n')
                                    fout.write('AAI: %.3f\n' % aai)
                                    fout.write('\n')

                                if binIdI not in self.aaiRawScores:
                                    self.aaiRawScores[binIdI] = defaultdict(list)
                                self.aaiRawScores[binIdI][markerId].append(aai)
                            else:
                                # something is wrong as the bin Ids should always be the same
                                self.logger.error('  [Error] Bin ids do not match.')
                                # exit with a failure status so callers can detect the error
                                sys.exit(1)
        finally:
            # close the report even when sys.exit() or an exception unwinds
            if fout is not None:
                fout.close()

        # calculate strain heterogeneity for each marker gene in each bin
        self.aaiHetero, self.aaiMeanBinHetero = self.strainHetero(self.aaiRawScores, aaiStrainThreshold)
Example #5
0
    def qa(self, options):
        """QA command: tabulate genome statistics for previously analyzed bins.

        Reads the analysis directory named by options.analyze_dir, computes
        AAI between multi-hit markers, parses the marker results for every
        bin, prints a summary in the requested format and caches the
        results in the analysis directory.
        """
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        # the file of markers to exclude is optional
        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # calculate AAI between markers with multiple hits in a single bin
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, analyzeDir, options.alignment_file)

        # load the HMM models recorded for each bin
        markerSetParser = MarkerSetParser(options.threads)
        modelInfoPath = os.path.join(analyzeDir,
                                     'storage',
                                     DefaultValues.CHECKM_HMM_MODEL_INFO)
        binIdToModels = markerSetParser.loadBinModels(modelInfoPath)

        # marker sets for every bin found in the analysis directory
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            analyzeDir,
            getBinIdsFromOutDir(analyzeDir),
            options.marker_file,
            options.exclude_markers)

        # get results for each bin
        parser = ResultsParser(binIdToModels)
        parser.analyseResults(
            analyzeDir,
            DefaultValues.BIN_STATS_OUT,
            DefaultValues.HMMER_TABLE_OUT,
            bIgnoreThresholds=options.bIgnoreThresholds,
            evalueThreshold=options.e_value,
            lengthThreshold=options.length,
            bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
            bSkipAdjCorrection=options.bSkipAdjCorrection)

        # report first, then cache the parsed results
        parser.printSummary(
            options.out_format,
            aai,
            binIdToBinMarkerSets,
            options.bIndividualMarkers,
            options.coverage_file,
            options.bTabTable,
            options.file,
            anaFolder=analyzeDir)
        parser.cacheResults(
            analyzeDir,
            binIdToBinMarkerSets,
            options.bIndividualMarkers)

        # an empty file name means no output file was requested
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Example #6
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
        """Print a table describing the taxonomy of each bin in outDir.

        A simple summary table is produced when bLineageStatistics is
        false. Otherwise the full table is produced, which also reports the
        sister-lineage taxonomy, the insertion branch id and the lineage
        metadata of each bin.
        """
        # the output directory and its tree storage directory are required
        checkDirExists(outDir)
        treePath = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(treePath)

        # bins to report and their taxonomy
        binIds = getBinIdsFromOutDir(outDir)
        taxonomyByBin = self.getBinTaxonomy(outDir, binIds)

        if bLineageStatistics:
            # extra per-bin information only needed by the full table
            sisterTaxonomyByBin = self.getBinSisterTaxonomy(outDir, binIds)
            uidByBin = self.getInsertionBranchId(outDir, binIds)
            lineageStatsByBin = self.readLineageMetadata(outDir, binIds)

            self.__printFullTable(uidByBin,
                                  taxonomyByBin,
                                  sisterTaxonomyByBin,
                                  lineageStatsByBin,
                                  resultsParser,
                                  binStats,
                                  bTabTable,
                                  outFile)
        else:
            self.__printSimpleSummaryTable(taxonomyByBin,
                                           resultsParser,
                                           bTabTable,
                                           outFile)
Example #7
0
    def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers,
                         bootstrap, bNoLineageSpecificRefinement, bForceDomain,
                         bRequireTaxonomy, resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin and write them to markerFile.

        The placement tree stored under <outDir>/storage/tree is loaded and,
        for every bin, marker sets are collected while walking from the
        bin's position in the tree towards the root. Bins that are not in
        the tree receive the marker set selected from the root node.

        Parameters
        ----------
        outDir
            Output directory holding the bins and the placement tree.
        markerFile
            Path of the lineage marker file to write (overwritten).
        numGenomesMarkers, bootstrap, bRequireTaxonomy
            Forwarded unchanged to the marker set selection helper.
        bNoLineageSpecificRefinement
            When true, lineage-specific marker removal is skipped.
        bForceDomain
            Forwarded to marker set selection; additionally forced on for a
            bin whose unique hit count is below minUnique or whose
            multi-copy hit count exceeds maxMulti.
        resultsParser
            Object whose results[binId].countUniqueHits() gives the unique
            and multi-copy hit counts of a bin.
        minUnique, maxMulti
            Thresholds used as described for bForceDomain.
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # determine marker set for each bin
        treeFile = os.path.join(outDir, 'storage', 'tree',
                                DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

        # the context manager closes the marker file even if a bin fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            numProcessedBins = 0
            statusStr = ''
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    sys.stderr.write(' ' * len(statusStr) +
                                     '\r')  # clear previous line
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                        numProcessedBins, len(binIds),
                        float(numProcessedBins) * 100 / len(binIds), binId)
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId,
                                              BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    node, markerSet = self.__getMarkerSet(
                        rootNode, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, bForceDomain,
                        bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # reset for every bin so a value left over from the
                    # previous bin can never be reused when no labelled
                    # ancestor is found
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = (parentNode.parent_node is None)
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    # NOTE(review): assumes a labelled ancestor was found
                    # above (parentNode is not None) - confirm the tree
                    # always labels the root
                    if not bNoLineageSpecificRefinement:
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[
                            uniqueId]

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        uniqueHits, multiCopyHits = resultsParser.results[
                            binId].countUniqueHits()
                        tempForceDomain = bForceDomain or (
                            uniqueHits < minUnique) or (
                                multiCopyHits > maxMulti)

                        curNode, markerSet = self.__getMarkerSet(
                            curNode.parent_node, tree,
                            uniqueIdToLineageStatistics, numGenomesMarkers,
                            bootstrap, tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(
                                markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

            # terminate the in-place progress line
            if self.logger.getEffectiveLevel() <= logging.INFO:
                sys.stderr.write('\n')
Example #8
0
    def getBinMarkerSets(self, outDir, markerFile,
                                    numGenomesMarkers,
                                    bootstrap, bNoLineageSpecificRefinement,
                                    bForceDomain, bRequireTaxonomy,
                                    resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin and write them to markerFile.

        The placement tree stored under <outDir>/storage/tree is loaded and,
        for every bin, marker sets are collected while walking from the
        bin's position in the tree towards the root. Bins that are not in
        the tree receive the marker set selected from the root node.

        Parameters
        ----------
        outDir
            Output directory holding the bins and the placement tree.
        markerFile
            Path of the lineage marker file to write (overwritten).
        numGenomesMarkers, bootstrap, bRequireTaxonomy
            Forwarded unchanged to the marker set selection helper.
        bNoLineageSpecificRefinement
            When true, lineage-specific marker removal is skipped.
        bForceDomain
            Forwarded to marker set selection; additionally forced on for a
            bin whose unique hit count is below minUnique or whose
            multi-copy hit count exceeds maxMulti.
        resultsParser
            Object whose results[binId].countUniqueHits() gives the unique
            and multi-copy hit counts of a bin.
        minUnique, maxMulti
            Thresholds used as described for bForceDomain.
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # determine marker set for each bin
        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

        # the context manager closes the marker file even if a bin fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            numProcessedBins = 0
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, len(binIds), float(numProcessedBins)*100/len(binIds))
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    node, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics,
                                                            numGenomesMarkers, bootstrap,
                                                            bForceDomain, bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # reset for every bin so a value left over from the
                    # previous bin can never be reused when no labelled
                    # ancestor is found
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = (parentNode.parent_node is None)
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    # NOTE(review): assumes a labelled ancestor was found
                    # above (parentNode is not None) - confirm the tree
                    # always labels the root
                    if not bNoLineageSpecificRefinement:
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                        tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                        curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics,
                                                                    numGenomesMarkers, bootstrap,
                                                                    tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

            # terminate the in-place progress line
            if self.logger.getEffectiveLevel() <= logging.INFO:
                sys.stderr.write('\n')
Example #9
0
    def analyze(self, options, db=None):
        """Analyze command: identify marker genes in bins.

        Validates the input bins, creates the output directory layout,
        finds marker genes, aligns markers with multiple hits inside a bin,
        computes per-bin statistics and, when options.bAlignTopHit is set,
        aligns the top hit of each marker and writes 'alignment_info.tsv'
        to <output_dir>/storage/alignments.

        Parameters
        ----------
        options
            Parsed command options; the attributes read here are bin_dir,
            extension, bCalledGenes, output_dir, threads, marker_file,
            bKeepAlignment, bNucORFs and bAlignTopHit.
        db
            Not used by this method.

        Returns
        -------
        None. Returns early, before creating any output, when the sequence
        check for the bins fails.
        """
        self.logger.info(
            '[CheckM - analyze] Identifying marker genes in bins.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are protein sequences; otherwise nucleotides are expected
        if not options.bCalledGenes:
            if not checkNuclotideSeqs(binFiles):
                return
        else:
            if not checkProteinSeqs(binFiles):
                return

        # setup directory structure
        aaiOutputDir = os.path.join(options.output_dir, 'storage', 'aai_qa')
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
        makeSurePathExists(aaiOutputDir)

        # find marker genes in genome bins
        mgf = MarkerGeneFinder(options.threads)
        binIdToModels = mgf.find(binFiles, options.output_dir,
                                 DefaultValues.HMMER_TABLE_OUT,
                                 DefaultValues.HMMER_OUT, options.marker_file,
                                 options.bKeepAlignment, options.bNucORFs,
                                 options.bCalledGenes)

        markerSetParser = MarkerSetParser(options.threads)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, getBinIdsFromOutDir(options.output_dir),
            options.marker_file)

        hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

        self.timeKeeper.printTimeStamp()

        # HMM model file: use the marker file itself only when it is an HMM
        # model set, otherwise fall back to the default models
        if markerSetParser.markerFileType(
                options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
            markerFile = options.marker_file
        else:
            markerFile = DefaultValues.HMM_MODELS

        # align marker genes with multiple hits within a bin
        HA = HmmerAligner(options.threads)
        HA.makeAlignmentsOfMultipleHits(
            options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
            binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
            DefaultValues.LENGTH, aaiOutputDir)

        self.timeKeeper.printTimeStamp()

        # calculate statistics for each genome bin
        binStats = BinStatistics(options.threads)
        binStats.calculate(binFiles, options.output_dir,
                           DefaultValues.BIN_STATS_OUT)

        self.timeKeeper.printTimeStamp()

        # align top hit to each marker if requested
        if options.bAlignTopHit:
            alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                                 'alignments')
            makeSurePathExists(alignmentOutputFolder)

            HA = HmmerAligner(options.threads)
            resultsParser = HA.makeAlignmentTopHit(
                options.output_dir, options.marker_file,
                DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
                DefaultValues.E_VAL, DefaultValues.LENGTH, True,
                alignmentOutputFolder)

            # report marker gene data; the context manager guarantees the
            # file is closed even if writing a row fails
            alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                             'alignment_info.tsv')
            with open(alignmentInfoFile, 'w') as fout:
                fout.write('Marker Id\tLength (bp)\n')

                # only the first entry of resultsParser.models is reported;
                # look it up once rather than once per marker
                firstKey = list(resultsParser.models.keys())[0]
                firstModels = resultsParser.models[firstKey]
                for markerId in firstModels.keys():
                    fout.write('%s\t%d\n' %
                               (markerId, firstModels[markerId].leng))

            self.logger.info('Alignments to top hits stored in: ' +
                             alignmentOutputFolder)

            self.timeKeeper.printTimeStamp()