コード例 #1
0
    def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile,
                    outDir, alignOutputDir, queueIn, queueOut):
        """Create multiple sequence alignment for markers with multiple hits in a bin.

        Worker loop: bin ids are read from queueIn until a None sentinel is
        received. For every marker of the bin that has multiple hits, the
        marker's model is fetched from hmmModelFile into a uniquely named
        temporary file and handed to __alignMarker, which writes into
        <alignOutputDir>/<binId>. Each processed bin id is put on queueOut.

        resultsParser and binIdToBinMarkerSets[binId] are passed through to
        __extractMarkersWithMultipleHits together with outDir and binId.
        """

        HF = HMMERRunner(mode='fetch')

        while True:
            binId = queueIn.get(block=True, timeout=None)
            if binId is None:
                # sentinel: no more bins to process
                break

            markersWithMultipleHits = self.__extractMarkersWithMultipleHits(
                outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

            if len(markersWithMultipleHits) != 0:
                # create multiple sequence alignments for markers with multiple hits
                binAlignOutputDir = os.path.join(alignOutputDir, binId)
                makeSurePathExists(binAlignOutputDir)
                for markerId in markersWithMultipleHits:
                    tempModelFile = os.path.join(tempfile.gettempdir(),
                                                 str(uuid.uuid4()))
                    try:
                        HF.fetch(hmmModelFile, markerId, tempModelFile)

                        self.__alignMarker(markerId,
                                           markersWithMultipleHits[markerId],
                                           None,
                                           False,
                                           binAlignOutputDir,
                                           tempModelFile,
                                           bKeepUnmaskedAlign=False)
                    finally:
                        # always clean up the temporary model, even when the
                        # fetch or alignment fails; the existence check keeps
                        # a failed fetch from masking the original error
                        if os.path.exists(tempModelFile):
                            os.remove(tempModelFile)

            queueOut.put(binId)
コード例 #2
0
ファイル: hmmerAligner.py プロジェクト: Ecogenomics/CheckM
    def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
        """Create multiple sequence alignment for markers with multiple hits in a bin.

        Runs as a queue worker: bin ids are taken from queueIn until None is
        received, and every processed bin id is reported on queueOut. For each
        marker with multiple hits in a bin, the marker's model is fetched from
        hmmModelFile into a temporary file which is then passed, together
        with the marker's hits, to __alignMarker. Alignment output goes to
        the directory <alignOutputDir>/<binId>.
        """

        HF = HMMERRunner(mode='fetch')

        while True:
            binId = queueIn.get(block=True, timeout=None)
            if binId is None:
                # None is the shutdown sentinel
                break

            markersWithMultipleHits = self.__extractMarkersWithMultipleHits(outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

            if len(markersWithMultipleHits) != 0:
                # create multiple sequence alignments for markers with multiple hits
                binAlignOutputDir = os.path.join(alignOutputDir, binId)
                makeSurePathExists(binAlignOutputDir)
                for markerId in markersWithMultipleHits:
                    tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                    try:
                        HF.fetch(hmmModelFile, markerId, tempModelFile)

                        self.__alignMarker(markerId, markersWithMultipleHits[markerId], None, False, binAlignOutputDir, tempModelFile, bKeepUnmaskedAlign=False)
                    finally:
                        # do not leave the temporary model behind when the
                        # fetch or the alignment raises
                        if os.path.exists(tempModelFile):
                            os.remove(tempModelFile)

            queueOut.put(binId)
コード例 #3
0
ファイル: markerSets.py プロジェクト: Ecogenomics/CheckM
    def __createMarkerHMMs(self, binMarkerSet, outputFile, bReportProgress=True):
        """Create HMM file for markers.

        The marker genes of binMarkerSet, plus every gene PFAM reports as
        belonging to the same clan as one of them, are written one accession
        per line to a temporary key file. That key file is passed to
        HMMERRunner.fetch (with bKeyFile=True) to produce outputFile from
        DefaultValues.HMM_MODELS, after which outputFile is indexed. A stale
        '<outputFile>.ssi' index is deleted before re-indexing.

        When bReportProgress is true, the number of marker genes and of
        same-clan genes is logged.
        """

        # get list of marker genes
        markerGenes = binMarkerSet.getMarkerGenes()

        # get all genes from the same clan as any marker gene
        pfam = PFAM(DefaultValues.PFAM_CLAN_FILE)
        genesInSameClan = pfam.genesInSameClan(markerGenes)

        # extract marker genes along with all genes from the same clan
        allMarkers = markerGenes | genesInSameClan

        if bReportProgress:
            self.logger.info("  There are %d genes in the marker set and %d genes from the same PFAM clan." % (len(markerGenes), len(genesInSameClan)))

        keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            # create file with all model accession numbers
            with open(keyFile, 'w') as fout:
                for modelAcc in allMarkers:
                    fout.write(modelAcc + '\n')

            # fetch specified models
            HF = HMMERRunner(mode='fetch')
            HF.fetch(DefaultValues.HMM_MODELS, keyFile, outputFile, bKeyFile=True)

            # index the HMM file
            indexFile = outputFile + '.ssi'
            if os.path.exists(indexFile):
                os.remove(indexFile)
            HF.index(outputFile)
        finally:
            # remove key file, also when fetching or indexing fails
            if os.path.exists(keyFile):
                os.remove(keyFile)
コード例 #4
0
    def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
        """Align and mask all sequences hitting a single marker.

        binSeqs maps bin id -> {sequence id -> sequence}. Every sequence is
        written to '<markerId>.unaligned.faa' in alignOutputDir with the
        header '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId. When
        bReportHitStats is true, binStats[binId][seqId][0] and [1] are
        appended to the header as e-value and score.

        If at least one sequence was written, the sequences are aligned
        against hmmModelFile into '<markerId>.aligned.faa' using
        self.outputFormat, and __maskAlignment writes '<markerId>.masked.faa'.
        The unmasked alignment is deleted unless bKeepUnmaskedAlign is true.
        The unaligned sequence file is always removed.
        """
        unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')

        try:
            numSeqs = 0
            with open(unalignSeqFile, 'w') as fout:
                for binId, seqs in binSeqs.items():
                    for seqId, seq in seqs.items():
                        header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                        if bReportHitStats:
                            header += ' [e-value=%.4g,score=%.1f]' % (binStats[binId][seqId][0], binStats[binId][seqId][1])

                        fout.write(header + '\n')
                        fout.write(seq + '\n')
                        numSeqs += 1

            if numSeqs > 0:
                alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
                HA = HMMERRunner(mode='align')
                HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

                maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
                self.__maskAlignment(alignSeqFile, maskedSeqFile)

                if not bKeepUnmaskedAlign:
                    os.remove(alignSeqFile)
        finally:
            # the unaligned file is only an intermediate; remove it even if
            # writing, aligning or masking fails
            if os.path.exists(unalignSeqFile):
                os.remove(unalignSeqFile)
コード例 #5
0
    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread.

        Worker loop: marker ids are read from queueIn until a None sentinel
        arrives. For each marker, the genes listed under that marker in
        genesInGenomes[genomeId] are collected for every genome in
        allTrustedGenomeIds and written to '<modelName>.faa' in
        outputGeneDir, with headers of the form '>genomeId|geneId'. The
        sequences are aligned against '<modelName>.hmm' from outputModelDir
        into '<modelName>.aln.faa', and __maskAlignment writes
        '<modelName>.aln.masked.faa'. The model name is then put on queueOut.

        modelName is markerId with 'pfam' replaced by 'PF' when markerId
        starts with 'pfam'.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                # sentinel: no more markers to process
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            # 'with' closes the file even if reading a genome or looking up
            # a gene raises part-way through
            with open(markerSeqFile, 'w') as fout:
                for genomeId in allTrustedGenomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                     genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir,
                                      modelName + '.aln.masked.faa')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        alignedFile,
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
コード例 #6
0
    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread.

        Worker loop: marker ids are read from queueIn until a None sentinel
        arrives. For each marker, the genes listed under that marker in
        genesInGenomes[genomeId] are collected for every genome in genomeIds
        and written to '<modelName>.faa' in outputGeneDir with headers of
        the form '>genomeId|geneId'; genes missing from the genome's
        sequence file are skipped. The sequences are aligned against
        '<modelName>.hmm' from outputModelDir into '<modelName>.aln.faa',
        and __maskAlignment writes '<modelName>.aln.masked.faa'. The model
        name is then put on queueOut.

        modelName is markerId with 'pfam' replaced by 'PF' when markerId
        starts with 'pfam'.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                # sentinel: no more markers to process
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            # 'with' guarantees the handle is closed if readFasta raises
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG metadata isn't always
                            # perfectly in sync with the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
コード例 #7
0
ファイル: hmmerAligner.py プロジェクト: Ecogenomics/CheckM
    def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
        """Align and mask all sequences hitting a single marker.

        binSeqs maps bin id -> {sequence id -> sequence}. Each sequence is
        written to '<markerId>.unaligned.faa' in alignOutputDir under the
        header '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId; with
        bReportHitStats set, binStats[binId][seqId][0] (e-value) and [1]
        (score) are appended to the header.

        If any sequence was written, it is aligned against hmmModelFile into
        '<markerId>.aligned.faa' using self.outputFormat and masked into
        '<markerId>.masked.faa' by __maskAlignment. The unmasked alignment is
        deleted unless bKeepUnmaskedAlign is true. The unaligned file is
        always removed.
        """
        unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')

        try:
            numSeqs = 0
            with open(unalignSeqFile, 'w') as fout:
                # items() rather than iteritems(): works on both Python 2
                # and Python 3 dictionaries
                for binId, seqs in binSeqs.items():
                    for seqId, seq in seqs.items():
                        header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                        if bReportHitStats:
                            header += ' [e-value=%.4g,score=%.1f]' % (binStats[binId][seqId][0], binStats[binId][seqId][1])

                        fout.write(header + '\n')
                        fout.write(seq + '\n')
                        numSeqs += 1

            if numSeqs > 0:
                alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
                HA = HMMERRunner(mode='align')
                HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

                maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
                self.__maskAlignment(alignSeqFile, maskedSeqFile)

                if not bKeepUnmaskedAlign:
                    os.remove(alignSeqFile)
        finally:
            # intermediate file: remove it on success and on failure alike
            if os.path.exists(unalignSeqFile):
                os.remove(unalignSeqFile)
コード例 #8
0
    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread.

        Queue worker: consumes marker ids from queueIn until None is
        received and puts the derived model name on queueOut once the marker
        has been processed. Processing a marker means:

          1. writing the marker's genes from every genome in
             allTrustedGenomeIds to '<modelName>.faa' in outputGeneDir
             (header '>genomeId|geneId'),
          2. aligning that file against '<modelName>.hmm' from
             outputModelDir into '<modelName>.aln.faa',
          3. masking the alignment into '<modelName>.aln.masked.faa'.

        modelName is markerId with 'pfam' replaced by 'PF' when markerId
        starts with 'pfam'.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                # None is the shutdown sentinel
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            # context manager closes the handle even if a genome cannot be
            # read or a gene id is missing from its sequences
            with open(markerSeqFile, 'w') as fout:
                for genomeId in allTrustedGenomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
コード例 #9
0
ファイル: hmmerAligner.py プロジェクト: Ecogenomics/CheckM
    def __extractModel(self, hmmModelFile, queueIn, queueOut):
        """Extract HMM.

        Worker loop: (modelId, fetchFilename) pairs are read from queueIn
        until a pair whose modelId is None arrives. For each pair,
        HMMERRunner.fetch is called with hmmModelFile, modelId and
        fetchFilename, and modelId is then put on queueOut.
        """
        HF = HMMERRunner(mode='fetch')

        while True:
            modelId, fetchFilename = queueIn.get(block=True, timeout=None)
            if modelId is None:
                # the sentinel must still be a 2-item sequence so that the
                # unpacking above succeeds
                break

            HF.fetch(hmmModelFile, modelId, fetchFilename)

            queueOut.put(modelId)
コード例 #10
0
    def __extractModel(self, hmmModelFile, queueIn, queueOut):
        """Extract HMM.

        Queue worker that consumes (modelId, fetchFilename) pairs from
        queueIn, calls HMMERRunner.fetch(hmmModelFile, modelId,
        fetchFilename) for each, and reports the finished modelId on
        queueOut. The loop ends when a pair with modelId None is received.
        """
        HF = HMMERRunner(mode='fetch')

        while True:
            modelId, fetchFilename = queueIn.get(block=True, timeout=None)
            if modelId is None:
                # shutdown sentinel; it is unpacked like any other item, so
                # it has to be a pair
                break

            HF.fetch(hmmModelFile, modelId, fetchFilename)

            queueOut.put(modelId)
コード例 #11
0
 def run(self):
     """Build and index an HMM file holding all marker genes and their clan members.

     Marker genes are gathered from the taxonomic marker sets returned by
     TaxonParser.readMarkerSets() and from the lineage marker sets described
     by TreeParser.readNodeMetadata(). Genes that PFAM reports as belonging
     to the same clan as any marker gene are added. The accessions are
     written to a temporary key file, which is used to fetch the models from
     self.hmms into self.outputHMMs; self.outputHMMs is then indexed.

     Progress is printed to standard output. The print calls take a single
     parenthesised argument so the method produces the same output on
     Python 2 and Python 3.
     """
     # read all taxonomic-specific marker genes
     print('Reading taxonomic-specific marker genes.')
     taxonomicMarkers = set()
     taxonParser = TaxonParser()
     taxonMarkerSets = taxonParser.readMarkerSets()
     for taxa in taxonMarkerSets.values():
         for markerSet in taxa.values():
             taxonomicMarkers.update(markerSet.getMarkerGenes())

     print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

     # read all lineage-specific marker genes
     print('Reading lineage-specific marker genes.')
     lineageMarkers = set()
     treeParser = TreeParser()
     uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
     for uniqueId, d in uniqueIdToLineageStatistics.items():
         # NOTE(review): eval() executes the 'marker set' field as Python
         # code, which is only safe if the metadata file is trusted;
         # consider ast.literal_eval if the field is a plain literal.
         markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
         lineageMarkers.update(markerSet.getMarkerGenes())

     print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

     # gather all marker genes
     markerGenes = taxonomicMarkers.union(lineageMarkers)
     print('  Total marker genes: %d' % len(markerGenes))

     # get genes from same clan as marker genes
     print('Gathering HMMs from the same clan as marker genes.')
     pfam = PFAM()
     genesInSameClan = pfam.genesInSameClan(markerGenes)
     allMarkers = markerGenes.union(genesInSameClan)

     keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
     try:
         # create file with all model accession numbers
         with open(keyFile, 'w') as fout:
             for modelAcc in allMarkers:
                 fout.write(modelAcc + '\n')

         # fetch specified models
         HF = HMMERRunner(mode='fetch')
         HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

         # index the HMM file
         if os.path.exists(self.outputHMMs + '.ssi'):
             os.remove(self.outputHMMs + '.ssi')
         HF.index(self.outputHMMs)
     finally:
         # remove key file, also when fetching or indexing fails
         if os.path.exists(keyFile):
             os.remove(keyFile)
コード例 #12
0
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile,
                     bKeepAlignment, bNucORFs, bCalledGenes, queueIn,
                     queueOut):
        """Thread safe bin processing.

        Worker loop: bin files are read from queueIn until a None sentinel
        arrives. For each bin:

          * the bin directory <outDir>/bins/<binId> is created,
          * unless bCalledGenes is set, ProdigalRunner is run on the bin
            file (skipped when it reports that ORFs are already called);
            with bCalledGenes set, the bin file itself is used as the amino
            acid gene file and copied to DefaultValues.PRODIGAL_AA in the
            bin directory,
          * an HMM model file is created from markerFile for the bin,
          * HMMERRunner.search is run with self.threadsPerSearch CPUs,
            writing tableOut and hmmerOut inside the bin directory;
            '--noali' is passed when bKeepAlignment is false.

        The pair (binId, hmmModelFile) is put on queueOut for every bin.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # sentinel: no more bins to process
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                aaGeneFile = binFile
                shutil.copyfile(
                    aaGeneFile, os.path.join(binDir,
                                             DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(
                binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)

            keepAlignStr = '' if bKeepAlignment else '--noali'
            cmdOptions = ('--cpu ' + str(self.threadsPerSearch) +
                          ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr)
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath,
                         hmmerOutPath, cmdOptions, bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
コード例 #13
0
    def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
        """Thread safe bin processing.

        Queue worker that consumes bin files from queueIn until None is
        received and puts (binId, hmmModelFile) on queueOut for each bin.

        Per bin, the directory <outDir>/bins/<binId> is created. If
        bCalledGenes is false, ProdigalRunner is used to call genes (unless
        it reports that ORFs have already been called) and its aaGeneFile is
        searched; otherwise the bin file is treated as the amino acid gene
        file and copied to DefaultValues.PRODIGAL_AA in the bin directory.
        An HMM model file is created from markerFile and HMMERRunner.search
        is run with self.threadsPerSearch CPUs, writing tableOut and
        hmmerOut inside the bin directory. '--noali' is added to the search
        options when bKeepAlignment is false.
        """

        markerSetParser = MarkerSetParser(self.threadsPerSearch)

        while True:
            binFile = queueIn.get(block=True, timeout=None)
            if binFile is None:
                # None is the shutdown sentinel
                break

            binId = binIdFromFilename(binFile)
            binDir = os.path.join(outDir, 'bins', binId)
            makeSurePathExists(binDir)

            # run Prodigal
            if not bCalledGenes:
                prodigal = ProdigalRunner(binDir)
                if not prodigal.areORFsCalled(bNucORFs):
                    prodigal.run(binFile, bNucORFs)
                aaGeneFile = prodigal.aaGeneFile
            else:
                aaGeneFile = binFile
                shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

            # extract HMMs into temporary file
            hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

            # run HMMER
            hmmer = HMMERRunner()
            tableOutPath = os.path.join(binDir, tableOut)
            hmmerOutPath = os.path.join(binDir, hmmerOut)

            keepAlignStr = '' if bKeepAlignment else '--noali'
            cmdOptions = '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr
            hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath, cmdOptions, bKeepAlignment)

            queueOut.put((binId, hmmModelFile))
コード例 #14
0
    def __createMarkerHMMs(self, binMarkerSet, outputFile, bReportProgress=True):
        """Create HMM file for markers.

        Collects the marker genes of binMarkerSet together with all genes
        PFAM places in the same clan as any of them, writes their accessions
        (one per line) to a temporary key file, and uses that key file to
        fetch the models from DefaultValues.HMM_MODELS into outputFile.
        outputFile is then indexed; an existing '<outputFile>.ssi' is
        removed first.

        With bReportProgress true, the marker and same-clan gene counts are
        written to the logger.
        """

        # get list of marker genes
        markerGenes = binMarkerSet.getMarkerGenes()

        # get all genes from the same clan as any marker gene
        pfam = PFAM(DefaultValues.PFAM_CLAN_FILE)
        genesInSameClan = pfam.genesInSameClan(markerGenes)

        # extract marker genes along with all genes from the same clan
        allMarkers = markerGenes | genesInSameClan

        if bReportProgress:
            self.logger.info("  There are %d genes in the marker set and %d genes from the same PFAM clan." % (len(markerGenes), len(genesInSameClan)))

        keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            # create file with all model accession numbers
            with open(keyFile, 'w') as fout:
                for modelAcc in allMarkers:
                    fout.write(modelAcc + '\n')

            # fetch specified models
            HF = HMMERRunner(mode='fetch')
            HF.fetch(DefaultValues.HMM_MODELS, keyFile, outputFile, bKeyFile=True)

            # index the HMM file
            indexFile = outputFile + '.ssi'
            if os.path.exists(indexFile):
                os.remove(indexFile)
            HF.index(outputFile)
        finally:
            # the key file is temporary; remove it whether or not the fetch
            # and index steps succeeded
            if os.path.exists(keyFile):
                os.remove(keyFile)
コード例 #15
0
    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread.

        Queue worker: consumes marker ids from queueIn until None is
        received and puts the derived model name on queueOut once the marker
        has been processed. Processing a marker means:

          1. writing the marker's genes from every genome in genomeIds to
             '<modelName>.faa' in outputGeneDir (header '>genomeId|geneId'),
             skipping gene ids absent from the genome's sequence file,
          2. aligning that file against '<modelName>.hmm' from
             outputModelDir into '<modelName>.aln.faa',
          3. masking the alignment into '<modelName>.aln.masked.faa'.

        modelName is markerId with 'pfam' replaced by 'PF' when markerId
        starts with 'pfam'.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                # None is the shutdown sentinel
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            # context manager closes the handle even if a genome's
            # sequence file cannot be read
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                     genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG metadata isn't always
                            # perfectly in sync with the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir,
                                      modelName + '.aln.masked.faa')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        alignedFile,
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
コード例 #16
0
    def find(self, binFiles, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes):
        """Identify marker genes in each bin using prodigal and HMMER.

        Starts self.totalThreads worker processes running __processBin and
        one process running __reportProgress, feeds every entry of binFiles
        to the workers through a queue, and waits for all of them to finish.

        All arguments except binFiles are forwarded unchanged to
        __processBin. As a side effect, self.threadsPerSearch is set to
        totalThreads divided by the number of bins (at least 1).

        Returns a plain dict copied from the managed dictionary that is
        shared with the __reportProgress process.

        NOTE(review): an empty binFiles makes the threadsPerSearch
        computation divide by zero.
        """

        # make sure HMMER and prodigal are on system path
        # (the constructed objects are discarded; only construction matters)
        HMMERRunner()

        if not bCalledGenes:
            ProdigalRunner('')

        # process each fasta file
        self.threadsPerSearch = max(1, int(self.totalThreads / len(binFiles)))
        self.logger.info("  Identifying marker genes in %d bins with %d threads:" % (len(binFiles), self.totalThreads))

        # process each bin in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for binFile in binFiles:
            workerQueue.put(binFile)

        # one None sentinel per worker process so that every worker stops
        for _ in range(self.totalThreads):
            workerQueue.put(None)

        # presumably filled in by __reportProgress from the worker results;
        # verify against that method
        binIdToModels = mp.Manager().dict()

        try:
            calcProc = [mp.Process(target=self.__processBin, args=(outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
            writeProc = mp.Process(target=self.__reportProgress, args=(len(binFiles), binIdToModels, writerQueue))

            # the reporter is started first so it is already running when
            # the workers begin putting results on writerQueue
            writeProc.start()

            for p in calcProc:
                p.start()

            for p in calcProc:
                p.join()

            # all workers are done: tell the reporter to stop
            writerQueue.put((None, None))
            writeProc.join()
        except:
            # NOTE(review): the bare except also catches KeyboardInterrupt,
            # and the exception is not re-raised, so the method goes on to
            # return whatever binIdToModels holds at that point. If the
            # failure happens before calcProc or writeProc is bound, this
            # handler itself raises NameError.

            # make sure all processes are terminated
            for p in calcProc:
                p.terminate()

            writeProc.terminate()

        # create a standard dictionary from the managed dictionary
        d = {}
        for binId in binIdToModels.keys():
            d[binId] = binIdToModels[binId]

        return d
コード例 #17
0
    def run(self):
        """Build and index an HMM file holding all marker genes and their clan members.

        Marker genes are gathered from the taxonomic marker sets returned by
        TaxonParser.readMarkerSets() and from the lineage marker sets
        described by TreeParser.readNodeMetadata(). Genes that PFAM reports
        as belonging to the same clan as any marker gene are added. The
        accessions are written to a temporary key file, which is used to
        fetch the models from self.hmms into self.outputHMMs;
        self.outputHMMs is then indexed, replacing any existing '.ssi' index.

        Progress and gene counts are printed to standard output.
        """
        # read all taxonomic-specific marker genes
        print('Reading taxonomic-specific marker genes.')
        taxonomicMarkers = set()
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()
        for taxa in taxonMarkerSets.values():
            for markerSet in taxa.values():
                taxonomicMarkers.update(markerSet.getMarkerGenes())

        print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

        # read all lineage-specific marker genes
        print('Reading lineage-specific marker genes.')
        lineageMarkers = set()
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        for uniqueId, d in uniqueIdToLineageStatistics.items():
            # NOTE(review): eval() executes the 'marker set' field as Python
            # code, which is only safe if the metadata file is trusted;
            # consider ast.literal_eval if the field is a plain literal.
            markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']),
                                  eval(d['marker set']))
            lineageMarkers.update(markerSet.getMarkerGenes())

        print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

        # gather all marker genes
        markerGenes = taxonomicMarkers.union(lineageMarkers)
        print('  Total marker genes: %d' % len(markerGenes))

        # get genes from same clan as marker genes
        print('Gathering HMMs from the same clan as marker genes.')
        pfam = PFAM()
        genesInSameClan = pfam.genesInSameClan(markerGenes)
        allMarkers = markerGenes.union(genesInSameClan)

        keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            # create file with all model accession numbers
            with open(keyFile, 'w') as fout:
                for modelAcc in allMarkers:
                    fout.write(modelAcc + '\n')

            # fetch specified models
            HF = HMMERRunner(mode='fetch')
            HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

            # index the HMM file
            if os.path.exists(self.outputHMMs + '.ssi'):
                os.remove(self.outputHMMs + '.ssi')
            HF.index(self.outputHMMs)
        finally:
            # remove key file, also when fetching or indexing fails
            if os.path.exists(keyFile):
                os.remove(keyFile)