def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
    """Align the sequences hit by a marker to its HMM and mask the alignment.

    Every sequence in binSeqs is written to '<markerId>.unaligned.faa' in
    alignOutputDir. If at least one sequence was written, the file is aligned
    against hmmModelFile into '<markerId>.aligned.faa' and a masked copy is
    written to '<markerId>.masked.faa'. The unaligned file is always removed
    before returning.

    Parameters:
      markerId: marker identifier, used as the prefix of every file written
      binSeqs: mapping of bin id -> (mapping of sequence id -> sequence string)
      binStats: mapping of bin id -> sequence id -> indexable whose items 0
        and 1 are reported as e-value and score; only read when
        bReportHitStats is true (callers may pass None otherwise)
      bReportHitStats: append the hit statistics to each FASTA header
      alignOutputDir: directory receiving all files
      hmmModelFile: HMM file handed to the HMMER aligner
      bKeepUnmaskedAlign: keep the unmasked alignment when true
    """
    unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')

    numSeqs = 0
    # the context manager closes the file even if a header cannot be built
    with open(unalignSeqFile, 'w') as fout:
        for binId, seqs in binSeqs.items():
            for seqId, seq in seqs.items():
                header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                if bReportHitStats:
                    hitStats = binStats[binId][seqId]
                    header += ' [e-value=%.4g,score=%.1f]' % (hitStats[0], hitStats[1])

                fout.write(header + '\n')
                fout.write(seq + '\n')
                numSeqs += 1

    # skip the aligner entirely when there is nothing to align
    if numSeqs > 0:
        alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
        HA = HMMERRunner(mode='align')
        HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

        makedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
        self.__maskAlignment(alignSeqFile, makedSeqFile)

        if not bKeepUnmaskedAlign:
            os.remove(alignSeqFile)

    # the unaligned input is always created above, so it is always cleaned up
    os.remove(unalignSeqFile)
def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Run each marker gene in a separate thread.

    Worker loop: takes marker ids from queueIn until None is received. For
    each marker, the genes assigned to it in every genome of
    allTrustedGenomeIds are written to '<model>.faa' in outputGeneDir,
    aligned against '<model>.hmm' from outputModelDir into '<model>.aln.faa',
    and masked into '<model>.aln.masked.faa'. The model name is then put on
    queueOut.

    Parameters:
      allTrustedGenomeIds: iterable of genome ids to collect genes from
      genesInGenomes: mapping of genome id -> (mapping of marker id -> gene ids)
      outputGeneDir: directory receiving the sequence and alignment files
      outputModelDir: directory holding the per-marker HMM files
      queueIn: queue of marker ids, terminated by None
      queueOut: queue receiving the name of each finished model
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            break

        # model files use the 'PF' prefix where marker ids use 'pfam'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        with open(markerSeqFile, 'w') as fout:
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # gene listed for this genome but absent from its
                        # sequence file; skip it instead of failing on lookup
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        alignFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignFile, trim=False, outputFormat='Pfam')
        self.__maskAlignment(alignFile, os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

        queueOut.put(modelName)
def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
    """Create multiple sequence alignment for markers with multiple hits in a bin.

    Worker loop: takes bin ids from queueIn until None is received. For each
    bin, the markers with multiple hits are extracted and, if there are any,
    each one is aligned into '<alignOutputDir>/<binId>/' using a model
    fetched from hmmModelFile into a temporary file. Every processed bin id
    is put on queueOut, whether or not it had such markers.

    Parameters:
      resultsParser: passed through to the multiple-hit extraction
      binIdToBinMarkerSets: mapping of bin id -> marker set(s) for that bin
      hmmModelFile: HMM file the per-marker models are fetched from
      outDir: passed through to the multiple-hit extraction
      alignOutputDir: root directory for the per-bin alignment directories
      queueIn: queue of bin ids, terminated by None
      queueOut: queue receiving each processed bin id
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        binId = queueIn.get(block=True, timeout=None)
        if binId is None:
            break

        markersWithMultipleHits = self.__extractMarkersWithMultipleHits(outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

        if len(markersWithMultipleHits) != 0:
            # create multiple sequence alignments for markers with multiple hits
            binAlignOutputDir = os.path.join(alignOutputDir, binId)
            makeSurePathExists(binAlignOutputDir)

            for markerId in markersWithMultipleHits:
                tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                try:
                    HF.fetch(hmmModelFile, markerId, tempModelFile)
                    self.__alignMarker(markerId, markersWithMultipleHits[markerId], None, False, binAlignOutputDir, tempModelFile, bKeepUnmaskedAlign=False)
                finally:
                    # do not leave the fetched model behind if fetching or
                    # aligning fails; the file may not exist if fetch failed
                    if os.path.exists(tempModelFile):
                        os.remove(tempModelFile)

        queueOut.put(binId)
def __createMarkerHMMs(self, binMarkerSet, outputFile, bReportProgress=True):
    """Create HMM file for markers.

    Collects the marker genes of binMarkerSet together with all genes from
    the same PFAM clan, fetches those models from DefaultValues.HMM_MODELS
    into outputFile, and (re)builds the index of outputFile.

    Parameters:
      binMarkerSet: object whose getMarkerGenes() gives the marker genes
      outputFile: path of the HMM file to create; a stale '<outputFile>.ssi'
        index is deleted before indexing
      bReportProgress: log the number of marker and clan genes when true
    """
    # get list of marker genes
    markerGenes = binMarkerSet.getMarkerGenes()

    # get all genes from the same clan as any marker gene
    pfam = PFAM(DefaultValues.PFAM_CLAN_FILE)
    genesInSameClan = pfam.genesInSameClan(markerGenes)

    # extract marker genes along with all genes from the same clan
    allMarkers = markerGenes | genesInSameClan

    if bReportProgress:
        self.logger.info(" There are %d genes in the marker set and %d genes from the same PFAM clan." % (len(markerGenes), len(genesInSameClan)))

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(DefaultValues.HMM_MODELS, keyFile, outputFile, bKeyFile=True)

        # index the HMM file
        if os.path.exists(outputFile + '.ssi'):
            os.remove(outputFile + '.ssi')
        HF.index(outputFile)
    finally:
        # remove key file, even when fetching or indexing failed
        if os.path.exists(keyFile):
            os.remove(keyFile)
def __extractModel(self, hmmModelFile, queueIn, queueOut):
    """Extract HMM.

    Worker loop: takes (modelId, fetchFilename) pairs from queueIn and, for
    each, calls the HMMER fetch runner with hmmModelFile, modelId and
    fetchFilename, then puts modelId on queueOut. The loop ends when a pair
    whose modelId is None is received.

    Parameters:
      hmmModelFile: HMM file the models are fetched from
      queueIn: queue of (modelId, fetchFilename) pairs; a None modelId stops
        the worker
      queueOut: queue receiving the id of each fetched model
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        modelId, fetchFilename = queueIn.get(block=True, timeout=None)
        # None is the stop sentinel; compare by identity
        if modelId is None:
            break

        HF.fetch(hmmModelFile, modelId, fetchFilename)

        queueOut.put(modelId)
def run(self):
    """Build an HMM file holding every marker gene and its clan relatives.

    Gathers the marker genes of all taxonomic-specific and lineage-specific
    marker sets, adds the genes from the same clan, fetches those models from
    self.hmms into self.outputHMMs, and (re)builds the index of
    self.outputHMMs. Progress is printed to standard output.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever expression is stored in the
        # node metadata; only safe while that file is trusted. If the stored
        # value is a pure literal, ast.literal_eval would be the safer choice
        # - confirm the format before switching.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file
        if os.path.exists(self.outputHMMs + '.ssi'):
            os.remove(self.outputHMMs + '.ssi')
        HF.index(self.outputHMMs)
    finally:
        # remove key file, even when fetching or indexing failed
        if os.path.exists(keyFile):
            os.remove(keyFile)
def find(self, binFiles, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes):
    """Identify marker genes in each bin using prodigal and HMMER.

    Starts self.totalThreads worker processes that each take bin files from
    a shared queue, plus one process that reports progress and fills a
    managed dictionary. Sets self.threadsPerSearch as a side effect.

    Parameters:
      binFiles: sized collection of bin files to process
      outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs,
        bCalledGenes: forwarded unchanged to the per-bin worker; bCalledGenes
        also decides whether Prodigal is checked up front

    Returns:
      A plain dict copied from the managed dictionary handed to the progress
      process, or an empty dict when binFiles is empty.

    Raises:
      Any exception raised while creating, starting or joining the processes;
      live child processes are terminated before it propagates.
    """
    # make sure HMMER and prodigal are on system path
    HMMERRunner()

    if not bCalledGenes:
        ProdigalRunner('')

    # nothing to do; also avoids dividing by zero just below
    numBins = len(binFiles)
    if numBins == 0:
        return {}

    # process each fasta file
    self.threadsPerSearch = max(1, int(self.totalThreads / numBins))
    self.logger.info(" Identifying marker genes in %d bins with %d threads:" % (numBins, self.totalThreads))

    # process each bin in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for binFile in binFiles:
        workerQueue.put(binFile)

    # one stop sentinel per worker
    for _ in range(self.totalThreads):
        workerQueue.put(None)

    binIdToModels = mp.Manager().dict()

    # bound before the try block so the handler can always refer to them
    calcProc = []
    writeProc = None
    try:
        calcProc = [mp.Process(target=self.__processBin, args=(outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
        writeProc = mp.Process(target=self.__reportProgress, args=(numBins, binIdToModels, writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done; tell the progress process to stop
        writerQueue.put((None, None))
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated (this includes Ctrl-C);
        # terminate() is only valid on a process that has been started
        for p in calcProc:
            if p.is_alive():
                p.terminate()

        if writeProc is not None and writeProc.is_alive():
            writeProc.terminate()

        # propagate the failure instead of returning partial results
        raise

    # create a standard dictionary from the managed dictionary
    return {binId: binIdToModels[binId] for binId in binIdToModels.keys()}
def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Process bins taken from a queue until a None entry is received.

    For each bin file: ensure '<outDir>/bins/<binId>' exists, obtain the
    amino acid gene file (from Prodigal, or the bin file itself when genes
    are already called), create the HMM model file for the bin, run the
    HMMER search, and put (binId, hmmModelFile) on queueOut.
    """
    parser = MarkerSetParser(self.threadsPerSearch)

    def nextBinFile():
        return queueIn.get(block=True, timeout=None)

    # iter() with a sentinel ends the loop on the first entry equal to None
    for binFile in iter(nextBinFile, None):
        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        if bCalledGenes:
            # genes were supplied directly; copy them under the standard name
            aaGeneFile = binFile
            shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))
        else:
            # call genes with Prodigal unless that was already done
            geneCaller = ProdigalRunner(binDir)
            if not geneCaller.areORFsCalled(bNucORFs):
                geneCaller.run(binFile, bNucORFs)
            aaGeneFile = geneCaller.aaGeneFile

        # HMMs for this bin
        hmmModelFile = parser.createHmmModelFile(binId, markerFile)

        # HMMER search
        searcher = HMMERRunner()
        alignFlag = '' if bKeepAlignment else '--noali'
        searchOptions = '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + alignFlag
        searcher.search(hmmModelFile,
                        aaGeneFile,
                        os.path.join(binDir, tableOut),
                        os.path.join(binDir, hmmerOut),
                        searchOptions,
                        bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Run each marker gene in a separate thread.

    Worker loop: takes marker ids from queueIn until None is received. For
    each marker, the genes assigned to it in every genome of genomeIds are
    written to '<model>.faa' in outputGeneDir, aligned against '<model>.hmm'
    from outputModelDir into '<model>.aln.faa', and masked into
    '<model>.aln.masked.faa'. The model name is then put on queueOut.

    Parameters:
      genomeIds: iterable of genome ids to collect genes from
      genesInGenomes: mapping of genome id -> (mapping of marker id -> gene ids)
      outputGeneDir: directory receiving the sequence and alignment files
      outputModelDir: directory holding the per-marker HMM files
      queueIn: queue of marker ids, terminated by None
      queueOut: queue receiving the name of each finished model
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            break

        # model files use the 'PF' prefix where marker ids use 'pfam'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        # the context manager closes the file even if reading a genome fails
        with open(markerSeqFile, 'w') as fout:
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        alignFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignFile, trim=False, outputFormat='Pfam')
        self.__maskAlignment(alignFile, os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

        queueOut.put(modelName)