def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Worker loop: align the genes of each queued marker to its HMM.

    Marker ids are read from queueIn until a None sentinel is received.
    For every marker the genes assigned to it in each trusted genome are
    written to '<model>.faa' in outputGeneDir, aligned against
    '<model>.hmm' from outputModelDir into '<model>.aln.faa', and that
    alignment is handed to self.__maskAlignment together with the target
    path '<model>.aln.masked.faa'. The model name is then put on queueOut.

    Parameters:
      allTrustedGenomeIds: iterable of genome ids whose genes are collected.
      genesInGenomes: mapping genome id -> mapping marker id -> gene ids.
      outputGeneDir: directory receiving the sequence and alignment files.
      outputModelDir: directory holding the '<model>.hmm' files.
      queueIn: queue of marker ids, terminated by None.
      queueOut: queue receiving the model name of each finished marker.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # sentinel: no more markers to process
            break

        # model files use the 'PF' prefix where marker ids use 'pfam'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        alignedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        maskedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

        # the context manager closes the file even if reading a genome fails
        with open(markerSeqFile, 'w') as fout:
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # skip genes listed for the marker but absent from the
                        # sequence file rather than aborting the worker with a
                        # KeyError before anything is put on queueOut
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                    markerSeqFile,
                    alignedSeqFile,
                    trim=False,
                    outputFormat='Pfam')
        self.__maskAlignment(alignedSeqFile, maskedSeqFile)

        queueOut.put(modelName)
def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
    """Write the sequences hitting a marker, align them to its HMM and mask.

    All sequences in binSeqs are written to '<markerId>.unaligned.faa' in
    alignOutputDir. If at least one sequence was written, they are aligned
    against hmmModelFile into '<markerId>.aligned.faa' (using
    self.outputFormat) and that alignment is passed to
    self.__maskAlignment with the target path '<markerId>.masked.faa'.

    Parameters:
      markerId: marker identifier used as the file name stem.
      binSeqs: mapping bin id -> mapping sequence id -> sequence string.
      binStats: mapping bin id -> mapping sequence id -> indexable hit
        statistics; element 0 is reported as the e-value and element 1 as
        the score. Only read when bReportHitStats is true.
      bReportHitStats: append e-value and score to each FASTA header.
      alignOutputDir: directory receiving all files.
      hmmModelFile: HMM model file to align against.
      bKeepUnmaskedAlign: keep '<markerId>.aligned.faa' instead of
        deleting it after masking.
    """
    unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')

    numSeqs = 0
    # items() instead of the Python 2-only iteritems(); the context manager
    # closes the file even if a binStats lookup fails
    with open(unalignSeqFile, 'w') as fout:
        for binId, seqs in binSeqs.items():
            for seqId, seq in seqs.items():
                header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                if bReportHitStats:
                    hitStats = binStats[binId][seqId]
                    header += ' [e-value=%.4g,score=%.1f]' % (hitStats[0], hitStats[1])

                fout.write(header + '\n')
                fout.write(seq + '\n')
                numSeqs += 1

    if numSeqs > 0:
        alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
        HA = HMMERRunner(mode='align')
        HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

        maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
        self.__maskAlignment(alignSeqFile, maskedSeqFile)

        if not bKeepUnmaskedAlign:
            os.remove(alignSeqFile)

    # the unaligned file is created on every call, so it is always removed
    os.remove(unalignSeqFile)
def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
    """Align and mask every sequence that hit a given marker.

    Steps:
      1. Write each sequence in binSeqs to '<markerId>.unaligned.faa'
         inside alignOutputDir, one FASTA record per sequence.
      2. If any sequence was written, align the file against hmmModelFile
         into '<markerId>.aligned.faa' using self.outputFormat, then hand
         the alignment to self.__maskAlignment with the target path
         '<markerId>.masked.faa'.
      3. Delete the unaligned file, and the unmasked alignment unless
         bKeepUnmaskedAlign is set.

    Parameters:
      markerId: marker identifier used as the file name stem.
      binSeqs: mapping bin id -> mapping sequence id -> sequence string.
      binStats: mapping bin id -> mapping sequence id -> indexable hit
        statistics (index 0 printed as e-value, index 1 as score); only
        consulted when bReportHitStats is true.
      bReportHitStats: whether to annotate FASTA headers with hit stats.
      alignOutputDir: directory in which all files are created.
      hmmModelFile: HMM model file used for the alignment.
      bKeepUnmaskedAlign: if true, the unmasked alignment is retained.
    """
    unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')

    numSeqs = 0
    # 'with' guarantees the handle is released if writing a record raises
    with open(unalignSeqFile, 'w') as fout:
        for binId, seqs in binSeqs.items():
            for seqId, seq in seqs.items():
                header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                if bReportHitStats:
                    hitStats = binStats[binId][seqId]
                    header += ' [e-value=%.4g,score=%.1f]' % (hitStats[0], hitStats[1])

                fout.write(header + '\n')
                fout.write(seq + '\n')
                numSeqs += 1

    # nothing to align when no sequence hit this marker
    if numSeqs > 0:
        alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
        HA = HMMERRunner(mode='align')
        HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

        maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
        self.__maskAlignment(alignSeqFile, maskedSeqFile)

        if not bKeepUnmaskedAlign:
            os.remove(alignSeqFile)

    # the unaligned file exists regardless of numSeqs, so always clean it up
    os.remove(unalignSeqFile)
def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Worker loop: align the genes of each queued marker to its HMM.

    Marker ids are read from queueIn until a None sentinel is received.
    For every marker the genes assigned to it in each genome are written
    to '<model>.faa' in outputGeneDir, aligned against '<model>.hmm' from
    outputModelDir into '<model>.aln.faa', and that alignment is handed to
    self.__maskAlignment together with the target path
    '<model>.aln.masked.faa'. The model name is then put on queueOut.

    Parameters:
      genomeIds: iterable of genome ids whose genes are collected.
      genesInGenomes: mapping genome id -> mapping marker id -> gene ids.
      outputGeneDir: directory receiving the sequence and alignment files.
      outputModelDir: directory holding the '<model>.hmm' files.
      queueIn: queue of marker ids, terminated by None.
      queueOut: queue receiving the model name of each finished marker.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # sentinel: no more markers to process
            break

        # model files use the 'PF' prefix where marker ids use 'pfam'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        alignedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        maskedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

        # the context manager closes the file even if reading a genome fails
        with open(markerSeqFile, 'w') as fout:
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't
                        # always perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                    markerSeqFile,
                    alignedSeqFile,
                    trim=False,
                    outputFormat='Pfam')
        self.__maskAlignment(alignedSeqFile, maskedSeqFile)

        queueOut.put(modelName)
def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Consume marker ids from a queue and build a masked alignment for each.

    The loop ends when None is read from queueIn. For each marker id:
      * the matching genes of every trusted genome are written as FASTA
        records ('>genomeId|geneId') to '<model>.faa' in outputGeneDir;
      * the records are aligned to '<model>.hmm' (from outputModelDir),
        producing '<model>.aln.faa';
      * self.__maskAlignment is called with the alignment and the target
        path '<model>.aln.masked.faa';
      * the model name is put on queueOut.

    Parameters:
      allTrustedGenomeIds: iterable of genome ids whose genes are collected.
      genesInGenomes: mapping genome id -> mapping marker id -> gene ids.
      outputGeneDir: directory receiving the sequence and alignment files.
      outputModelDir: directory holding the '<model>.hmm' files.
      queueIn: queue of marker ids, terminated by None.
      queueOut: queue receiving the model name of each finished marker.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # None marks the end of the work queue
            break

        # marker ids spell the prefix 'pfam' while model names use 'PF'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        alignedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        maskedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

        # 'with' releases the handle even when a genome cannot be read
        with open(markerSeqFile, 'w') as fout:
            for genomeId in allTrustedGenomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # a gene listed for the marker but missing from the
                        # sequence file is skipped; a KeyError here would end
                        # the loop without reporting on queueOut
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                    markerSeqFile,
                    alignedSeqFile,
                    trim=False,
                    outputFormat='Pfam')
        self.__maskAlignment(alignedSeqFile, maskedSeqFile)

        queueOut.put(modelName)
def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
    """Consume marker ids from a queue and build a masked alignment for each.

    The loop ends when None is read from queueIn. For each marker id:
      * the matching genes of every genome are written as FASTA records
        ('>genomeId|geneId') to '<model>.faa' in outputGeneDir;
      * the records are aligned to '<model>.hmm' (from outputModelDir),
        producing '<model>.aln.faa';
      * self.__maskAlignment is called with the alignment and the target
        path '<model>.aln.masked.faa';
      * the model name is put on queueOut.

    Parameters:
      genomeIds: iterable of genome ids whose genes are collected.
      genesInGenomes: mapping genome id -> mapping marker id -> gene ids.
      outputGeneDir: directory receiving the sequence and alignment files.
      outputModelDir: directory holding the '<model>.hmm' files.
      queueIn: queue of marker ids, terminated by None.
      queueOut: queue receiving the model name of each finished marker.
    """
    while True:
        markerId = queueIn.get(block=True, timeout=None)
        if markerId is None:
            # None marks the end of the work queue
            break

        # marker ids spell the prefix 'pfam' while model names use 'PF'
        modelName = markerId
        if modelName.startswith('pfam'):
            modelName = modelName.replace('pfam', 'PF')

        markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
        alignedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
        maskedSeqFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

        # 'with' releases the handle even when a genome cannot be read
        with open(markerSeqFile, 'w') as fout:
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't
                        # always perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')

        hmmer = HMMERRunner('align')
        hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                    markerSeqFile,
                    alignedSeqFile,
                    trim=False,
                    outputFormat='Pfam')
        self.__maskAlignment(alignedSeqFile, maskedSeqFile)

        queueOut.put(modelName)