def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
    """Create multiple sequence alignment for markers with multiple hits in a bin.

    Queue consumer: bin ids are read from queueIn until a None sentinel
    is received. For each bin, the markers with multiple hits are
    determined and each one is aligned into a per-bin directory under
    alignOutputDir. The bin id is put on queueOut once the bin has been
    handled, whether or not any alignment was required.

    Parameters:
        resultsParser: passed through to __extractMarkersWithMultipleHits.
        binIdToBinMarkerSets: mapping indexed by bin id; the entry for a
            bin is passed to __extractMarkersWithMultipleHits.
        hmmModelFile: HMM file from which individual marker models are fetched.
        outDir: passed through to __extractMarkersWithMultipleHits.
        alignOutputDir: directory under which one sub-directory per bin
            is created for the alignments.
        queueIn: queue of bin ids, terminated by None.
        queueOut: queue receiving the id of each processed bin.
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        binId = queueIn.get(block=True, timeout=None)
        if binId is None:
            # sentinel: no more bins to process
            break

        markersWithMultipleHits = self.__extractMarkersWithMultipleHits(outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

        if len(markersWithMultipleHits) != 0:
            # create multiple sequence alignments for markers with multiple hits
            binAlignOutputDir = os.path.join(alignOutputDir, binId)
            makeSurePathExists(binAlignOutputDir)

            for markerId in markersWithMultipleHits:
                # each marker model is fetched into its own uniquely named temporary file
                tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                try:
                    HF.fetch(hmmModelFile, markerId, tempModelFile)
                    self.__alignMarker(markerId, markersWithMultipleHits[markerId], None, False, binAlignOutputDir, tempModelFile, bKeepUnmaskedAlign=False)
                finally:
                    # do not leave the temporary model behind if fetching or aligning fails
                    if os.path.exists(tempModelFile):
                        os.remove(tempModelFile)

        queueOut.put(binId)
def __createMSA(self, resultsParser, binIdToBinMarkerSets, hmmModelFile, outDir, alignOutputDir, queueIn, queueOut):
    """Create multiple sequence alignment for markers with multiple hits in a bin.

    Reads bin ids from queueIn until None is received. For every bin the
    markers with multiple hits are extracted; when there are any, each
    marker's model is fetched from hmmModelFile into a temporary file and
    used to align that marker's hits into <alignOutputDir>/<binId>. Each
    processed bin id is reported on queueOut.

    Parameters:
        resultsParser: forwarded to __extractMarkersWithMultipleHits.
        binIdToBinMarkerSets: mapping from bin id to the value forwarded
            to __extractMarkersWithMultipleHits for that bin.
        hmmModelFile: HMM file that individual marker models are fetched from.
        outDir: forwarded to __extractMarkersWithMultipleHits.
        alignOutputDir: parent directory of the per-bin alignment directories.
        queueIn: input queue of bin ids; None signals the end of the work.
        queueOut: output queue on which finished bin ids are placed.
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        binId = queueIn.get(block=True, timeout=None)
        if binId is None:
            # end-of-work sentinel
            break

        markersWithMultipleHits = self.__extractMarkersWithMultipleHits(
            outDir, binId, resultsParser, binIdToBinMarkerSets[binId])

        if len(markersWithMultipleHits) != 0:
            # create multiple sequence alignments for markers with multiple hits
            binAlignOutputDir = os.path.join(alignOutputDir, binId)
            makeSurePathExists(binAlignOutputDir)

            for markerId in markersWithMultipleHits:
                # unique temporary file holding the model of this marker only
                tempModelFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
                try:
                    HF.fetch(hmmModelFile, markerId, tempModelFile)
                    self.__alignMarker(markerId,
                                       markersWithMultipleHits[markerId],
                                       None,
                                       False,
                                       binAlignOutputDir,
                                       tempModelFile,
                                       bKeepUnmaskedAlign=False)
                finally:
                    # always clean up, even when fetching or aligning raises
                    if os.path.exists(tempModelFile):
                        os.remove(tempModelFile)

        queueOut.put(binId)
def __createMarkerHMMs(self, binMarkerSet, outputFile, bReportProgress=True):
    """Create HMM file for markers.

    The marker genes of binMarkerSet are combined with all genes from the
    same PFAM clan, their accessions are written to a temporary key file,
    and the corresponding models are fetched from
    DefaultValues.HMM_MODELS into outputFile, which is then indexed.

    Parameters:
        binMarkerSet: object providing getMarkerGenes().
        outputFile: path of the HMM file to create; an existing
            '<outputFile>.ssi' index is removed before re-indexing.
        bReportProgress: when true, log the number of marker genes and
            of genes from the same PFAM clan.
    """
    # get list of marker genes
    markerGenes = binMarkerSet.getMarkerGenes()

    # get all genes from the same clan as any marker gene
    pfam = PFAM(DefaultValues.PFAM_CLAN_FILE)
    genesInSameClan = pfam.genesInSameClan(markerGenes)

    # extract marker genes along with all genes from the same clan
    allMarkers = markerGenes | genesInSameClan

    if bReportProgress:
        self.logger.info(" There are %d genes in the marker set and %d genes from the same PFAM clan." % (len(markerGenes), len(genesInSameClan)))

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            fout.writelines(modelAcc + '\n' for modelAcc in allMarkers)

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(DefaultValues.HMM_MODELS, keyFile, outputFile, bKeyFile=True)

        # index the HMM file; a stale index is removed first
        indexFile = outputFile + '.ssi'
        if os.path.exists(indexFile):
            os.remove(indexFile)
        HF.index(outputFile)
    finally:
        # remove key file, also when fetching or indexing fails
        if os.path.exists(keyFile):
            os.remove(keyFile)
def __extractModel(self, hmmModelFile, queueIn, queueOut):
    """Extract HMM.

    Queue consumer: (modelId, fetchFilename) pairs are read from queueIn
    and, for each pair, the model is fetched from hmmModelFile with
    fetchFilename as the destination argument. The model id is put on
    queueOut after each fetch. The loop stops at the first pair whose
    model id is None; because every item is unpacked into two values,
    the sentinel must itself be a pair, e.g. (None, None).

    Parameters:
        hmmModelFile: HMM file the models are fetched from.
        queueIn: queue of (modelId, fetchFilename) pairs.
        queueOut: queue receiving the id of each fetched model.
    """
    HF = HMMERRunner(mode='fetch')

    while True:
        modelId, fetchFilename = queueIn.get(block=True, timeout=None)
        if modelId is None:
            # sentinel: no more models to extract
            break

        HF.fetch(hmmModelFile, modelId, fetchFilename)
        queueOut.put(modelId)
def run(self):
    """Build an indexed HMM file containing all marker genes and their clan members.

    Taxonomic-specific and lineage-specific marker genes are gathered,
    extended with the genes from the same PFAM clan, and the matching
    models are fetched from self.hmms into self.outputHMMs, which is then
    indexed. Progress counts are printed to standard output.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever expression is stored in the
        # 'marker set' field; if that field is always a plain literal,
        # ast.literal_eval would be a safer choice - confirm the data format.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            fout.writelines(modelAcc + '\n' for modelAcc in allMarkers)

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index is removed first
        indexFile = self.outputHMMs + '.ssi'
        if os.path.exists(indexFile):
            os.remove(indexFile)
        HF.index(self.outputHMMs)
    finally:
        # remove key file, also when fetching or indexing fails
        if os.path.exists(keyFile):
            os.remove(keyFile)
def run(self):
    """Build an indexed HMM file containing all marker genes and their clan members.

    Collects the taxonomic-specific and lineage-specific marker genes,
    adds the genes from the same PFAM clan, fetches the matching models
    from self.hmms into self.outputHMMs and indexes the result. Progress
    counts are printed to standard output.

    Written so that it is valid under both Python 2 and Python 3:
    single-argument print(...) calls and dict.values()/items() replace
    the Python-2-only print statement and iteritems().
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever expression is stored in the
        # 'marker set' field; if that field is always a plain literal,
        # ast.literal_eval would be a safer choice - confirm the data format.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index is removed first
        indexFile = self.outputHMMs + '.ssi'
        if os.path.exists(indexFile):
            os.remove(indexFile)
        HF.index(self.outputHMMs)
    finally:
        # remove key file, also when fetching or indexing fails
        if os.path.exists(keyFile):
            os.remove(keyFile)