def taxonList(self, options, db=None):
    """List the available taxonomic-specific marker sets.

    Logs a banner through ``self.logger``, asks a new ``TaxonParser`` to
    list the marker sets for ``options.rank`` and finally prints a time
    stamp through ``self.timeKeeper``.

    ``db`` is accepted for signature compatibility; it is not used here.
    """
    banner = '[CheckM - taxon_list] Listing available taxonomic-specific marker sets.'
    self.logger.info(banner)

    TaxonParser().list(options.rank)

    self.timeKeeper.printTimeStamp()
def run(self):
    """Write an HMM file holding all marker genes and their clan relatives.

    Steps:
      1. Collect the marker genes of every taxonomic-specific marker set
         returned by ``TaxonParser.readMarkerSets()``.
      2. Collect the marker genes of every lineage-specific marker set
         described by ``TreeParser.readNodeMetadata()``.
      3. Add the genes ``PFAM.genesInSameClan()`` reports for that union.
      4. Write the accessions to a temporary key file, fetch those models
         from ``self.hmms`` into ``self.outputHMMs`` and index the result.

    The temporary key file is always removed, even when fetching or
    indexing raises.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            # update in place instead of rebuilding the set on every pass
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever expression is stored in
        # the 'marker set' field, so this is only safe while the node
        # metadata comes from a trusted source.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index is removed before re-indexing
        if os.path.exists(self.outputHMMs + '.ssi'):
            os.remove(self.outputHMMs + '.ssi')
        HF.index(self.outputHMMs)
    finally:
        # remove key file (it may not exist if open() itself failed)
        if os.path.exists(keyFile):
            os.remove(keyFile)
def run(self):
    """Write an HMM file holding all marker genes and their clan relatives.

    Gathers the marker genes of all taxonomic-specific marker sets
    (``TaxonParser.readMarkerSets()``) and all lineage-specific marker
    sets (``TreeParser.readNodeMetadata()``), extends them with the genes
    reported by ``PFAM.genesInSameClan()``, then fetches those models from
    ``self.hmms`` into ``self.outputHMMs`` and indexes the result.

    Written with single-argument ``print(...)`` calls and ``dict.items()``
    / ``dict.values()`` so the method runs unchanged on Python 2 and 3.
    The temporary key file is always removed, even when fetching or
    indexing raises.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            # update in place instead of rebuilding the set on every pass
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever expression is stored in
        # the 'marker set' field, so this is only safe while the node
        # metadata comes from a trusted source.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index is removed before re-indexing
        if os.path.exists(self.outputHMMs + '.ssi'):
            os.remove(self.outputHMMs + '.ssi')
        HF.index(self.outputHMMs)
    finally:
        # remove key file (it may not exist if open() itself failed)
        if os.path.exists(keyFile):
            os.remove(keyFile)
def taxonSet(self, options, db=None):
    """Generate a taxonomic-specific marker set file.

    Ensures the directory part of ``options.marker_file`` exists (when it
    has one), asks a new ``TaxonParser`` to write the marker set for
    ``options.rank`` / ``options.taxon`` to that file, logs the output
    location when the parser reports a valid set, and prints a time stamp.

    ``db`` is accepted for signature compatibility; it is not used here.
    """
    self.logger.info('[CheckM - taxon_set] Generate taxonomic-specific marker set.')

    # only create a directory when the marker file path actually names one
    outputDir = os.path.dirname(options.marker_file)
    if outputDir:
        makeSurePathExists(outputDir)

    wroteValidSet = TaxonParser().markerSet(options.rank,
                                            options.taxon,
                                            options.marker_file)
    if wroteValidSet:
        self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def run(self):
    """Plot marker-gene density against scaffold length for archaeal drafts.

    Selects the genomes whose first taxonomy entry is 'Archaea', drops the
    ones ``self.img.filterGenomeIds`` returns for status 'Finished', and
    keeps those with at least ``minScaffolds`` scaffolds. For each kept
    genome the domain-level marker genes are located on scaffolds, the
    fraction of markers per scaffold is accumulated, summary statistics
    are printed and a scatter plot is saved to
    './experiments/plotScaffoldLenVsMarkers.png'.

    The per-base rates are reported as 0.0 when the corresponding total
    length is zero instead of raising ZeroDivisionError.
    """
    minScaffolds = 20
    shortScaffoldLen = 10000  # scaffolds below this length count as "short"

    # get all draft genomes consisting of a user-specific minimum number of scaffolds
    print('')
    metadata = self.img.genomeMetadata()
    print(' Total genomes: %d' % len(metadata))

    arGenome = set(genomeId for genomeId in metadata
                   if metadata[genomeId]['taxonomy'][0] == 'Archaea')

    draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
    print(' Number of draft genomes: %d' % len(draftGenomeIds))

    genomeIdsToTest = set(genomeId for genomeId in draftGenomeIds
                          if metadata[genomeId]['scaffold count'] >= minScaffolds)
    print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

    print('')
    print(' Calculating genome information for calculating marker sets:')
    genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)

    print(' Calculating genome sequence lengths.')
    genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)

    print(' Determining domain-specific marker sets.')
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
    arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
    print(' There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers)))

    print(' Determining percentage of markers on each scaffold.')
    totalMarkers = 0
    totalSequenceLen = 0
    markersOnShortScaffolds = 0
    totalShortScaffoldLen = 0

    scaffoldLen = {}
    percentageMarkers = defaultdict(float)
    for genomeId, markerIds in genomeFamilyScaffolds.items():
        domain = metadata[genomeId]['taxonomy'][0]
        markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers

        for markerId in markerGenes:
            if markerId.startswith('PF'):
                # rewrite 'PFxxxxx.v' accessions as 'pfamxxxxx' (version dropped)
                # before looking them up in markerIds
                markerId = markerId.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]

            if markerId not in markerIds:
                continue

            for scaffoldId in markerIds[markerId]:
                seqLen = genomeSeqLens[genomeId][scaffoldId]
                scaffoldLen[scaffoldId] = seqLen
                percentageMarkers[scaffoldId] += 1.0 / len(markerGenes)

                totalMarkers += 1
                totalSequenceLen += seqLen

                if seqLen < shortScaffoldLen:
                    markersOnShortScaffolds += 1
                    totalShortScaffoldLen += seqLen

    # guard the rates: with no (short) scaffolds carrying markers the total
    # length is zero and the plain division would raise ZeroDivisionError
    shortRate = float(markersOnShortScaffolds) / totalShortScaffoldLen if totalShortScaffoldLen else 0.0
    totalRate = float(totalMarkers) / totalSequenceLen if totalSequenceLen else 0.0

    # NOTE(review): the lengths printed here are the raw summed values even
    # though the label says 'Mbp' -- confirm the intended unit.
    print('Markers on short scaffolds: %d over %d Mbp (%f markers per base)' % (markersOnShortScaffolds, totalShortScaffoldLen, shortRate))
    print('Total markers on scaffolds: %d over %d Mbp (%f markers per base)' % (totalMarkers, totalSequenceLen, totalRate))

    print(' Create plot.')
    plotLens = []
    plotPerMarkers = []
    for scaffoldId, fraction in percentageMarkers.items():
        plotLens.append(scaffoldLen[scaffoldId])
        plotPerMarkers.append(fraction / scaffoldLen[scaffoldId] * 1e6)

    scatterPlot = ScatterPlot()
    scatterPlot.plot(plotLens, plotPerMarkers)
    scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')
def run(self):
    """Plot marker-gene density against scaffold length for archaeal drafts.

    Selects the genomes whose first taxonomy entry is 'Archaea', drops the
    ones ``self.img.filterGenomeIds`` returns for status 'Finished', and
    keeps those with at least ``minScaffolds`` scaffolds. For each kept
    genome the domain-level marker genes are located on scaffolds, the
    fraction of markers per scaffold is accumulated, summary statistics
    are printed and a scatter plot is saved to
    './experiments/plotScaffoldLenVsMarkers.png'.

    Written with single-argument ``print(...)`` calls and ``dict.items()``
    so the method runs unchanged on Python 2 and 3. The per-base rates are
    reported as 0.0 when the corresponding total length is zero instead of
    raising ZeroDivisionError.
    """
    minScaffolds = 20
    shortScaffoldLen = 10000  # scaffolds below this length count as "short"

    # get all draft genomes consisting of a user-specific minimum number of scaffolds
    print('')
    metadata = self.img.genomeMetadata()
    print(' Total genomes: %d' % len(metadata))

    arGenome = set(genomeId for genomeId in metadata
                   if metadata[genomeId]['taxonomy'][0] == 'Archaea')

    draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
    print(' Number of draft genomes: %d' % len(draftGenomeIds))

    genomeIdsToTest = set(genomeId for genomeId in draftGenomeIds
                          if metadata[genomeId]['scaffold count'] >= minScaffolds)
    print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

    print('')
    print(' Calculating genome information for calculating marker sets:')
    genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)

    print(' Calculating genome sequence lengths.')
    genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)

    print(' Determining domain-specific marker sets.')
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
    arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
    print(' There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers)))

    print(' Determining percentage of markers on each scaffold.')
    totalMarkers = 0
    totalSequenceLen = 0
    markersOnShortScaffolds = 0
    totalShortScaffoldLen = 0

    scaffoldLen = {}
    percentageMarkers = defaultdict(float)
    for genomeId, markerIds in genomeFamilyScaffolds.items():
        domain = metadata[genomeId]['taxonomy'][0]
        markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers

        for markerId in markerGenes:
            if markerId.startswith('PF'):
                # rewrite 'PFxxxxx.v' accessions as 'pfamxxxxx' (version dropped)
                # before looking them up in markerIds
                markerId = markerId.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]

            if markerId not in markerIds:
                continue

            for scaffoldId in markerIds[markerId]:
                seqLen = genomeSeqLens[genomeId][scaffoldId]
                scaffoldLen[scaffoldId] = seqLen
                percentageMarkers[scaffoldId] += 1.0 / len(markerGenes)

                totalMarkers += 1
                totalSequenceLen += seqLen

                if seqLen < shortScaffoldLen:
                    markersOnShortScaffolds += 1
                    totalShortScaffoldLen += seqLen

    # guard the rates: with no (short) scaffolds carrying markers the total
    # length is zero and the plain division would raise ZeroDivisionError
    shortRate = float(markersOnShortScaffolds) / totalShortScaffoldLen if totalShortScaffoldLen else 0.0
    totalRate = float(totalMarkers) / totalSequenceLen if totalSequenceLen else 0.0

    # NOTE(review): the lengths printed here are the raw summed values even
    # though the label says 'Mbp' -- confirm the intended unit.
    print('Markers on short scaffolds: %d over %d Mbp (%f markers per base)' % (markersOnShortScaffolds, totalShortScaffoldLen, shortRate))
    print('Total markers on scaffolds: %d over %d Mbp (%f markers per base)' % (totalMarkers, totalSequenceLen, totalRate))

    print(' Create plot.')
    plotLens = []
    plotPerMarkers = []
    for scaffoldId, fraction in percentageMarkers.items():
        plotLens.append(scaffoldLen[scaffoldId])
        plotPerMarkers.append(fraction / scaffoldLen[scaffoldId] * 1e6)

    scatterPlot = ScatterPlot()
    scatterPlot.plot(plotLens, plotPerMarkers)
    scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')