Beispiel #1
0
    def treeQA(self, options):
        """Run the tree_qa command.

        Loads the per-bin HMM model information stored under
        ``options.tree_dir``, parses the phylogenetic HMMER results into bin
        statistics and passes both to ``TreeParser.printSummary``.
        """
        self.logger.info(
            '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.'
        )

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # models associated with each bin
        modelParser = MarkerSetParser()
        modelInfoPath = os.path.join(
            treeDir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
        modelsPerBin = modelParser.loadBinModels(modelInfoPath)

        # marker gene statistics for each bin
        resultsParser = ResultsParser(modelsPerBin)
        statsPerBin = resultsParser.analyseResults(
            treeDir,
            DefaultValues.BIN_STATS_PHYLO_OUT,
            DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # report the summary in the requested format
        TreeParser().printSummary(
            options.out_format, treeDir, resultsParser,
            options.bTabTable, options.file, statsPerBin)

        # an empty file name means nothing was written to disk
        reportPath = options.file
        if reportPath != '':
            self.logger.info('QA information written to: ' + reportPath)

        self.timeKeeper.printTimeStamp()
    def run(self):
        """Build an HMM file with all marker genes and their clan relatives.

        Gathers every taxonomic-specific and lineage-specific marker gene,
        adds the genes returned by ``PFAM.genesInSameClan``, fetches the
        matching models from ``self.hmms`` into ``self.outputHMMs`` and
        indexes the result. Any existing ``.ssi`` index is removed first.
        The temporary key file listing the accessions is always removed,
        even when fetching or indexing fails.
        """
        # read all taxonomic-specific marker genes
        print('Reading taxonomic-specific marker genes.')
        taxonomicMarkers = set()
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()
        for taxa in taxonMarkerSets.values():
            for markerSet in taxa.values():
                # update in place instead of copying the set on every pass
                taxonomicMarkers.update(markerSet.getMarkerGenes())

        print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

        # read all lineage-specific marker genes
        print('Reading lineage-specific marker genes.')
        lineageMarkers = set()
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        for uniqueId, d in uniqueIdToLineageStatistics.items():
            # SECURITY: eval() executes the stored marker set string as
            # Python code; this is only safe while the node metadata comes
            # from a trusted source.
            markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']),
                                  eval(d['marker set']))
            lineageMarkers.update(markerSet.getMarkerGenes())

        print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

        # gather all marker genes
        markerGenes = taxonomicMarkers.union(lineageMarkers)
        print('  Total marker genes: %d' % len(markerGenes))

        # get genes from same clan as marker genes
        print('Gathering HMMs from the same clan as marker genes.')
        pfam = PFAM()
        genesInSameClan = pfam.genesInSameClan(markerGenes)
        allMarkers = markerGenes.union(genesInSameClan)

        keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            # create file with all model accession numbers
            with open(keyFile, 'w') as fout:
                fout.writelines(modelAcc + '\n' for modelAcc in allMarkers)

            # fetch specified models
            HF = HMMERRunner(mode='fetch')
            HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

            # index the HMM file, discarding any stale index first
            if os.path.exists(self.outputHMMs + '.ssi'):
                os.remove(self.outputHMMs + '.ssi')
            HF.index(self.outputHMMs)
        finally:
            # remove key file, also when one of the steps above failed
            if os.path.exists(keyFile):
                os.remove(keyFile)
 def run(self):
     """Fetch and index the HMMs of all marker genes and same-clan genes.

     The union of taxonomic-specific markers, lineage-specific markers and
     the genes returned by ``PFAM.genesInSameClan`` is written to a
     temporary key file, which is used to fetch the models from
     ``self.hmms`` into ``self.outputHMMs``. The output is then indexed,
     replacing any existing ``.ssi`` file. The key file is deleted even if
     fetching or indexing raises.

     Written so that it runs unchanged on Python 2 and Python 3.
     """
     # read all taxonomic-specific marker genes
     print('Reading taxonomic-specific marker genes.')
     taxonomicMarkers = set()
     taxonParser = TaxonParser()
     taxonMarkerSets = taxonParser.readMarkerSets()
     for taxa in taxonMarkerSets.values():
         for markerSet in taxa.values():
             taxonomicMarkers.update(markerSet.getMarkerGenes())

     print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

     # read all lineage-specific marker genes
     print('Reading lineage-specific marker genes.')
     lineageMarkers = set()
     treeParser = TreeParser()
     uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
     for uniqueId, d in uniqueIdToLineageStatistics.items():
         # SECURITY: eval() runs the stored marker set string as Python
         # code; acceptable only for trusted node metadata.
         markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
         lineageMarkers.update(markerSet.getMarkerGenes())

     print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

     # gather all marker genes
     markerGenes = taxonomicMarkers.union(lineageMarkers)
     print('  Total marker genes: %d' % len(markerGenes))

     # get genes from same clan as marker genes
     print('Gathering HMMs from the same clan as marker genes.')
     pfam = PFAM()
     genesInSameClan = pfam.genesInSameClan(markerGenes)
     allMarkers = markerGenes.union(genesInSameClan)

     keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
     try:
         # create file with all model accession numbers
         with open(keyFile, 'w') as fout:
             for modelAcc in allMarkers:
                 fout.write(modelAcc + '\n')

         # fetch specified models
         HF = HMMERRunner(mode='fetch')
         HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

         # index the HMM file (an old index would otherwise be left behind)
         if os.path.exists(self.outputHMMs + '.ssi'):
             os.remove(self.outputHMMs + '.ssi')
         HF.index(self.outputHMMs)
     finally:
         # remove key file regardless of success
         if os.path.exists(keyFile):
             os.remove(keyFile)
    def run(self):
        """Append marker set statistics to the labels of internal nodes.

        Reads ``uid<TAB>label`` lines from
        ``./experiments/classTree.internal_nodes.tsv``, extends each label
        with ``[# genomes, # markers, # marker sets]`` taken from the node
        metadata, and writes the result to
        ``./experiments/classTree.internal_nodes.metadata.tsv``.

        Raises:
            KeyError: a uid from the input file has no node metadata.
            ValueError: a non-blank input line does not consist of exactly
                two tab-separated fields.
        """
        # read internal nodes file
        metadata = {}
        with open('./experiments/classTree.internal_nodes.tsv') as fin:
            for line in fin:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                uid, label = [x.strip() for x in line.split('\t')]
                metadata[uid] = label

        # read all lineage-specific marker genes
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        for uid in metadata:
            stats = uniqueIdToLineageStatistics[uid]

            # convert once so that '%d' below also works if the metadata
            # stores the genome count as text
            numGenomes = int(stats['# genomes'])

            # SECURITY: eval() executes the stored marker set string; only
            # safe for trusted node metadata.
            markerSet = MarkerSet(uid, 'NA', numGenomes, eval(stats['marker set']))

            metadata[uid] += ' [%d, %d, %d]' % (numGenomes, markerSet.numMarkers(), markerSet.numSets())

        # write out results
        with open('./experiments/classTree.internal_nodes.metadata.tsv', 'w') as fout:
            for uid, label in metadata.items():
                fout.write(uid + '\t' + label + '\n')
Beispiel #5
0
    def lineageSet(self, options, db=None):
        """Run the lineage_set command.

        Parses the phylogenetic HMMER results found under
        ``options.tree_dir`` and asks ``TreeParser.getBinMarkerSets`` to
        write the marker sets to ``options.marker_file``. `db` is accepted
        but not used by this method.
        """
        self.logger.info(
            '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # models associated with each bin
        modelParser = MarkerSetParser()
        modelInfoPath = os.path.join(
            treeDir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
        modelsPerBin = modelParser.loadBinModels(modelInfoPath)

        # marker gene statistics (the return value is not needed here)
        hitParser = ResultsParser(modelsPerBin)
        hitParser.analyseResults(
            treeDir,
            DefaultValues.BIN_STATS_PHYLO_OUT,
            DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # These options are incompatible with how the lineage-specific
        # marker set is selected, so fixed values overwrite whatever the
        # caller supplied.
        fixedOptions = (
            ('num_genomes_markers', 2),
            ('bootstrap', 0),
            ('bRequireTaxonomy', False),
        )
        for optionName, optionValue in fixedOptions:
            setattr(options, optionName, optionValue)

        TreeParser().getBinMarkerSets(
            options.tree_dir,
            options.marker_file,
            options.num_genomes_markers,
            options.bootstrap,
            options.bNoLineageSpecificRefinement,
            options.bForceDomain,
            options.bRequireTaxonomy,
            hitParser,
            options.unique,
            options.multi)

        self.logger.info('Marker set written to: ' + options.marker_file)

        self.timeKeeper.printTimeStamp()
    def run(self):
        """Write internal node labels annotated with marker set statistics.

        Each ``uid<TAB>label`` line of
        ``./experiments/classTree.internal_nodes.tsv`` has
        ``[# genomes, # markers, # marker sets]`` appended to its label,
        using the node metadata from ``TreeParser.readNodeMetadata``. The
        annotated labels are written to
        ``./experiments/classTree.internal_nodes.metadata.tsv``.

        Raises:
            KeyError: a uid from the input file has no node metadata.
            ValueError: a non-blank input line does not consist of exactly
                two tab-separated fields.
        """
        # read internal nodes file
        metadata = {}
        with open('./experiments/classTree.internal_nodes.tsv') as fin:
            for line in fin:
                if not line.strip():
                    # skip blank lines instead of failing on the unpack
                    continue
                uid, label = [x.strip() for x in line.split('\t')]
                metadata[uid] = label

        # read all lineage-specific marker genes
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        for uid in metadata:
            stats = uniqueIdToLineageStatistics[uid]

            # a single conversion feeds both MarkerSet and the '%d' format,
            # which rejects a genome count stored as text
            numGenomes = int(stats['# genomes'])

            # SECURITY: eval() executes the stored marker set string; only
            # safe for trusted node metadata.
            markerSet = MarkerSet(uid, 'NA', numGenomes,
                                  eval(stats['marker set']))

            metadata[uid] += ' [%d, %d, %d]' % (numGenomes,
                                                markerSet.numMarkers(),
                                                markerSet.numSets())

        # write out results
        with open('./experiments/classTree.internal_nodes.metadata.tsv', 'w') as fout:
            for uid, label in metadata.items():
                fout.write(uid + '\t' + label + '\n')
    def run(self, ubiquityThreshold, minGenomes):
        """Report marker genes missing or duplicated below each internal node.

        For every internal node of the genome tree whose '# genomes' value is
        at least `minGenomes`, the marker genes of the node and of all its
        ancestors are collected and passed, together with the genomes below
        the node, to ``markerSetBuilder.missingGenes`` and
        ``markerSetBuilder.duplicateGenes``. One line per internal node
        (node id, missing genes, duplicate genes) is written to
        ``missing_duplicate_genes_50.tsv``; nodes below the cut-off are
        written with empty lists.

        ubiquityThreshold -- forwarded unchanged to missingGenes/duplicateGenes
        minGenomes -- minimum '# genomes' a node needs in order to be analysed
        """
        # Pre-compute gene count table
        print('Computing gene count table.')
        start = time.time()
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print('    globalGeneCountTable: %.2f' % (end - start))

        # read selected node for defining marker set
        # NOTE(review): selectedMarkerNode is not used further down in this
        # method; the file is still read so missing input fails as before.
        print('Reading node defining marker set for each internal node.')
        selectedMarkerNode = {}
        with open('/srv/whitlam/bio/db/checkm/selected_marker_sets.tsv') as fin:
            for line in fin:
                lineSplit = line.split('\t')
                selectedMarkerNode[lineSplit[0].strip()] = lineSplit[1].strip()

        # read duplicate taxa
        print('Reading list of identical taxa in genome tree.')
        duplicateTaxa = {}
        with open('/srv/whitlam/bio/db/checkm/genome_tree/genome_tree.derep.txt') as fin:
            for line in fin:
                lineSplit = line.split()
                if len(lineSplit) > 1:
                    duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # read in node metadata
        print('Reading node metadata.')
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()

        # read genome tree
        print('Reading in genome tree.')

        treeFile = '/srv/whitlam/bio/db/checkm/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre'
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # determine lineage-specific gene loss and duplication (relative to potential marker genes used by a node)
        print('Determining lineage-specific gene loss and duplication')

        # collect the internal nodes once instead of once per use
        internalNodes = tree.internal_nodes()
        numInternalNodes = len(internalNodes)

        with open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv', 'w') as fout:
            for processed, node in enumerate(internalNodes, 1):
                statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (processed, numInternalNodes, float(processed)*100/numInternalNodes)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                nodeId = node.label.split('|')[0]

                missingGenes = []
                duplicateGenes = []

                nodeStats = uniqueIdToLineageStatistics[nodeId]
                # NOTE(review): assumes '# genomes' is numeric here -- confirm
                # against TreeParser.readNodeMetadata.
                if nodeStats['# genomes'] >= minGenomes:
                    # get marker genes defined for current node along with all parental nodes
                    markerGenes = set()
                    parentNode = node
                    while parentNode is not None:
                        parentNodeId = parentNode.label.split('|')[0]

                        stats = uniqueIdToLineageStatistics[parentNodeId]
                        # SECURITY: eval() executes the stored marker set
                        # string; only safe for trusted node metadata.
                        markerSet = MarkerSet(parentNodeId, stats['taxonomy'], stats['# genomes'], eval(stats['marker set']))
                        markerGenes.update(markerSet.getMarkerGenes())

                        parentNode = parentNode.parent_node

                    # silly hack since PFAM ids are inconsistent between the PFAM data and IMG data
                    revisedMarkerGeneIds = set()
                    for mg in markerGenes:
                        if mg.startswith('PF'):
                            # rsplit drops the text after the last '.' and
                            # leaves ids without a '.' intact (slicing at
                            # rfind() == -1 would cut off the last character)
                            revisedMarkerGeneIds.add(mg.rsplit('.', 1)[0].replace('PF', 'pfam'))
                        else:
                            revisedMarkerGeneIds.add(mg)

                    # get all genomes below the internal node (including genomes removed as duplicates);
                    # every leaf is added exactly once
                    genomeIds = []
                    for leaf in node.leaf_nodes():
                        leafLabel = leaf.taxon.label
                        genomeIds.append(leafLabel.replace('IMG_', ''))
                        for genomeId in duplicateTaxa.get(leafLabel, []):
                            genomeIds.append(genomeId.replace('IMG_', ''))

                    missingGenes = self.markerSetBuilder.missingGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
                    duplicateGenes = self.markerSetBuilder.duplicateGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)

                fout.write('%s\t%s\t%s\n' % (nodeId, str(missingGenes), str(duplicateGenes)))

        # finish the '\r'-updated progress line
        sys.stdout.write('\n')