Example #1
    def run(self, outputFile):
        img = IMG()

        print 'Identifying all IMG prokaryotic genomes with valid data.'
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print '  Identified %d valid genomes.' % (len(genomeIds))

        print 'Calculating gene copy number for each genome.'
        countTable = img.geneCountTable(genomeIds)

        counts = []
        for _, count in countTable['pfam00318'].iteritems():
            counts.append(count)

        print len(genomeIds)
        print len(counts)
        print mean(counts)

        fout = open(outputFile, 'w')
        fout.write(str(countTable))
        fout.close()

        print 'Gene count dictionary to: ' + outputFile
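The count table used above appears to map a gene family identifier to per-genome copy counts. A minimal sketch of the same mean copy-number calculation on a toy dictionary, assuming that structure (the family, genome ids, and counts below are made up for illustration):

from numpy import mean

# hypothetical count table: family id -> {genome id: copy count}
countTable = {'pfam00318': {'g1': 1, 'g2': 1, 'g3': 2}}

counts = list(countTable['pfam00318'].values())
print(len(counts))   # 3 genomes with a count
print(mean(counts))  # mean copy number, ~1.33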
Example #2
        def run(self):
            img = IMG()

            fout = open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1)

            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            print 'PFAM marker genes: ' + str(len(pfamMarkers))
            print 'TIGR marker genes: ' + str(len(tigrMarkers))
            print ''

            # run HMMs on each of the finished genomes
            genomeIds = img.genomeIds('Finished')
            for genomeId in genomeIds:
                print genomeId + ':'
                fout.write(genomeId + ':\n')

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                fout.write('  ORF results:\n')
                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

                #self.translateSixFrames(genomeId)
                #self.runPFAM_SixFrames(genomeId)
                #self.runTIGRFAM_SixFrames(genomeId)

                #fout.write('  Six-frame translation results:\n')
                #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)

            fout.close()
Example #3
        def run(self):
            img = IMG()

            fout = open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1)

            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            genomeIds = img.genomeIds('Finished')
            for genomeId in genomeIds:
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                fout.write('  ORF results:\n')
                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

                #self.translateSixFrames(genomeId)
                #self.runPFAM_SixFrames(genomeId)
                #self.runTIGRFAM_SixFrames(genomeId)

                #fout.write('  Six-frame translation results:\n')
                #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)

            fout.close()
Example #4
    def __init__(self, outputDir):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir,
                                                   'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir,
                                           'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir,
                                            'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir,
                                               'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir,
                                                  'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir,
                                              'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)
Example #5
class ClassTree(object):
    def __init__(self):
        self.img = IMG()
    
    def run(self):
        tree = dendropy.Tree.get_from_path('../data/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre', schema='newick', as_rooted=True, preserve_underscores=True)
        
        metadata = self.img.genomeMetadata()
        
        # relabel taxa
        for leaf in tree.leaf_nodes():
            genomeId = leaf.taxon.label.replace('IMG_', '')
            classT = metadata[genomeId]['taxonomy'][2]
            newLeafLabel = classT + '_' + genomeId
            leaf.taxon.label = newLeafLabel
            
        # relabel internal nodes
        for node in tree.internal_nodes():
            uid, taxaStr, bootstrap = node.label.split('|')
            
            if bootstrap:
                node.label = uid + ':' + str(node.edge_length) + '[' + str(int(float(bootstrap)*100 + 0.5)) + ']'
            else:
                node.label = uid + ':' + str(node.edge_length)

        # write out reduced tree
        tree.write_to_path('./experiments/classTree.tre', schema='newick', suppress_rooting=True, suppress_edge_lengths=True, unquoted_underscores=True)
        tree.write_to_path('./experiments/classTree_no_internal.tre', schema='newick', suppress_rooting=True, suppress_edge_lengths=True, unquoted_underscores=True, suppress_internal_node_labels=True)
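A self-contained sketch of the leaf-relabeling step above, assuming dendropy 4.x (the toy newick string and the class-name prefix are illustrative only):

import dendropy

tree = dendropy.Tree.get(data='((IMG_1,IMG_2),IMG_3);', schema='newick',
                         preserve_underscores=True)

# strip the 'IMG_' prefix and prepend a class name, mirroring the loop above
for leaf in tree.leaf_node_iter():
    genomeId = leaf.taxon.label.replace('IMG_', '')
    leaf.taxon.label = 'Clostridia_' + genomeId

print(tree.as_string(schema='newick'))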
Example #7
    def run(self, outputFile):
        img = IMG()

        print('Identifying all IMG prokaryotic genomes with valid data.')
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print('  Identified %d valid genomes.' % (len(genomeIds)))

        print('Calculating gene copy number for each genome.')
        countTable = img.geneCountTable(genomeIds)

        counts = []
        for _, count in countTable['pfam00318'].iteritems():
            counts.append(count)

        print(len(genomeIds))
        print(len(counts))
        print(mean(counts))

        fout = open(outputFile, 'w')
        fout.write(str(countTable))
        fout.close()

        print('Gene count dictionary to: ' + outputFile)
Example #8
    def __init__(self, outputDir):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        #self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906   # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)
Example #9
class MarkerSetStabilityTest(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)

                markerGenes = markerGenes - tigrToRemove

                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds,
                                                    numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(
                        subsetGenomeIds, geneCountTable,
                        ubiquityThreshold * numGenomesToSelect,
                        singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                        subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    perChange.append(
                        float(
                            len(
                                markerGenes.symmetric_difference(
                                    subsetMarkerGenes))) * 100.0 /
                        len(markerGenes))

            if perChange != []:
                queueOut.put(
                    (lineage, len(genomeIds), len(markerGenes),
                     numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes),
                              numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""

        fout = open(outputFile, 'w')
        fout.write(
            'Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n'
        )

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(
                block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                       (lineage, numGenomes, numMarkerGenes, numSampledGenomes,
                        meanPerChange, stdPerChange))

        sys.stdout.write('\n')

        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""

        print('  Testing stability of marker sets:')

        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
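Several of these examples share the same multiprocessing layout: worker processes drain an input queue until they hit a None sentinel, while one writer process collects results from an output queue. A stripped-down, runnable sketch of that pattern (the toy workload and names are illustrative, not part of the library):

import multiprocessing as mp

def worker(queueIn, queueOut):
    # consume work items until the None sentinel arrives
    while True:
        item = queueIn.get(block=True, timeout=None)
        if item is None:
            break
        queueOut.put((item, item * item))

def writer(numItems, queueOut):
    # a single process gathers all results
    for _ in range(numItems):
        item, result = queueOut.get(block=True, timeout=None)
        print('%d -> %d' % (item, result))

if __name__ == '__main__':
    numWorkers = 4
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    items = list(range(10))
    for item in items:
        workerQueue.put(item)
    for _ in range(numWorkers):
        workerQueue.put(None)  # one sentinel per worker

    workerProc = [mp.Process(target=worker, args=(workerQueue, writerQueue))
                  for _ in range(numWorkers)]
    writeProc = mp.Process(target=writer, args=(len(items), writerQueue))

    writeProc.start()
    for p in workerProc:
        p.start()
    for p in workerProc:
        p.join()
    writeProc.join()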
Example #10
    def __init__(self):
        self.img = IMG()
Example #11
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
Example #12
class Simulation(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
        
        self.simContigLen = 10000
        
    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node."""
        
        # get genomes descendant from each child of the specified internal node
        leaves = []
        for child in internalNode.child_nodes(): 
            genomeIds = set()  
            for leaf in child.leaf_nodes():
                genomeId = leaf.taxon.label.replace('IMG_', '')
                genomeIds.add(genomeId)
                
                duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    dupId = dup.replace('IMG_', '')
                    genomeIds.add(dupId)
                 
            leaves.append(genomeIds)
            
        # make sure each set of leaves contains at least a minimum number of genomes
        orderedLeaves = sorted(leaves, key=len)
        if len(orderedLeaves[0]) < 5:
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return
                   
        # calculate marker genes with all genomes in lineage with the fewest genomes removed 
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = orderedLeaves[0])
        
        # evaluate accuracy of completeness and contamination estimations on different partial genomes from lineage with fewest genomes   
        testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100))    
        
        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)
        
        for testGenomeId in testGenomeIds:   
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            
            repsPerGenome = 100
            for _ in xrange(0, repsPerGenome): 
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)   
      
                for ms in binMarkerGenes.markerSetIter():  
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)      
                    if completeness == 0.0:
                        print ms.getMarkerGenes()
                        print geneDistTable[testGenomeId]
                        print startPartialGenomeContigs
                        print genomeSize
                        print '*****************' + testGenomeId
                        sys.exit()
                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)
            
        # determine lineage-specific marker set with best average performance
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        
        for lineageStr in deltaComp:
            dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr])))

            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(abs(array(deltaComp[lineageStr])))
                dContStdBest = std(abs(array(deltaCont[lineageStr])))
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))
                        
    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode == None:
                break
            
            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)      
                      
    def __writerThread(self, numInternalNodes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        fout = open('/tmp/simInferBestMarkerSet.tsv', 'w')
        fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

        itemsProcessed = 0
        while True:
            internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
            if internalNode == None:
                break
            
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            if internalNode != 'NA':
                fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest)) 
            
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        random.seed(0)
          
        print '\n  Calculating global gene count table.'
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())
          
        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
            
        print '  Evaluating %d internal nodes.' % len(tree.internal_nodes())
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for internalNode in tree.internal_nodes():
            if internalNode.parent_node != None:
                workerQueue.put(internalNode)

        for _ in range(numThreads):
            workerQueue.put(None)

        metadata = self.img.genomeMetadata()
        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(tree.internal_nodes())-1, writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
Example #13
    def run(self, geneTreeDir, treeExtension, consistencyThreshold, minTaxaForAverage, outputFile, outputDir):
        # make sure output directory is empty
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        files = os.listdir(outputDir)
        for f in files:
            if os.path.isfile(os.path.join(outputDir, f)):
                os.remove(os.path.join(outputDir, f))

        # get TIGRFam info
        descDict = {}
        files = os.listdir('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO')
        for f in files:
            shortDesc = longDesc = ''
            for line in open('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO/' + f):
                lineSplit = line.split('  ')
                if lineSplit[0] == 'AC':
                    acc = lineSplit[1].strip()
                elif lineSplit[0] == 'DE':
                    shortDesc = lineSplit[1].strip()
                elif lineSplit[0] == 'CC':
                    longDesc = lineSplit[1].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get PFam info
        for line in open('/srv/db/pfam/27/Pfam-A.clans.tsv'):
            lineSplit = line.split('\t')
            acc = lineSplit[0]
            shortDesc = lineSplit[3]
            longDesc = lineSplit[4].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get IMG taxonomy
        img = IMG()
        metadata = img.genomeMetadata()
        genomeIdToTaxonomy = {}
        for genomeId, m in metadata.iteritems():
            genomeIdToTaxonomy[genomeId] = m['taxonomy']

        # perform analysis for each tree
        treeFiles = os.listdir(geneTreeDir)
        allResults = {}
        allTaxa = [set([]), set([]), set([])]
        taxaCounts = {}
        avgConsistency = {}
        for treeFile in treeFiles:
            if not treeFile.endswith(treeExtension):
                continue

            print treeFile
            tree = dendropy.Tree.get_from_path(os.path.join(geneTreeDir, treeFile), schema='newick', as_rooted=True, preserve_underscores=True)

            domainConsistency = {}
            phylaConsistency = {}
            classConsistency = {}
            consistencyDict = [domainConsistency, phylaConsistency, classConsistency]

            # get abundance of taxa at different taxonomic ranks
            totals = [{}, {}, {}]
            leaves = tree.leaf_nodes()
            print '  Number of leaves: ' + str(len(leaves))
            totalValidLeaves = 0

            for leaf in leaves:
                genomeId = self.__genomeId(leaf.taxon.label)

                if genomeId not in metadata:
                    print '[Error] Genome is missing metadata: ' + genomeId
                    sys.exit()

                totalValidLeaves += 1
                taxonomy = genomeIdToTaxonomy[genomeId]
                for r in xrange(0, 3):
                    totals[r][taxonomy[r]] = totals[r].get(taxonomy[r], 0) + 1
                    consistencyDict[r][taxonomy[r]] = 0
                    allTaxa[r].add(taxonomy[r])

            taxaCounts[treeFile] = [totalValidLeaves, totals[0].get('Bacteria', 0), totals[0].get('Archaea', 0)]

            # find highest consistency nodes (congruent descendant taxa / (total taxa + incongruent descendant taxa))
            internalNodes = tree.internal_nodes()
            for node in internalNodes:
                leaves = node.leaf_nodes()

                for r in xrange(0, 3):
                    leafCounts = {}
                    for leaf in leaves:
                        genomeId = self.__genomeId(leaf.taxon.label)
                        taxonomy = genomeIdToTaxonomy[genomeId]
                        leafCounts[taxonomy[r]] = leafCounts.get(taxonomy[r], 0) + 1

                    # calculate consistency for node
                    for taxa in consistencyDict[r]:
                        totalTaxaCount = totals[r][taxa]
                        if totalTaxaCount <= 1 or taxa == 'unclassified':
                            consistencyDict[r][taxa] = 'N/A'
                            continue

                        taxaCount = leafCounts.get(taxa, 0)
                        incongruentTaxa = len(leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

                        # consider clan in other direction since the trees are unrooted
                        taxaCount = totalTaxaCount - leafCounts.get(taxa, 0)
                        incongruentTaxa = totalValidLeaves - len(leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

            # write results
            consistencyDir = os.path.join(outputDir, 'consistency')
            if not os.path.exists(consistencyDir):
                os.makedirs(consistencyDir)
            fout = open(os.path.join(consistencyDir, treeFile + '.results.tsv'), 'w')
            fout.write('Tree')
            for r in xrange(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    fout.write('\t' + taxa)
            fout.write('\n')

            fout.write(treeFile)
            for r in xrange(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    if consistencyDict[r][taxa] != 'N/A':
                        fout.write('\t%.2f' % (consistencyDict[r][taxa]*100))
                    else:
                        fout.write('\tN/A')
            fout.close()

            # calculate average consistency at each taxonomic rank
            average = []
            for r in xrange(0, 3):
                sumConsistency = []
                for taxa in consistencyDict[r]:
                    if totals[r][taxa] > minTaxaForAverage and consistencyDict[r][taxa] != 'N/A':
                        sumConsistency.append(consistencyDict[r][taxa])

                if len(sumConsistency) > 0:
                    average.append(sum(sumConsistency) / len(sumConsistency))
                else:
                    average.append(0)
            avgConsistency[treeFile] = average
            allResults[treeFile] = consistencyDict

            print '  Average consistency: ' + str(average) + ', mean = %.2f' % (sum(average)/len(average))
            print ''

        # print out combined results
        fout = open(outputFile, 'w')
        fout.write('Tree\tShort Desc.\tLong Desc.\tAlignment Length\t# Taxa\t# Bacteria\t# Archaea\tAvg. Consistency\tAvg. Domain Consistency\tAvg. Phylum Consistency\tAvg. Class Consistency')
        for r in xrange(0, 3):
            for t in sorted(allTaxa[r]):
                fout.write('\t' + t)
        fout.write('\n')

        filteredGeneTrees = 0
        retainedGeneTrees = 0
        for treeFile in sorted(allResults.keys()):
            consistencyDict = allResults[treeFile]
            treeId = treeFile[0:treeFile.find('.')].replace('pfam', 'PF')

            fout.write(treeId + '\t' + descDict[treeId][0] + '\t' + descDict[treeId][1])

            # Taxa count
            fout.write('\t' + str(taxaCounts[treeFile][0]) + '\t' + str(taxaCounts[treeFile][1]) + '\t' + str(taxaCounts[treeFile][2]))

            avgCon = 0
            for r in xrange(0, 3):
                avgCon += avgConsistency[treeFile][r]
            avgCon /= 3
            fout.write('\t' + str(avgCon))
            
            if avgCon >= consistencyThreshold:
                retainedGeneTrees += 1
                os.system('cp ' + os.path.join(geneTreeDir, treeFile) + ' ' + os.path.join(outputDir, treeFile))
            else:
                filteredGeneTrees += 1
                print 'Filtered %s with an average consistency of %.4f.' % (treeFile, avgCon)

            for r in xrange(0, 3):
                fout.write('\t' + str(avgConsistency[treeFile][r]))

            for r in xrange(0, 3):
                for t in sorted(allTaxa[r]):
                    if t in consistencyDict[r]:
                        if consistencyDict[r][t] != 'N/A':
                            fout.write('\t%.2f' % (consistencyDict[r][t]*100))
                        else:
                            fout.write('\tN/A')
                    else:
                        fout.write('\tN/A')
            fout.write('\n')
        fout.close()

        print 'Retained gene trees: ' + str(retainedGeneTrees)
        print 'Filtered gene trees: ' + str(filteredGeneTrees)
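The per-node consistency score above is: congruent descendant taxa divided by (total members of the taxon in the tree plus incongruent descendants of the node). A small helper with a worked example, assuming that same definition (names and numbers are illustrative):

def nodeConsistency(taxaUnderNode, totalTaxa, leavesUnderNode):
    # congruent descendants / (all members of the taxon + incongruent descendants)
    incongruent = leavesUnderNode - taxaUnderNode
    return float(taxaUnderNode) / (totalTaxa + incongruent)

# a clade of 12 leaves holding 10 of the 11 genomes labelled with some phylum:
# 10 / (11 + 2) = 0.77
print('%.2f' % nodeConsistency(10, 11, 12))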
Example #14
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()
Example #15
class DecorateTree(object):
    def __init__(self):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()
    
    def __meanStd(self, metadata, genomeIds, category):
        values = []
        for genomeId in genomeIds:
            genomeId = genomeId.replace('IMG_', '')
            v = metadata[genomeId][category]
            if v != 'NA':
                values.append(v)
            
        return mean(values), std(values)
    
    def __calculateMarkerSet(self, genomeLabels, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97):
        """Calculate marker set for a set of genomes."""
        
        # get genome IDs from genome labels
        genomeIds = set()
        for genomeLabel in genomeLabels:
            genomeIds.add(genomeLabel.replace('IMG_', ''))
            
        markerSet = self.markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
              
        return markerSet.markerSet
    
    def __pfamIdToPfamAcc(self, img):
        pfamIdToPfamAcc = {}
        for line in open(img.pfamHMMs):
            if 'ACC' in line:
                acc = line.split()[1].strip()
                pfamId = acc.split('.')[0]
                
                pfamIdToPfamAcc[pfamId] = acc
        
        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut, numThreads):    
        # read genome metadata
        print '  Reading metadata.'
        metadata = self.img.genomeMetadata()
                
        # read list of taxa with duplicate sequences
        print '  Read list of taxa with duplicate sequences.'
        duplicateTaxa = {}
        for line in open(derepFile):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]
                
        # build gene count table
        print '  Building gene count table.'
        genomeIds = self.img.genomeMetadata().keys()
        print '    # trusted genomes = ' + str(len(genomeIds))
        
        # calculate statistics for each internal node using multiple threads
        print '  Calculating statistics for each internal node.'
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads)
     
    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads):  
        
        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)
               
        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
    
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()
        
        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))
            
        for _ in range(numThreads):
            workerQueue.put((None, None))
            
        calcProc = [mp.Process(target = self.__processInternalNode, args = (taxaTree, duplicateTaxa, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportStatistics, args = (metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()
        
    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        
        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None) 
            if uniqueId == None:
                break 
            
            # find corresponding internal node in taxa tree
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        labels.append(genomeId)
                                                                              
            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels = labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')
                
            # give node a unique Id while retaining bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap
            
            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)
            
            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap, node.oid, nodeLabel))
            
    def __reportStatistics(self, metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Store statistics for internal node."""
                
        fout = open(metadataOut, 'w')
        fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
        fout.write('\tGC mean\tGC std')
        fout.write('\tGenome size mean\tGenome size std')
        fout.write('\tGene count mean\tGene count std')
        fout.write('\tMarker set')
        fout.write('\n')

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())
        while True:
            uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(block=True, timeout=None)
            if uniqueId == None:
                break
            
            numProcessedNodes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (numProcessedNodes, numInternalNodes, float(numProcessedNodes)*100/numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' + taxaStr + '\t' + bootstrap)
            
            m, s = self.__meanStd(metadata, labels, 'GC %')
            fout.write('\t' + str(m*100) + '\t' + str(s*100))
            
            m, s = self.__meanStd(metadata, labels, 'genome size')
            fout.write('\t' + str(m) + '\t' + str(s))
            
            m, s = self.__meanStd(metadata, labels, 'gene count')
            fout.write('\t' + str(m) + '\t' + str(s))
            
            # change model names to accession numbers, and make
            # sure there is an HMM model for each PFAM
            mungedMarkerSets = []
            for geneSet in markerSet:
                s = set()
                for geneId in geneSet:
                    if 'pfam' in geneId:
                        pfamId = geneId.replace('pfam', 'PF')
                        if pfamId in pfamIdToPfamAcc:                      
                            s.add(pfamIdToPfamAcc[pfamId])
                    else:
                        s.add(geneId)
                mungedMarkerSets.append(s)
            
            fout.write('\t' + str(mungedMarkerSets))
            
            fout.write('\n')
            
            node = inputTree.find_node(filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
            node.label = nodeLabel
            
        sys.stdout.write('\n')
        
        fout.close()
        
        inputTree.write_to_path(inputTreeFile, schema='newick', suppress_rooting=True, unquoted_underscores=True)
Example #16
    def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
        allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
        trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
        filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        metadataOut = open(outputMetadataFile, 'w')
        
        # read input metadata file
        metadata = img.genomeMetadataFromFile(inputMetadataFile)
        
        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)
        
        metadataLine = {}
        
        bHeader = True
        for line in open(inputMetadataFile):
            if bHeader:
                metadataOut.write(line)
                bHeader = False
                continue
            
            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            domain = lineSplit[1]
            status = lineSplit[2]
            
            if status == 'Finished':
                finishedGenomes[domain].add(genomeId)
            
            allGenomes[domain].add(genomeId)
            metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        for lineage, allLineageGenomeIds in allGenomes.iteritems():
            print '[' + lineage + ']'
            print '  Number of genomes: %d' % len(allLineageGenomeIds)

            # tabulate genomes from each phylum
            allPhylumCounts = {}
            for genomeId in allLineageGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

            # identify marker genes for finished genomes
            print '\nDetermining initial marker gene sets for genome filtering.'
            markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

            print '  Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets())
            fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
            fout.write(str(markerSet.markerSet))
            fout.close()

            # identifying trusted genomes (highly complete, low contamination genomes)
            print '\nIdentifying highly complete, low contamination genomes.'
            trustedGenomeIds = set()
            filteredGenomes = set()
            retainedStatus = {}
            filteredStatus = {}
            geneCountTable = img.geneCountTable(allLineageGenomeIds)
            for genomeId in allLineageGenomeIds:
                completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)
                
                genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                    trustedOut.write(genomeStr)
                    allOut.write(genomeStr)
                    
                    metadataOut.write(metadataLine[genomeId])
                else:
                    filteredGenomes.add(genomeId)
                    filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                    filteredOut.write(genomeStr)
                    allOut.write(genomeStr)

            print '  Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds))
            print '  ' + str(filteredStatus)
            print '  \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds))
            print '  ' + str(retainedStatus)

            # determine status of retained genomes
            print '\nTrusted genomes by phylum:'
            trustedPhylumCounts = {}
            for genomeId in trustedGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

            for phylum, count in allPhylumCounts.iteritems():
                print '  ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count)
            print ''

        allOut.close()
        trustedOut.close()
        filteredOut.close()
        metadataOut.close()

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in xrange(0, 6): # Domain to Genus
            for genomeId, data in metadata.iteritems():
                taxaStr = ';'.join(data['taxonomy'][0:r+1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n')
        fout.close()
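The trusted/filtered split in this example ultimately reduces to two thresholds on the estimated completeness and contamination. A toy sketch of that decision (all genome ids, estimates, and threshold values below are made up):

# hypothetical (completeness, contamination) estimates per genome
estimates = {
    'g1': (99.1, 0.4),
    'g2': (72.5, 1.2),
    'g3': (97.8, 6.3),
}

trustedCompleteness = 95.0
trustedContamination = 5.0

trustedGenomeIds, filteredGenomes = set(), set()
for genomeId, (completeness, contamination) in estimates.items():
    if completeness >= trustedCompleteness and contamination <= trustedContamination:
        trustedGenomeIds.add(genomeId)
    else:
        filteredGenomes.add(genomeId)

print('Trusted: %s' % sorted(trustedGenomeIds))    # ['g1']
print('Filtered: %s' % sorted(filteredGenomes))    # ['g2', 'g3']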
Example #17
    def __init__(self):
        img = IMG()
        self.metadata = img.genomeMetadata()
Example #18
class GenomeTreeWorkflow(object):
    def __init__(self, outputDir):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        #self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906   # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""

        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] hmmfetch is not on the system path"
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""

        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] FastTree is not on the system path"
            sys.exit()

    def __genesInGenomes(self, genomeIds):
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch  /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' + markerIdToName[markerId] + ' > ' + os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm'))
            else:
                os.system('hmmfetch  /srv/whitlam/bio/db/tigrfam/13.0/' + markerId + '.HMM ' + markerId + ' > ' + os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for _, markerId in enumerate(markerGenes):
            workerQueue.put(markerId)

        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target = self.__runHmmAlign, args = (genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportThreads, args = (len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId == None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Store confidence intervals (i.e., to shared memory)."""

        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId == None:
                break

            numProcessedGenes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes)*100/numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment
        seqs = {}
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        # output masked sequences in FASTA format
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.iteritems():
            fout.write('>' + seqId + '\n')

            maskedSeq = ''.join([seq[i] for i in xrange(0, len(seq)) if mask[i] == 'x'])
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""

        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)
        
        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())
            
        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)
        
        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold )

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print ''
        print 'Identifying genomes and marker genes of interest:'
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)
        #alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)

        print '  Identified ingroup genomes: %d' % len(ingroupGenomeIds)
        print '  Identified outgroup genomes: %d' % len(outgroupGenomeIds)

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print ''
        print '  Selecting %d taxa from the outgroup.' % (numOutgroupTaxa)
        genomeIds = ingroupGenomeIds.union(random.sample(outgroupGenomeIds, numOutgroupTaxa))

        self.imgIdsToAceIds(genomeIds)

        print '  Identified markers: %d' % len(ingroupMarkers)

        # get mapping of marker ids to gene ids for each genome
        print '  Determining genes for genomes of interest.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print '  Fetching the HMM for each marker gene.'
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print '  Aligning marker genes:'
        self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
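        """Map IMG genome ids to ACE ids using the ggg_tax_img mapping file."""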
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print '  Number of genomes without an ACE id: ' + str(missing)

        return imgIdToAceId
    
    def aceIdsToImgIds(self):
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):

        # identify genes suitable for phylogenetic inference
        print '--- Identifying genes suitable for phylogenetic inference ---'
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print ''
        print '--- Inferring gene trees ---'
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print ''
        print '--- Testing for paralogs in gene trees ---'
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print ''
        print '--- Testing taxonomic consistency of gene trees ---'
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print ''
        print '--- Gathering phylogenetically informative HMMs ---'
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print ''
        print '--- Inferring full genome tree ---'
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues = True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())

            for genomeId in genomeIds:
                if genomeId in imgIdToAceId:
                    tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])

        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
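The final step above rewrites IMG identifiers as ACE identifiers directly in the Newick string. A minimal standalone sketch of that relabelling, assuming the same tab-separated mapping file (ACE id, then IMG id) used by imgIdsToAceIds; file names are illustrative only:

def relabelTree(newickFile, mappingFile, outputFile):
    # build IMG id -> ACE id mapping (mapping file: ACE id <tab> IMG id per line)
    imgToAce = {}
    for line in open(mappingFile):
        fields = line.rstrip('\n').split('\t')
        imgToAce[fields[1]] = fields[0]

    # taxa in the tree are labelled 'IMG_<genome id>'
    tree = open(newickFile).read()
    for imgId, aceId in imgToAce.items():
        tree = tree.replace('IMG_' + imgId, aceId)

    open(outputFile, 'w').write(tree)

# e.g., relabelTree('genome_tree.tre', 'ggg_tax_img.feb_2014.txt', 'genome_tree.ace_ids.tre')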
Exemple #19
0
 def __init__(self):
     self.markerSetBuilder = MarkerSetBuilder()
     self.img = IMG()
Exemple #20
0
class Simulation(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def run(self):
        print('\n  Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                'genome_tree', 'genome_tree_prok.refpkg',
                                'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           as_rooted=True,
                                           preserve_underscores=True)

        # get all Finished, Trusted genomes
        metadata = self.img.genomeMetadata()
        bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria',
                                                 metadata)
        print('# Bacteria: %d' % len(bacteriaIds))

        start = time.time()
        #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print('globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print('precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000)
        end = time.time()
        print('precomputeGenomeFamilyPositions: %.2f' % (end - start))

        #start = time.time()
        #test = self.img.geneDistTable(metadata.keys(), self.markerSetBuilder.globalGeneCountTable.keys(), spacingBetweenContigs=1e6)
        #end = time.time()
        #print 'geneDistTable: %.2f' % (end - start)

        #t = raw_input('waiting...')

        start = time.time()
        #testGenomeId = archaeaIds.pop()
        #testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
        #binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, 0.97, 0.97, bMarkerSet = True)
        markerSet = self.markerSetBuilder.buildMarkerSet(
            bacteriaIds, 0.97, 0.97)
        end = time.time()
        print('buildMarkerSet: %.2f' % (end - start))

        print(len(markerSet.markerSet))
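        # regression check: evaluate a hard-coded expected marker set and compare it set-by-set below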
        test = eval(
            "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]"
        )
        print(len(test))

        for ms in markerSet.markerSet:
            bMatch = False
            for tms in test:
                if tms == ms:
                    print(ms)
                    print(tms)
                    print('---------')
                    bMatch = True
                    break

            if not bMatch:
                print('BOO!')

        if str(
                markerSet.markerSet
        ) == "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]":
            print('Good to go!')
        else:
            print('Marker sets do NOT match!')
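The string comparison above is order-sensitive: str() of a list of sets depends on set iteration order, so an identical marker set can still fail the check. A small sketch of an order-insensitive comparison (toy data only, not the full expected set):

def sameMarkerSets(computed, expected):
    """Compare two collections of marker gene sets, ignoring ordering."""
    return {frozenset(s) for s in computed} == {frozenset(s) for s in expected}

computed = [set(['pfam00318']), set(['TIGR00064', 'pfam01000'])]
expected = [set(['pfam01000', 'TIGR00064']), set(['pfam00318'])]
print(sameMarkerSets(computed, expected))  # True despite different ordering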
Exemple #21
0
    def run(self, metadataFile, percentThreshold):
        img = IMG()

        metadata = img.genomeMetadataFromFile(metadataFile)

        matches = {}
        pfamCount = {}
        tigrCount = {}
        for genomeCounter, genomeId in enumerate(metadata):
            statusStr = '  Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, len(metadata), float(genomeCounter+1)*100/len(metadata))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            if metadata[genomeId]['status'] == 'Finished':
                pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension

                if not os.path.exists(pfamFile):
                    continue

                # get PFAM hits
                geneIdToPfams = {}
                bHeader = True
                for line in open(pfamFile):
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    if lineSplit[0] in geneIdToPfams:
                        geneIdToPfams[lineSplit[0]].add(lineSplit[8])
                    else:
                        geneIdToPfams[lineSplit[0]] = set([lineSplit[8]])

                    if lineSplit[8] in pfamCount:
                        pfamCount[lineSplit[8]].add(genomeId)
                    else:
                        pfamCount[lineSplit[8]] = set([genomeId])

                # get TIGRFAM hits
                geneIdToTigr = {}
                bHeader = True
                for line in open(img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension):
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    if lineSplit[0] in geneIdToTigr:
                        geneIdToTigr[lineSplit[0]].add(lineSplit[6])
                    else:
                        geneIdToTigr[lineSplit[0]] = set([lineSplit[6]])

                    if lineSplit[6] in tigrCount:
                        tigrCount[lineSplit[6]].add(genomeId)
                    else:
                        tigrCount[lineSplit[6]] = set([genomeId])

                # keep track of TIGRFAMs matching the same gene as a PFAM
                geneIds = set(geneIdToPfams.keys()).union(set(geneIdToTigr.keys()))
                for geneId in geneIds:
                    pfams = geneIdToPfams.get(geneId, None)
                    tigrs = geneIdToTigr.get(geneId, None)

                    if pfams == None or tigrs == None:
                        continue

                    for pfamId in pfams:
                        for tigrId in tigrs:
                            key = pfamId + '-' + tigrId
                            if key in matches:
                                matches[key].add(genomeId)
                            else:
                                matches[key] = set([genomeId])

        sys.stdout.write('\n')

        # find TIGRFAMs that generally hit the same gene as a PFAM
        fout = open('../data/pfam/tigrfam2pfam.tsv', 'w')
        for key, genomeSet in matches.items():
            pfam, tigr = key.split('-')

            # deem a TIGRFAM HMM redundant if it almost always hits the
            # same ORF as a PFAM HMM
            if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                fout.write(pfam + '\t' + tigr + '\n')
        fout.close()
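The filtering step above flags a TIGRFAM as redundant when, across genomes, it nearly always hits an ORF that is also hit by some PFAM. A condensed restatement of that decision rule on toy data:

def redundantTigrfams(matches, tigrCount, percentThreshold):
    # matches: 'pfam-tigr' pair -> genomes where both hit the same ORF
    # tigrCount: TIGRFAM id -> genomes containing the TIGRFAM
    redundant = []
    for key, genomeSet in matches.items():
        pfam, tigr = key.split('-')
        if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
            redundant.append((pfam, tigr))
    return redundant

# toy data: TIGR00001 co-hits the same ORF as pfam00001 in 9 of its 10 genomes
matches = {'pfam00001-TIGR00001': set('g%d' % i for i in range(9))}
tigrCount = {'TIGR00001': set('g%d' % i for i in range(10))}
print(redundantTigrfams(matches, tigrCount, 0.90))  # [('pfam00001', 'TIGR00001')]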
Exemple #22
0
    def run(self, inputMetadataFile, outputMetadataFile, outputDir,
            ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination):
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
        allOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
        trustedOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'),
                           'w')
        filteredOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        metadataOut = open(outputMetadataFile, 'w')

        # read input metadata file
        metadata = img.genomeMetadataFromFile(inputMetadataFile)

        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)

        metadataLine = {}

        bHeader = True
        for line in open(inputMetadataFile):
            if bHeader:
                metadataOut.write(line)
                bHeader = False
                continue

            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            domain = lineSplit[1]
            status = lineSplit[2]

            if status == 'Finished':
                finishedGenomes[domain].add(genomeId)

            allGenomes[domain].add(genomeId)
            metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        for lineage, allLineageGenomeIds in allGenomes.items():
            print('[' + lineage + ']')
            print('  Number of genomes: %d' % len(allLineageGenomeIds))

            # tabulate genomes from each phylum
            allPhylumCounts = {}
            for genomeId in allLineageGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

            # identify marker genes for finished genomes
            print(
                '\nDetermining initial marker gene sets for genome filtering.')
            markerSet = markerSetBuilder.buildMarkerSet(
                finishedGenomes[lineage], ubiquityThreshold,
                singleCopyThreshold)

            print(
                '  Marker set consists of %s marker genes organized into %d sets.'
                % (markerSet.numMarkers(), markerSet.numSets()))
            fout = open(
                os.path.join(outputDir,
                             'trusted_marker_sets_' + lineage + '.txt'), 'w')
            fout.write(str(markerSet.markerSet))
            fout.close()

            # identifying trusted genomes (highly complete, low contamination genomes)
            print('\nIdentifying highly complete, low contamination genomes.')
            trustedGenomeIds = set()
            filteredGenomes = set()
            retainedStatus = {}
            filteredStatus = {}
            geneCountTable = img.geneCountTable(allLineageGenomeIds)
            for genomeId in allLineageGenomeIds:
                completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                    markerSet.markerSet, genomeId, geneCountTable)

                genomeStr = self.__genomeString(genomeId, metadata,
                                                completeness, contamination,
                                                missingMarkers,
                                                duplicateMarkers)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    retainedStatus[metadata[genomeId]
                                   ['status']] = retainedStatus.get(
                                       metadata[genomeId]['status'], 0) + 1

                    trustedOut.write(genomeStr)
                    allOut.write(genomeStr)

                    metadataOut.write(metadataLine[genomeId])
                else:
                    filteredGenomes.add(genomeId)
                    filteredStatus[metadata[genomeId]
                                   ['status']] = filteredStatus.get(
                                       metadata[genomeId]['status'], 0) + 1

                    filteredOut.write(genomeStr)
                    allOut.write(genomeStr)

            print('  Filtered genomes: %d (%.2f%%)' %
                  (len(filteredGenomes),
                   len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
            print('  ' + str(filteredStatus))
            print('  \nTrusted genomes: %d (%.2f%%)' %
                  (len(trustedGenomeIds),
                   len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
            print('  ' + str(retainedStatus))

            # determine status of retained genomes
            print('\nTrusted genomes by phylum:')
            trustedPhylumCounts = {}
            for genomeId in trustedGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon,
                                                                     0) + 1

            for phylum, count in allPhylumCounts.items():
                print('  ' + phylum + ': %d of %d' %
                      (trustedPhylumCounts.get(phylum, 0), count))
            print('')

        allOut.close()
        trustedOut.close()
        filteredOut.close()
        metadataOut.close()

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' +
                       str(trustedStats.get(lineage, 0)) + '\n')
        fout.close()
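The trusted/filtered split above reduces to a simple predicate on the completeness and contamination estimates returned by genomeCheck. A minimal sketch; the default thresholds below are illustrative only (the example passes them in as trustedCompleteness and trustedContamination):

def isTrusted(completeness, contamination, trustedCompleteness=95.0, trustedContamination=5.0):
    # values are percentages; thresholds here are illustrative defaults only
    return completeness >= trustedCompleteness and contamination <= trustedContamination

print(isTrusted(98.2, 1.3))  # True
print(isTrusted(80.0, 0.5))  # False: below the completeness threshold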
Exemple #23
0
class MarkerSetSelection(object):
    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self,
                        genomeIds,
                        ubiquityThreshold=0.97,
                        singleCopyThreshold=0.97,
                        stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""

        # quick escape for lineages that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using LOO-testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(
                looGenomeIds, geneCountTable,
                ubiquityThreshold * len(looGenomeIds),
                singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove

            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(
                    looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))
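        # stable if the mean pairwise change between LOO marker sets is a small fraction of the mean set size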
        return (float(mean(diffMarkerSet)) /
                mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:

            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves,
                                   percentileTest):
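        """Check whether the genome's distance to the marker-set node falls below the given percentile of leaf distances."""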

        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata,
                              taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""

        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node
            if curNode == None:
                # reached the root, so use the universal marker set
                uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds,
                       queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId,
                                                     metadata,
                                                     taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[
                testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(
                testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected,
                          dCompSelected, dContSelected, bestUID,
                          numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        fout = open('./experiments/markerSetSelection.tsv', 'w')

        fout.write(
            'Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n'
        )

        itemsToProcess = 0

        dComps = []
        dConts = []

        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []

        selectedComp = []
        selectedCont = []

        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(
                block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test genomes.' % (
                itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 /
                (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write(
                '%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n'
                % (testGenomeId, uidSelected, numDescendantsSelected,
                   dCompSelected, dContSelected, bestUID, numDescendantsBest,
                   dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)

            dCompsPer.append(dComp * 100.0 / dCompBest)
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)

            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print('  General results:')
        print('   Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print('   Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print('   Selected comp: %.2f +/- %.2f' %
              (mean(selectedComp), std(selectedComp)))
        print('   Selected cont: %.2f +/- %.2f' %
              (mean(selectedCont), std(selectedCont)))
        print('')
        print('   Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print('   Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print('   Delta comp per error: %.1f +/- %.1f' %
              (mean(dCompsPer), std(dCompsPer)))
        print('   Delta cont per error: %.1f +/- %.1f' %
              (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
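        """Sum branch lengths from a node up to the specified ancestor."""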
        dist = 0

        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length

            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
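        """Get the number of ancestral nodes, the distance to the best marker-set node, and the mean distance from that node to its other leaves."""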
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length

            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):

        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(
                genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(
                genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest -
                           avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print('    # descendants: %.2f +/- %.2f' %
              (mean(numDescendants), std(numDescendants)))
        print('    # nodes to bin: %.2f +/- %.2f' %
              (mean(nodesToBin), std(nodesToBin)))
        print('    Distance to bin: %.2f +/- %.2f' %
              (mean(distanceToBin), std(distanceToBin)))

        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print('    Distance to bin - average distance to leaf: %.2f +/- %.2f' %
              (mean(abs(distanceToBin - avgDistanceToLeaf)),
               std(abs(distanceToBin - avgDistanceToLeaf))))
        print(
            '    Percent difference to average leaf distance: %.2f +/- %.2f' %
            (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        # read reference tree
        print('\n  Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                'genome_tree', 'genome_tree_prok.refpkg',
                                'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           as_rooted=True,
                                           preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print('  Reading simulation results.')

        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[
                    2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print('  Evaluating %d test genomes.' % len(simResults))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(target=self.__workerThread,
                       args=(tree, simResults, metadata, taxonToGenomeIds,
                             workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()
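The example above (and several of the examples that follow) uses the same multiprocessing pattern: work items go onto a queue, one None sentinel per worker terminates the workers, and a single writer process collects results until it receives its own sentinel. A stripped-down sketch of the pattern with a trivial squaring worker:

import multiprocessing as mp

def worker(queueIn, queueOut):
    """Square each item pulled from the queue until a None sentinel arrives."""
    while True:
        item = queueIn.get()
        if item is None:
            break
        queueOut.put(item * item)

def writer(writerQueue):
    """Collect results in a single process until the None sentinel arrives."""
    while True:
        result = writerQueue.get()
        if result is None:
            break
        print(result)

if __name__ == '__main__':
    numWorkers = 2
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for item in [1, 2, 3, 4]:
        workerQueue.put(item)
    for _ in range(numWorkers):
        workerQueue.put(None)            # one sentinel per worker

    workers = [mp.Process(target=worker, args=(workerQueue, writerQueue))
               for _ in range(numWorkers)]
    writeProc = mp.Process(target=writer, args=(writerQueue,))

    writeProc.start()
    for p in workers:
        p.start()
    for p in workers:
        p.join()

    writerQueue.put(None)                # workers are done; stop the writer
    writeProc.join()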
Exemple #24
0
class GenomeTreeWorkflow(object):
    def __init__(self, outputDir):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir,
                                                   'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir,
                                           'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir,
                                            'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir,
                                               'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir,
                                                  'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir,
                                              'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""

        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] hmmfetch is not on the system path"
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""

        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] FastTree is not on the system path"
            sys.exit()

    def __genesInGenomes(self, genomeIds):
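        """Map marker ids (PFAM and TIGRFAM) to the genes they hit in each genome."""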
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            for line in open(
                    os.path.join(IMG.genomeDir, genomeId,
                                 genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            for line in open(
                    os.path.join(IMG.genomeDir, genomeId,
                                 genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
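        """Extract the HMM for each marker gene from the PFAM/TIGRFAM databases with hmmfetch."""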
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch  /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' +
                          markerIdToName[markerId] + ' > ' +
                          os.path.join(outputModelDir,
                                       markerId.replace('pfam', 'PF') +
                                       '.hmm'))
            else:
                os.system('hmmfetch  /srv/whitlam/bio/db/tigrfam/13.0/' +
                          markerId + '.HMM ' + markerId + ' > ' +
                          os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes,
                       numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for _, markerId in enumerate(markerGenes):
            workerQueue.put(markerId)

        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__runHmmAlign,
                       args=(genomeIds, genesInGenomes, outputGeneDir,
                             outputModelDir, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__reportThreads,
                               args=(len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId == None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                 genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        os.path.join(outputGeneDir, modelName + '.aln.faa'),
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(
                os.path.join(outputGeneDir, modelName + '.aln.faa'),
                os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Store confidence intervals (i.e., to shared memory)."""

        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId == None:
                break

            numProcessedGenes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) marker genes.' % (
                numProcessedGenes, numGenes,
                float(numProcessedGenes) * 100 / numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment
        seqs = {}
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.',
                                                                  '-').strip()

        # output masked sequences in FASTA format
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.iteritems():
            fout.write('>' + seqId + '\n')

            maskedSeq = ''.join(
                [seq[i] for i in xrange(0, len(seq)) if mask[i] == 'x'])
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold,
                           singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""

        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(
            genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold,
                       numThreads, outputGeneDir, outputModelDir,
                       outgroupSize):
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print ''
        print 'Identifying genomes and marker genes of interest:'
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers(
            'Bacteria;Firmicutes', metadata, phyloUbiquityThreshold,
            phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy(
            'Bacteria;Coprothermobacter', metadata)
        # alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)

        print '  Identified ingroup genomes: %d' % len(ingroupGenomeIds)
        print '  Identified outgroup genomes: %d' % len(outgroupGenomeIds)

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print ''
        print '  Selecting %d taxa from the outgroup.' % (numOutgroupTaxa)
        genomeIds = ingroupGenomeIds.union(
            random.sample(outgroupGenomeIds, numOutgroupTaxa))

        self.imgIdsToAceIds(genomeIds)

        print '  Identified markers: %d' % len(ingroupMarkers)

        # get mapping of marker ids to gene ids for each genome
        print '  Determining genes for genomes of interest.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print '  Fetching the HMM for each marker gene.'
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print '  Aligning marker genes:'
        #***self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print '  Number of genomes without an ACE id: ' + str(missing)

        return imgIdToAceId

    def aceIdsToImgIds(self):
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):

        # identify genes suitable for phylogenetic inference
        print '--- Identifying genes suitable for phylogenetic inference ---'
        genomeIds = self.inferGeneTrees(self.phyloUbiquity,
                                        self.phyloSingleCopy, numThreads,
                                        self.alignmentDir, self.hmmDir,
                                        outgroupSize)

        # infer gene trees
        print ''
        print '--- Inferring gene trees ---'
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa',
                      numThreads)

        # test gene trees for paralogs
        print ''
        print '--- Testing for paralogs in gene trees ---'
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre',
                        self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print ''
        print '--- Testing taxonomic consistency of gene trees ---'
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre',
                            self.consistencyAcceptPer, self.consistencyMinTaxa,
                            self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print ''
        print '--- Gathering phylogenetically informative HMMs ---'
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir,
                                self.phyloHMMsOut)

        # infer genome tree
        print ''
        print '--- Inferring full genome tree ---'
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir,
                            self.alignmentDir,
                            '.aln.masked.faa',
                            self.concatenatedAlignFile,
                            self.treeOut,
                            self.taxonomyOut,
                            bSupportValues=True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())

            for genomeId in genomeIds:
                if genomeId in imgIdToAceId:
                    tree = tree.replace('IMG_' + genomeId,
                                        imgIdToAceId[genomeId])

        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
 def __init__(self):
     self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
     self.looRank = 5
     
     self.markerSetBuilder = MarkerSetBuilder()
     self.img = IMG()
class MarkerSetStability(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                for selectPer in range(50, 101, 5):
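                    # repeatedly subsample selectPer% of the genomes and
                    # measure how much the resulting marker set changes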
                    numGenomesToSelect = int(
                        float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(
                            genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(
                            subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(
                            subsetGenomeIds, geneCountTable,
                            ubiquityThreshold * numGenomesToSelect,
                            singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                            subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        perChange.append(
                            float(
                                len(
                                    markerGenes.symmetric_difference(
                                        subsetMarkerGenes))) * 100.0 /
                            len(markerGenes))

                    changeMarkerSetSize[selectPer] = [
                        mean(perChange), std(perChange)
                    ]

            queueOut.put((lineage, len(genomeIds), len(markerGenes),
                          changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""

        fout = open(outputFile, 'w')
        fout.write(
            'Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n'
        )

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(
                block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                           (lineage, numGenomes, numMarkerGenes, selectPer,
                            changeMarkerSetSize[selectPer][0],
                            changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')

        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""

        print('  Calculating stability of marker sets:')

        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
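        # identify named lineages meeting the minimum genome count and rank criteria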
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']

        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
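            # one sentinel per worker; a None signals __processLineage to exit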
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
class MarkerSetStability(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None) 
            if lineage == None:
                break  
            
            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')
            
            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:  
                # calculate marker set for all genomes in lineage          
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove
     
                for selectPer in xrange(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer)/100 * len(genomeIds))
                    perChange = []
                    for _ in xrange(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold*numGenomesToSelect, singleCopyThreshold*numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove
    
                        perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes)))*100.0 / len(markerGenes))
    
                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]  

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break
                    
            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages)*100/totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            for selectPer in sorted(changeMarkerSetSize.keys()): 
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
            
        fout.close()
        
        
    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""  
        
        print '  Calculating stability of marker sets:'
        
        random.seed(1)
        
        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)
        
        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']
        
        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)
 
        calcProc = [mp.Process(target = self.__processLineage, args = (metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__storeResults, args = (outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
Exemple #28
0
    def run(self, metadataFile, percentThreshold):
        img = IMG()

        metadata = img.genomeMetadataFromFile(metadataFile)

        matches = {}
        pfamCount = {}
        tigrCount = {}
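        # matches: genomes in which a given PFAM/TIGRFAM pair hits the same gene;
        # pfamCount / tigrCount: genomes in which each family is observed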
        for genomeCounter, genomeId in enumerate(metadata):
            statusStr = '  Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, len(metadata), float(genomeCounter+1)*100/len(metadata))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            if metadata[genomeId]['status'] == 'Finished':
                pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension

                if not os.path.exists(pfamFile):
                    continue

                # get PFAM hits
                geneIdToPfams = {}
                bHeader = True
                for line in open(pfamFile):
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    if lineSplit[0] in geneIdToPfams:
                        geneIdToPfams[lineSplit[0]].add(lineSplit[8])
                    else:
                        geneIdToPfams[lineSplit[0]] = set([lineSplit[8]])

                    if lineSplit[8] in pfamCount:
                        pfamCount[lineSplit[8]].add(genomeId)
                    else:
                        pfamCount[lineSplit[8]] = set([genomeId])

                # get TIGRFAM hits
                geneIdToTigr = {}
                bHeader = True
                for line in open(img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension):
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    if lineSplit[0] in geneIdToTigr:
                        geneIdToTigr[lineSplit[0]].add(lineSplit[6])
                    else:
                        geneIdToTigr[lineSplit[0]] = set([lineSplit[6]])

                    if lineSplit[6] in tigrCount:
                        tigrCount[lineSplit[6]].add(genomeId)
                    else:
                        tigrCount[lineSplit[6]] = set([genomeId])

                # keep track of TIGRFAMs matching the same gene as a PFAM
                geneIds = set(geneIdToPfams.keys()).union(set(geneIdToTigr.keys()))
                for geneId in geneIds:
                    pfams = geneIdToPfams.get(geneId, None)
                    tigrs = geneIdToTigr.get(geneId, None)

                    if pfams == None or tigrs == None:
                        continue

                    for pfamId in pfams:
                        for tigrId in tigrs:
                            key = pfamId + '-' + tigrId
                            if key in matches:
                                matches[key].add(genomeId)
                            else:
                                matches[key] = set([genomeId])

        sys.stdout.write('\n')

        # find TIGRFAMs that generally hit the same gene as a PFAM
        fout = open('../data/pfam/tigrfam2pfam.tsv', 'w')
        for key, genomeSet in matches.iteritems():
            pfam, tigr = key.split('-')

            # deem a TIGRFAM HMM redundant if it almost always hits the
            # same ORF as a PFAM HMM
            if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                fout.write(pfam + '\t' + tigr + '\n')
        fout.close()
 def __init__(self):
     self.img = IMG()
     self.markerset = MarkerSet()
Exemple #30
0
 def __init__(self):
     self.img = IMG()
class MarkerSetStabilityTest(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None) 
            if lineage == None:
                break  
            
            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')
            
            markerGenes = []
            perChange = []
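            # each stability trial subsamples 90% of the genomes in the lineage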
            numGenomesToSelect = int(0.9*len(genomeIds))
            if len(genomeIds) >= minGenomes:  
                # calculate marker set for all genomes in lineage          
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)

                markerGenes = markerGenes - tigrToRemove

                for _ in xrange(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold*numGenomesToSelect, singleCopyThreshold*numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes)))*100.0 / len(markerGenes))

            if perChange != []:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))
                
    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break
                    
            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages)*100/totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

        sys.stdout.write('\n')
            
        fout.close()
        
        
    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""  
        
        print '  Testing stability of marker sets:'
        
        random.seed(1)
        
        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)
 
        calcProc = [mp.Process(target = self.__processLineage, args = (metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__storeResults, args = (outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
 def __init__(self):
     self.markerSetBuilder = MarkerSetBuilder()
     self.img = IMG()
     
     self.simContigLen = 10000
Exemple #33
0
class Simulation(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def run(self):
        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        
        # get all Finished, Trusted genomes
        metadata = self.img.genomeMetadata()
        bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria', metadata)
        print '# Bacteria: %d' % len(bacteriaIds)
        
        start = time.time()
        #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print 'globalGeneCountTable: %.2f' % (end - start)
        
        start = time.time()
        #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print 'precomputeGenomeSeqLens: %.2f' % (end - start)
        
        start = time.time()
        #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000)
        end = time.time()
        print 'precomputeGenomeFamilyPositions: %.2f' % (end - start)
        
        #start = time.time()
        #test = self.img.geneDistTable(metadata.keys(), self.markerSetBuilder.globalGeneCountTable.keys(), spacingBetweenContigs=1e6)
        #end = time.time()
        #print 'geneDistTable: %.2f' % (end - start)
        
        #t = raw_input('waiting...')
        
        start = time.time()
        #testGenomeId = archaeaIds.pop()
        #testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
        #binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, 0.97, 0.97, bMarkerSet = True)
        markerSet = self.markerSetBuilder.buildMarkerSet(bacteriaIds, 0.97, 0.97)
        end = time.time()
        print 'buildMarkerSet: %.2f' % (end - start)
        
 
        print len(markerSet.markerSet)
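        # hard-coded marker set used below as a regression check against the
        # newly computed set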
        test = eval("[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]")
        print len(test)
        
        for ms in markerSet.markerSet:
            bMatch = False
            for tms in test:
                if tms == ms:
                    print ms
                    print tms
                    print '---------'
                    bMatch = True
                    break
                
            if not bMatch:
                print 'BOO!'
        
        if str(markerSet.markerSet) == "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]":
            print 'Good to go!'
        else:
            print 'oh, shit!!!!'
Exemple #34
0
    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
class MarkerSetSelection(object):
    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        self.looRank = 5
        
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
        
    def __stabilityTest(self, genomeIds, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97, stabilityThreshold = 0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        
        # quick escape for lineages that are clearly stable
        if len(genomeIds) > 200:
            return True
        
        # calculate marker sets using LOO testing
        looMarkerGenes = []
        for genomeId in genomeIds:
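            # leave the current genome out and recompute the marker set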
            looGenomeIds = genomeIds.difference([genomeId])
            
            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold*len(looGenomeIds), singleCopyThreshold*len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove
            
            looMarkerGenes.append(markerGenes)
            
        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in xrange(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in xrange(i+1, len(looMarkerGenes)):     
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))
                            
        print len(genomeIds), mean(diffMarkerSet), mean(markerSetSize)
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold
        
    def __patristicDist(self, tree, taxa1, taxa2):
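        """Patristic distance: sum of branch lengths along the path between two leaves through their MRCA."""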
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])
        
        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
        
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:                  
                dist += parentNode.edge_length    
                parentNode = parentNode.parent_node
    
                
            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:                  
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node
                
            return dist
        
    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
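        """Test if the genome's distance to the marker set node is below the given percentile of leaf-to-node distances."""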
        
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)
        
        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))          
               
        return distToBin < percentile(distToLeaves, percentileTest)
     
    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""
        
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()
        
        # determine location of genome in tree     
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            
            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))
                
                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))
                          
            # remove genome (LOO-style analysis)
            print 'Full:', len(genomeIds)
            genomeIds.difference_update([genomeId])
            print 'LOO:', len(genomeIds)
            
            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon]) 
            print 'Rank reduced:', len(genomeIds)
              
            print uniqueId
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break
                
            curNode = curNode.parent_node
            if curNode == None:
                # reached the root, so use the universal marker set
                uidSelected = uniqueId
            
        return uidSelected
    
    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].iteritems():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont
                
        return bestUID, numDescendantsBest, dCompBest, dContBest
        
    
    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break
            
            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]
            
            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))
                      
    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')
        
        itemsToProcess = 0
        
        dComps = []
        dConts = []
        
        dCompsPer = []
        dContsPer = []
        
        bestComp = []
        bestCont = []
        
        selectedComp = []
        selectedCont = []
        
        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess)*100/(numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)
            
            # guard against division by zero (as done for dCont below)
            dCompsPer.append(dComp*100.0 / max(dCompBest, 0.01))
            dContsPer.append(dCont*100.0 / max(dContBest, 0.01))
            
            bestComp.append(dCompBest)
            bestCont.append(dContBest)
            
            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()
        
        print ''
        print '  General results:'
        print '   Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp))
        print '   Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont))
        print '   Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp))
        print '   Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont))
        print ''
        print '   Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps))
        print '   Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts))
        print '   Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer))
        print '   Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer))
        
    def __distanceToAncestor(self, leaf, ancestor):
        dist = 0
        
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            
            curNode = curNode.parent_node
        
        return dist
        
    def __bestNodeProperties(self, genomeId, tree, bestUID):
        # determine location of genome in tree     
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
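        # ascend towards the root until the node labelled with bestUID is reached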
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            
            nodesToBin += 1  
            
            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break
            
            distanceToBin += curNode.edge_length
            
            curNode = curNode.parent_node
            
        return nodesToBin, distanceToBin, mean(distanceToLeaves)
        
    def __propertiesOfBestMarkerSets(self, tree, simResults):
        
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)
            
            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)
            
            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)
            
        print '    # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants))
        print '    # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin))
        print '    Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin))
        
        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print '    Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf)))
        print '    Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs))
        print ''

    def run(self, numThreads):
        # read reference tree
        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        
        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print '  Reading simulation results.'
        
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
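                # each row is one simulation result; the first four columns form a composite key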
                lineSplit = line.split('\t')
                
                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])
                
                simResults[simId][uid] = [numDescendants, comp, cont]
                
        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)
                        
        print '  Evaluating %d test genomes.' % len(simResults)
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()