Example #1
    def run(self, outputFile):
        img = IMG()

        print 'Identifying all IMG prokaryotic genomes with valid data.'
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print '  Identified %d valid genomes.' % (len(genomeIds))

        print 'Calculating gene copy number for each genome.'
        countTable = img.geneCountTable(genomeIds)

        counts = []
        for _, count in countTable['pfam00318'].iteritems():
            counts.append(count)

        print len(genomeIds)
        print len(counts)
        print mean(counts)

        fout = open(outputFile, 'w')
        fout.write(str(countTable))
        fout.close()

        print 'Gene count dictionary written to: ' + outputFile
Example #2
    def run(self, outputFile):
        img = IMG()

        print('Identifying all IMG prokaryotic genomes with valid data.')
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print('  Identified %d valid genomes.' % (len(genomeIds)))

        print('Calculating gene copy number for each genome.')
        countTable = img.geneCountTable(genomeIds)
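        # countTable maps each gene family ID to {genome ID: copy number};
        # the loop below collects the copy number of Pfam family pfam00318 in each genome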

        counts = []
        for _, count in countTable['pfam00318'].items():
            counts.append(count)

        print(len(genomeIds))
        print(len(counts))
        print(mean(counts))

        fout = open(outputFile, 'w')
        fout.write(str(countTable))
        fout.close()

        print('Gene count dictionary written to: ' + outputFile)
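
A minimal sketch of how this method might be invoked. The enclosing class is not shown in the example, so the GeneCountTable name below is an assumption and the output path is illustrative; only the run(outputFile) signature is taken from the code above.

# hypothetical driver; GeneCountTable stands in for whatever class defines run()
if __name__ == '__main__':
    GeneCountTable().run('./pfam00318.gene_counts.txt')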
Example #3
class MarkerSetSelection(object):
    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self,
                        genomeIds,
                        ubiquityThreshold=0.97,
                        singleCopyThreshold=0.97,
                        stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""

        # quick escape for lineages that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using LOO testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(
                looGenomeIds, geneCountTable,
                ubiquityThreshold * len(looGenomeIds),
                singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove

            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(
                    looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))
        return (float(mean(diffMarkerSet)) /
                mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
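        """Compute the patristic (along-branch) distance between two leaf nodes via their most recent common ancestor."""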
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:

            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves,
                                   percentileTest):
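        """Check whether the genome's distance to the marker-set node is below the given percentile of leaf-to-node distances."""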

        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata,
                              taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""

        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node
            if curNode == None:
                # reach root so use universal marker set
                uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds,
                       queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId,
                                                     metadata,
                                                     taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[
                testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(
                testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected,
                          dCompSelected, dContSelected, bestUID,
                          numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        fout = open('./experiments/markerSetSelection.tsv', 'w')

        fout.write(
            'Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n'
        )

        itemsToProcess = 0

        dComps = []
        dConts = []

        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []

        selectedComp = []
        selectedCont = []

        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(
                block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test genomes.' % (
                itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 /
                (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write(
                '%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n'
                % (testGenomeId, uidSelected, numDescendantsSelected,
                   dCompSelected, dContSelected, bestUID, numDescendantsBest,
                   dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)

            dCompsPer.append(dComp * 100.0 / dCompBest)
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)

            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print('  General results:')
        print('   Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print('   Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print('   Selected comp: %.2f +/- %.2f' %
              (mean(selectedComp), std(selectedComp)))
        print('   Selected cont: %.2f +/- %.2f' %
              (mean(selectedCont), std(selectedCont)))
        print('')
        print('   Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print('   Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print('   Delta comp per error: %.1f +/- %.1f' %
              (mean(dCompsPer), std(dCompsPer)))
        print('   Delta cont per error: %.1f +/- %.1f' %
              (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
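        """Sum branch lengths along the path from a leaf up to the given ancestral node."""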
        dist = 0

        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length

            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
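        """Determine the number of nodes and branch-length distance separating a genome from the node defining its best marker set, along with the mean distance from that node to its other leaves."""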
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length

            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
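        """Summarize properties (descendant count, nodes to bin, distances) of the best marker set for each test genome."""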

        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(
                genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(
                genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest -
                           avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print('    # descendants: %.2f +/- %.2f' %
              (mean(numDescendants), std(numDescendants)))
        print('    # nodes to bin: %.2f +/- %.2f' %
              (mean(nodesToBin), std(nodesToBin)))
        print('    Distance to bin: %.2f +/- %.2f' %
              (mean(distanceToBin), std(distanceToBin)))

        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print('    Distance to bin - average distance to leaf: %.2f +/- %.2f' %
              (mean(abs(distanceToBin - avgDistanceToLeaf)),
               std(abs(distanceToBin - avgDistanceToLeaf))))
        print(
            '    Percent difference to average leaf distance: %.2f +/- %.2f' %
            (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        # read reference tree
        print('\n  Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                'genome_tree', 'genome_tree_prok.refpkg',
                                'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           as_rooted=True,
                                           preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print('  Reading simulation results.')

        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[
                    2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print('  Evaluating %d test genomes.' % len(simResults))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(target=self.__workerThread,
                       args=(tree, simResults, metadata, taxonToGenomeIds,
                             workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()
class Simulation(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
        
        self.simContigLen = 10000
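        # simContigLen: fixed contig length (in bp) used when fragmenting genomes into simulated partial genomes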
        
    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node."""
        
        # get genomes descendant from each child of the specified internal node
        leaves = []
        for child in internalNode.child_nodes(): 
            genomeIds = set()  
            for leaf in child.leaf_nodes():
                genomeId = leaf.taxon.label.replace('IMG_', '')
                genomeIds.add(genomeId)
                
                duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    dupId = dup.replace('IMG_', '')
                    genomeIds.add(dupId)
                 
            leaves.append(genomeIds)
            
        # make sure each set of leaves contains at least a minimum number of genomes
        orderedLeaves = sorted(leaves, key=len)
        if len(orderedLeaves[0]) < 5:
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return
                   
        # calculate marker genes with all genomes except those in the child lineage with the fewest genomes
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = orderedLeaves[0])
        
        # evaluate accuracy of completeness and contamination estimates on partial genomes drawn from the lineage with the fewest genomes
        testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100))    
        
        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)
        
        for testGenomeId in testGenomeIds:   
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            
            repsPerGenome = 100
            for _ in xrange(0, repsPerGenome): 
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)   
      
                for ms in binMarkerGenes.markerSetIter():  
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)      
                    if completeness == 0.0:
                        print ms.getMarkerGenes()
                        print geneDistTable[testGenomeId]
                        print startPartialGenomeContigs
                        print genomeSize
                        print '*****************' + testGenomeId
                        sys.exit()
                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)
            
        # determine lineage-specific marker set with best average performance
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        
        for lineageStr in deltaComp:
            dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr])))

            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(abs(array(deltaComp[lineageStr])))
                dContStdBest = std(abs(array(deltaCont[lineageStr])))
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))
                        
    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode == None:
                break
            
            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)      
                      
    def __writerThread(self, numInternalNodes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        fout = open('/tmp/simInferBestMarkerSet.tsv', 'w')
        fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

        itemsProcessed = 0
        while True:
            internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
            if internalNode == None:
                break
            
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            if internalNode != 'NA':
                fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest)) 
            
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        random.seed(0)
          
        print '\n  Calculating global gene count table.'
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())
          
        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
            
        print '  Evaluating %d internal nodes.' % len(tree.internal_nodes())
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for internalNode in tree.internal_nodes():
            if internalNode.parent_node != None:
                workerQueue.put(internalNode)

        for _ in range(numThreads):
            workerQueue.put(None)

        metadata = self.img.genomeMetadata()
        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(tree.internal_nodes())-1, writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
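
A minimal sketch of how the Simulation class above might be driven; the threshold and thread values are illustrative assumptions, only the run() signature is taken from the code.

# hypothetical driver; parameter values are illustrative only
simulation = Simulation()
simulation.run(ubiquityThreshold=0.97, singleCopyThreshold=0.97, numThreads=16)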
class MarkerSetStability(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove
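                # measure marker-set change when subsampling 50-100% of the lineage's genomes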

                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(
                        float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(
                            genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(
                            subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(
                            subsetGenomeIds, geneCountTable,
                            ubiquityThreshold * numGenomesToSelect,
                            singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                            subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        perChange.append(
                            float(
                                len(
                                    markerGenes.symmetric_difference(
                                        subsetMarkerGenes))) * 100.0 /
                            len(markerGenes))

                    changeMarkerSetSize[selectPer] = [
                        mean(perChange), std(perChange)
                    ]

            queueOut.put((lineage, len(genomeIds), len(markerGenes),
                          changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""

        fout = open(outputFile, 'w')
        fout.write(
            'Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n'
        )

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(
                block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                           (lineage, numGenomes, numMarkerGenes, selectPer,
                            changeMarkerSetSize[selectPer][0],
                            changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')

        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""

        print('  Calculating stability of marker sets:')

        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']

        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
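
A minimal sketch of how MarkerSetStability might be invoked; the parameter values are illustrative assumptions, only the run() signature is taken from the code above.

# hypothetical driver; parameter values are illustrative only
stability = MarkerSetStability()
stability.run('./marker_set_stability.tsv',
              ubiquityThreshold=0.97,
              singleCopyThreshold=0.97,
              minGenomes=20,
              mostSpecificRank=5,
              numThreads=16)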
Example #6
    def run(self, inputMetadataFile, outputMetadataFile, outputDir,
            ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination):
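        """Identify trusted genomes (high completeness, low contamination) for each domain and write filtered genome lists, metadata, and lineage statistics."""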
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
        allOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
        trustedOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'),
                           'w')
        filteredOut.write(
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
        )

        metadataOut = open(outputMetadataFile, 'w')

        # read input metadata file
        metadata = img.genomeMetadataFromFile(inputMetadataFile)

        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)

        metadataLine = {}

        bHeader = True
        for line in open(inputMetadataFile):
            if bHeader:
                metadataOut.write(line)
                bHeader = False
                continue

            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            domain = lineSplit[1]
            status = lineSplit[2]

            if status == 'Finished':
                finishedGenomes[domain].add(genomeId)

            allGenomes[domain].add(genomeId)
            metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        for lineage, allLineageGenomeIds in allGenomes.items():
            print('[' + lineage + ']')
            print('  Number of genomes: %d' % len(allLineageGenomeIds))

            # tabulate genomes from each phylum
            allPhylumCounts = {}
            for genomeId in allLineageGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

            # identify marker genes for finished genomes
            print(
                '\nDetermining initial marker gene sets for genome filtering.')
            markerSet = markerSetBuilder.buildMarkerSet(
                finishedGenomes[lineage], ubiquityThreshold,
                singleCopyThreshold)

            print(
                '  Marker set consists of %s marker genes organized into %d sets.'
                % (markerSet.numMarkers(), markerSet.numSets()))
            fout = open(
                os.path.join(outputDir,
                             'trusted_marker_sets_' + lineage + '.txt'), 'w')
            fout.write(str(markerSet.markerSet))
            fout.close()

            # identifying trusted genomes (highly complete, low contamination genomes)
            print('\nIdentifying highly complete, low contamination genomes.')
            trustedGenomeIds = set()
            filteredGenomes = set()
            retainedStatus = {}
            filteredStatus = {}
            geneCountTable = img.geneCountTable(allLineageGenomeIds)
            for genomeId in allLineageGenomeIds:
                completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                    markerSet.markerSet, genomeId, geneCountTable)

                genomeStr = self.__genomeString(genomeId, metadata,
                                                completeness, contamination,
                                                missingMarkers,
                                                duplicateMarkers)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    retainedStatus[metadata[genomeId]
                                   ['status']] = retainedStatus.get(
                                       metadata[genomeId]['status'], 0) + 1

                    trustedOut.write(genomeStr)
                    allOut.write(genomeStr)

                    metadataOut.write(metadataLine[genomeId])
                else:
                    filteredGenomes.add(genomeId)
                    filteredStatus[metadata[genomeId]
                                   ['status']] = filteredStatus.get(
                                       metadata[genomeId]['status'], 0) + 1

                    filteredOut.write(genomeStr)
                    allOut.write(genomeStr)

            print('  Filtered genomes: %d (%.2f%%)' %
                  (len(filteredGenomes),
                   len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
            print('  ' + str(filteredStatus))
            print('  \nTrusted genomes: %d (%.2f%%)' %
                  (len(trustedGenomeIds),
                   len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
            print('  ' + str(retainedStatus))

            # tabulate trusted genomes by phylum
            print('\nTrusted genomes by phylum:')
            trustedPhylumCounts = {}
            for genomeId in trustedGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon,
                                                                     0) + 1

            for phylum, count in allPhylumCounts.items():
                print('  ' + phylum + ': %d of %d' %
                      (trustedPhylumCounts.get(phylum, 0), count))
            print('')

        allOut.close()
        trustedOut.close()
        filteredOut.close()
        metadataOut.close()

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' +
                       str(trustedStats.get(lineage, 0)) + '\n')
        fout.close()
class MarkerSetStabilityTest(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
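            # resample 90% of the lineage's genomes 100 times and measure marker-set change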
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)

                markerGenes = markerGenes - tigrToRemove

                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds,
                                                    numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(
                        subsetGenomeIds, geneCountTable,
                        ubiquityThreshold * numGenomesToSelect,
                        singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                        subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    perChange.append(
                        float(
                            len(
                                markerGenes.symmetric_difference(
                                    subsetMarkerGenes))) * 100.0 /
                        len(markerGenes))

            if perChange != []:
                queueOut.put(
                    (lineage, len(genomeIds), len(markerGenes),
                     numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes),
                              numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""

        fout = open(outputFile, 'w')
        fout.write(
            'Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n'
        )

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(
                block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                       (lineage, numGenomes, numMarkerGenes, numSampledGenomes,
                        meanPerChange, stdPerChange))

        sys.stdout.write('\n')

        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""

        print('  Testing stability of marker sets:')

        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
class MarkerSetStability(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None) 
            if lineage == None:
                break  
            
            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')
            
            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:  
                # calculate marker set for all genomes in lineage          
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove
     
                for selectPer in xrange(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer)/100 * len(genomeIds))
                    perChange = []
                    for _ in xrange(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold*numGenomesToSelect, singleCopyThreshold*numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove
    
                        perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes)))*100.0 / len(markerGenes))
    
                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]  

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break
                    
            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages)*100/totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            for selectPer in sorted(changeMarkerSetSize.keys()): 
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
            
        fout.close()
        
        
    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""  
        
        print '  Calculating stability of marker sets:'
        
        random.seed(1)
        
        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)
        
        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']
        
        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)
 
        calcProc = [mp.Process(target = self.__processLineage, args = (metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__storeResults, args = (outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
Example #9
    def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
        allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
        trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
        filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

        metadataOut = open(outputMetadataFile, 'w')
        
        # read input metadata file
        metadata = img.genomeMetadataFromFile(inputMetadataFile)
        
        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)
        
        metadataLine = {}
        
        bHeader = True
        for line in open(inputMetadataFile):
            if bHeader:
                metadataOut.write(line)
                bHeader = False
                continue
            
            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            domain = lineSplit[1]
            status = lineSplit[2]
            
            if status == 'Finished':
                finishedGenomes[domain].add(genomeId)
            
            allGenomes[domain].add(genomeId)
            metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        for lineage, allLineageGenomeIds in allGenomes.iteritems():
            print '[' + lineage + ']'
            print '  Number of genomes: %d' % len(allLineageGenomeIds)

            # tabulate genomes from each phylum
            allPhylumCounts = {}
            for genomeId in allLineageGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

            # identify marker genes for finished genomes
            print '\nDetermining initial marker gene sets for genome filtering.'
            markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

            print '  Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets())
            fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
            fout.write(str(markerSet.markerSet))
            fout.close()

            # identifying trusted genomes (highly complete, low contamination genomes)
            print '\nIdentifying highly complete, low contamination genomes.'
            trustedGenomeIds = set()
            filteredGenomes = set()
            retainedStatus = {}
            filteredStatus = {}
            geneCountTable = img.geneCountTable(allLineageGenomeIds)
            for genomeId in allLineageGenomeIds:
                completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)
                
                genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                    trustedOut.write(genomeStr)
                    allOut.write(genomeStr)
                    
                    metadataOut.write(metadataLine[genomeId])
                else:
                    filteredGenomes.add(genomeId)
                    filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                    filteredOut.write(genomeStr)
                    allOut.write(genomeStr)

            print '  Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds))
            print '  ' + str(filteredStatus)
            print '  \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds))
            print '  ' + str(retainedStatus)

            # tabulate trusted genomes by phylum
            print '\nTrusted genomes by phylum:'
            trustedPhylumCounts = {}
            for genomeId in trustedGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

            for phylum, count in allPhylumCounts.iteritems():
                print '  ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count)
            print ''

        allOut.close()
        trustedOut.close()
        filteredOut.close()
        metadataOut.close()

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in xrange(0, 6): # Domain to Genus
            for genomeId, data in metadata.iteritems():
                taxaStr = ';'.join(data['taxonomy'][0:r+1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n')
        fout.close()
class MarkerSetStabilityTest(object):
    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None) 
            if lineage == None:
                break  
            
            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')
            
            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9*len(genomeIds))
            if len(genomeIds) >= minGenomes:  
                # calculate marker set for all genomes in lineage          
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)

                markerGenes = markerGenes - tigrToRemove

                for _ in xrange(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold*numGenomesToSelect, singleCopyThreshold*numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes)))*100.0 / len(markerGenes))

            if perChange != []:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))
                
    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break
                    
            numProcessedLineages += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages)*100/totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

        sys.stdout.write('\n')
            
        fout.close()
        
        
    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""  
        
        print '  Testing stability of marker sets:'
        
        random.seed(1)
        
        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(numThreads):
            workerQueue.put(None)
 
        calcProc = [mp.Process(target = self.__processLineage, args = (metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__storeResults, args = (outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
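

# Hedged, illustrative sketch (not part of the original code) of how the class above
# might be driven from a small script; the output path and parameter values below are
# assumptions chosen for illustration, not values taken from this codebase.
#
#   stabilityTest = MarkerSetStabilityTest()
#   stabilityTest.run('./experiments/markerSetStability.tsv',
#                     ubiquityThreshold=0.97, singleCopyThreshold=0.97,
#                     minGenomes=20, mostSpecificRank=5, numThreads=16)

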
class MarkerSetSelection(object):
    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        self.looRank = 5
        
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()
        
    def __stabilityTest(self, genomeIds, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97, stabilityThreshold = 0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        
        # quick escape for lineages that are clearly stable
        if len(genomeIds) > 200:
            return True
        
        # calculate marker sets using LOO testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])
            
            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold*len(looGenomeIds), singleCopyThreshold*len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove
            
            looMarkerGenes.append(markerGenes)
            
        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in xrange(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in xrange(i+1, len(looMarkerGenes)):     
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))
                            
        print len(genomeIds), mean(diffMarkerSet), mean(markerSetSize)
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold
        
    def __patristicDist(self, tree, taxa1, taxa2):
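        """Compute the patristic distance (sum of branch lengths) between two leaf nodes via their most recent common ancestor."""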
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])
        
        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:                  
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node
                
            return dist
        
    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
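        """Check whether the genome's distance to the marker set node is below the given percentile of the distances from the node's leaves to that node."""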
        
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)
        
        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))          
               
        return distToBin < percentile(distToLeaves, percentileTest)
     
    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine the lineage-specific marker set to use for assessing the given genome."""
        
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()
        
        # determine location of genome in tree     
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            
            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))
                
                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))
                          
            # remove genome (LOO-style analysis)
            print 'Full:', len(genomeIds)
            genomeIds.difference_update([genomeId])
            print 'LOO:', len(genomeIds)
            
            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon]) 
            print 'Rank reduced:', len(genomeIds)
              
            print uniqueId
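            # accept the first ancestral node with more than 10 genomes remaining and a stable marker set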
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break
                
            curNode = curNode.parent_node
            if curNode == None:
                # reached the root, so use the universal marker set
                uidSelected = uniqueId
            
        return uidSelected
    
    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        curBest = 1000
        bestUID = None
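        # the best marker set is the one whose simulated results minimize dComp + dCont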
        for uid, results in simResults[genomeId].iteritems():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont
                
        return bestUID, numDescendantsBest, dCompBest, dContBest
        
    
    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break
            
            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]
            
            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))
                      
    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')
        
        itemsToProcess = 0
        
        dComps = []
        dConts = []
        
        dCompsPer = []
        dContsPer = []
        
        bestComp = []
        bestCont = []
        
        selectedComp = []
        selectedCont = []
        
        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess)*100/(numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)
            
            # guard against division by zero in the same way as for dCont
            dCompsPer.append(dComp*100.0 / max(dCompBest, 0.01))
            dContsPer.append(dCont*100.0 / max(dContBest, 0.01))
            
            bestComp.append(dCompBest)
            bestCont.append(dContBest)
            
            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()
        
        print ''
        print '  General results:'
        print '   Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp))
        print '   Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont))
        print '   Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp))
        print '   Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont))
        print ''
        print '   Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps))
        print '   Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts))
        print '   Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer))
        print '   Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer))
        
    def __distanceToAncestor(self, leaf, ancestor):
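        """Sum branch lengths from a leaf up to the specified ancestral node."""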
        dist = 0
        
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            
            curNode = curNode.parent_node
        
        return dist
        
    def __bestNodeProperties(self, genomeId, tree, bestUID):
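        """Report the number of nodes and branch-length distance from a genome to its best marker set node, along with the mean distance from that node to its other leaves."""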
        # determine location of genome in tree     
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            
            nodesToBin += 1  
            
            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break
            
            distanceToBin += curNode.edge_length
            
            curNode = curNode.parent_node
            
        return nodesToBin, distanceToBin, mean(distanceToLeaves)
        
    def __propertiesOfBestMarkerSets(self, tree, simResults):
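        """Summarize tree properties (descendants, nodes to bin, branch-length distances) of the best marker set for each test genome."""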
        
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)
            
            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)
            
            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)
            
        print '    # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants))
        print '    # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin))
        print '    Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin))
        
        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print '    Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf)))
        print '    Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs))
        print ''

    def run(self, numThreads):
        # read reference tree
        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        
        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print '  Reading simulation results.'
        
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
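                # index results by a composite simulation identifier (columns 0-3) and the marker-set node UID (column 5)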
                lineSplit = line.split('\t')
                
                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])
                
                simResults[simId][uid] = [numDescendants, comp, cont]
                
        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)
                        
        print '  Evaluating %d test genomes.' % len(simResults)
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()