コード例 #1
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            percentCompletion, numReplicates, numGenomes, contigLen):
        """Plot completion-estimate error for marker genes vs. marker sets.

        For every selected genome, ``numReplicates`` partial genomes are
        drawn with ``img.sampleGenome`` and completion is estimated twice:
        as the fraction of marker genes recovered, and as the mean recovered
        fraction over the co-located marker sets. ``percentCompletion`` is
        subtracted from each estimate and the differences are box-plotted to
        ``./images/sim.MGvsMS.<taxonomy>.<percentCompletion>.errorbar.png``.

        Args:
            taxonomyStr: ';'-separated lineage handed to
                ``img.genomeIdsByTaxonomy``.
            ubiquityThreshold: multiplied by the number of genomes and passed
                to ``img.markerGenes``.
            singleCopyThreshold: multiplied by the number of genomes and
                passed to ``img.markerGenes``.
            percentCompletion: target completion passed to
                ``img.sampleGenome``; it is subtracted from ratios in
                [0, 1], so presumably a fraction -- TODO confirm.
            numReplicates: number of simulated partial genomes per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome in the lineage.
            contigLen: contig length passed to ``img.sampleGenome`` and
                ``img.containedMarkerGenes``.

        Raises:
            ZeroDivisionError: if no marker genes or no co-located sets are
                found for the lineage.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) +
              ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable,
                                      ubiquityThreshold * len(genomeIds),
                                      singleCopyThreshold * len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() only accepts sequences on Python 3.11+; the
            # genome ids may be a set, so materialise them first.
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(
                    markerGenes, geneDistTable[genomeId],
                    startPartialGenomeContigs, contigLen)
                mgCompletion.append(
                    float(len(containedMarkerGenes)) / len(markerGenes) -
                    percentCompletion)

                # calculate completion with marker set: every co-located set
                # contributes the fraction of its genes that were recovered
                comp = 0.0
                for markerSet in colocatedSets:
                    present = 0
                    for markerGene in markerSet:
                        if markerGene in containedMarkerGenes:
                            present += 1

                    comp += float(present) / len(markerSet)
                msCompletion.append(comp / len(colocatedSets) -
                                    percentCompletion)

            # two series per genome: marker genes first, then marker sets
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(
                metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # only the first series of each pair carries a visible label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
コード例 #2
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers):
        """Box-plot the leave-one-out change in marker set size per lineage.

        For 'prokaryotes' and every lineage returned by
        ``img.lineagesByCriteria``, the marker set is computed on the trusted
        genomes and then recomputed with each genome left out in turn. The
        differences in marker set size are printed and plotted to
        ``./images/LOO.<ubiquity>-<singleCopy>.boxplot.png``. Lineages whose
        full marker set has fewer than ``minMarkers`` genes are skipped.
        """
        for caption, value in (('Ubiquity threshold: ', ubiquityThreshold),
                               ('Single-copy threshold: ', singleCopyThreshold),
                               ('Min. genomes: ', minGenomes),
                               ('Most specific taxonomic rank: ', mostSpecificRank)):
            print(caption + str(value))

        img = IMG()

        lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes,
                                                            mostSpecificRank)

        allDeltas = []
        labels = []
        for lineage in lineages:
            lineageIds = img.genomeIdsByTaxonomy(lineage)
            trustedIds = img.trustedGenomes()
            genomeIds = list(lineageIds.intersection(trustedIds))
            genomeCount = len(genomeIds)

            print('')
            print('Lineage ' + lineage + ' contains ' + str(genomeCount) +
                  ' genomes.')

            # pre-filter the PFAM table with relaxed (90%) thresholds to drop
            # PFAMs that cannot pass the ubiquity and single-copy criteria
            pfamTable = img.filterPfamTable(genomeIds,
                                            img.pfamTable(genomeIds),
                                            ubiquityThreshold * 0.9,
                                            singleCopyThreshold * 0.9)

            # the reference marker set uses the leave-one-out genome count
            looCount = genomeCount - 1
            fullMarkerSetSize = len(
                img.markerGenes(genomeIds, pfamTable,
                                ubiquityThreshold * looCount,
                                singleCopyThreshold * looCount))

            if fullMarkerSetSize < minMarkers:
                continue

            shortName = lineage.split(';')[-1].strip()
            labels.append(shortName + ' (' + str(genomeCount) + ', ' +
                          str(fullMarkerSetSize) + ')')

            deltas = []
            for leftOut in range(genomeCount):
                # slicing past the end yields [], so the last index needs no
                # special case
                subset = genomeIds[:leftOut] + genomeIds[leftOut + 1:]

                looSize = len(
                    img.markerGenes(subset, pfamTable,
                                    ubiquityThreshold * len(subset),
                                    singleCopyThreshold * len(subset)))
                deltas.append(fullMarkerSetSize - looSize)

                if looSize > fullMarkerSetSize:
                    print('[Warning] Unexpected!')

            allDeltas.append(deltas)

            deltaMean = mean(deltas)
            deltaStd = std(deltas)

            print('  LOO Ubiquity >= ' +
                  str(int(ubiquityThreshold * looCount)) +
                  ', LOO Single-copy >= ' +
                  str(int(singleCopyThreshold * looCount)))
            print('  Delta Mean: %.2f +/- %.2f' % (deltaMean, deltaStd))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltas), max(deltas)))

        # plot data
        plotFilename = ('./images/LOO.' + str(ubiquityThreshold) + '-' +
                        str(singleCopyThreshold) + '.boxplot.png')
        title = ('Ubiquity = %.2f' % ubiquityThreshold) + (
            ', Single-copy = %.2f' % singleCopyThreshold)
        boxPlot = BoxPlot()
        boxPlot.plot(plotFilename, allDeltas, labels,
                     r'$\Delta$' + ' Marker Set Size', '', False, title)
コード例 #3
0
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
            singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
            contigLen):
        """Plot completion-estimate error for marker sets of nested lineages.

        ``taxonomyStr`` is expanded into every prefix lineage (first rank,
        first two ranks, ...). Marker genes and co-located marker sets are
        built for each lineage. For every selected genome,
        ``numReplicates`` partial genomes are simulated and completion is
        estimated with each lineage's marker sets. ``percentCompletion`` is
        subtracted from each estimate and the differences are box-plotted to
        ``./images/sim.lineages.<taxonomy>.<percentCompletion>.errorbar.png``
        with one series per (genome, lineage) pair.

        Args:
            taxonomyStr: ';'-separated lineage.
            mostSpecificRank: not used by this method.
            minGenomes: not used by this method.
            ubiquityThreshold: multiplied by the number of genomes and passed
                to ``img.markerGenes``.
            singleCopyThreshold: multiplied by the number of genomes and
                passed to ``img.markerGenes``.
            percentCompletion: target completion passed to
                ``img.sampleGenome``; it is subtracted from ratios in
                [0, 1], so presumably a fraction -- TODO confirm.
            numReplicates: number of simulated partial genomes per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome of the most specific lineage.
            contigLen: contig length passed to ``img.sampleGenome`` and
                ``img.containedMarkerGenes``.

        Raises:
            ZeroDivisionError: if a lineage has no co-located marker sets.
        """
        img = IMG()

        # every prefix of the taxonomy string is a lineage of its own
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable,
                                 ubiquityThreshold * len(genomeIds),
                                 singleCopyThreshold * len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still refers to the last, i.e.
        # most specific, lineage processed above
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() only accepts sequences on Python 3.11+; the
            # genome ids may be a set, so materialise them first.
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using the marker sets of every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with marker set
                for i in range(len(lineages)):
                    containedMarkerGenes = img.containedMarkerGenes(
                        markerGenes[i], geneDistTable[i][genomeId],
                        startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for markerSet in colocatedSets[i]:
                        present = 0
                        for markerGene in markerSet:
                            if markerGene in containedMarkerGenes:
                                present += 1

                        comp += float(present) / len(markerSet)

                    completion[i].append(comp / len(colocatedSets[i]) -
                                         percentCompletion)

            # exactly one label per data series; labelling inside the
            # replicate loop would leave labels and data misaligned
            for lineage, lineageCompletion in zip(lineages, completion):
                plotData.append(lineageCompletion)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
コード例 #4
0
ファイル: simComparePlots.py プロジェクト: sufforest/SolidBin
 def taxonomicPlots(self, results):
     """Tabulate and plot completeness/contamination error per taxon.

     Only simulations with sequence length '20000', completeness in
     {0.5, 0.7, 0.8, 0.9} and contamination in {0.05, 0.1, 0.15} are used.
     Their results are grouped by the taxa found at the first three ranks
     of each genome's taxonomy. A summary is printed, a tab-separated table
     is written to ``self.simCompareTaxonomyTableOut`` and a box plot is
     saved to ``self.plotPrefix + '.taxonomy.png'``.

     Args:
         results: mapping from a simulation id of the form
             'genomeId-seqLen-comp-cont' to an indexable record. Entries
             2/3, 6/7 and 10/11 are lists holding the completeness /
             contamination values reported here as 'best', 'domain' and
             'selected' respectively.
     """
     # summarize results for different taxonomic groups
     print('  Tabulating results for taxonomic groups.')

     metadata = self.img.genomeMetadata()

     itemsProcessed = 0
     compDataDict = defaultdict(lambda: defaultdict(list))
     contDataDict = defaultdict(lambda: defaultdict(list))
     comps = set()
     conts = set()

     ranksToProcess = 3
     taxaByRank = [set() for _ in range(ranksToProcess)]

     overallComp = []
     overallCont = []

     genomeInTaxon = defaultdict(set)
     testCases = 0
     for simId in results:
         itemsProcessed += 1
         statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
         sys.stdout.write('%s\r' % statusStr)
         sys.stdout.flush()

         genomeId, seqLen, comp, cont = simId.split('-')

         if seqLen != '20000':
             continue

         # normalise through float() so e.g. '0.50' and '0.5' compare equal
         if str(float(comp)) in ['0.5', '0.7', '0.8', '0.9'] and str(float(cont)) in ['0.05', '0.10', '0.1', '0.15']:
             print(comp, cont)
             taxonomy = metadata[genomeId]['taxonomy']

             testCases += 1

             comps.add(float(comp))
             conts.add(float(cont))

             overallComp += results[simId][10]
             overallCont += results[simId][11]

             for r in range(ranksToProcess):
                 taxon = taxonomy[r]

                 if r == 0 and taxon == 'unclassified':
                     print('*****************************Unclassified at domain-level*****************')
                     continue

                 if taxon == 'unclassified':
                     continue

                 taxon = rankPrefixes[r] + taxon

                 taxaByRank[r].add(taxon)

                 compDataDict[taxon]['best'] += results[simId][2]
                 compDataDict[taxon]['domain'] += results[simId][6]
                 compDataDict[taxon]['selected'] += results[simId][10]

                 contDataDict[taxon]['best'] += results[simId][3]
                 contDataDict[taxon]['domain'] += results[simId][7]
                 contDataDict[taxon]['selected'] += results[simId][11]

                 genomeInTaxon[taxon].add(genomeId)

     sys.stdout.write('\n')

     print('Test cases', testCases)

     print('')
     print('Creating plots for:')
     print('  comps = ', comps)
     print('  conts = ', conts)

     print('')
     print('    There are %d taxa.' % (len(compDataDict)))

     print('')
     print('  Overall bias:')
     print('    Selected comp: %.2f' % mean(overallComp))
     print('    Selected cont: %.2f' % mean(overallCont))

     # get list of ordered taxa by rank
     orderedTaxa = []
     for taxa in taxaByRank:
         orderedTaxa += sorted(taxa)

     # plot data
     print('  Plotting results.')
     compData = []
     contData = []
     rowLabels = []
     for taxon in orderedTaxa:
         numGenomes = len(genomeInTaxon[taxon])
         if numGenomes < 10:  # skip groups with only a few genomes
             continue

         for msStr in ['best', 'selected', 'domain']:
             rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
             compData.append(compDataDict[taxon][msStr])
             contData.append(contDataDict[taxon][msStr])

     for i, rowLabel in enumerate(rowLabels):
         print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))

     # print taxonomic table of results organized by class; the context
     # manager closes the file even if a write or a statistic fails
     with open(self.simCompareTaxonomyTableOut, 'w') as taxonomyTableOut:
         for taxon in orderedTaxa:
             numGenomes = len(genomeInTaxon[taxon])
             if numGenomes < 2:  # skip groups with only a few genomes
                 continue

             taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
             for msStr in ['domain', 'selected']:
                 absComp = abs(array(compDataDict[taxon][msStr]))
                 absCont = abs(array(contDataDict[taxon][msStr]))

                 taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (mean(absComp), std(absComp), mean(absCont), std(absCont)))
             taxonomyTableOut.write('\n')

     # create box plot
     boxPlot = BoxPlot()
     plotFilename = self.plotPrefix + '.taxonomy.png'
     boxPlot.plot(plotFilename, compData, contData, rowLabels,
                     r'$\Delta$' + ' % Completion', None,
                     r'$\Delta$' + ' % Contamination', None,
                     rowsPerCategory = 3, dpi = self.dpi)
コード例 #5
0
ファイル: simComparePlots.py プロジェクト: sufforest/SolidBin
    def refinementPlots(self, results):
        """Tabulate and plot error for the IM, MS and RMS refinements.

        Simulations with completeness below 0.7 or contamination above 0.1
        are ignored. The remaining results are grouped by the taxa found at
        the first three ranks of each genome's taxonomy. Overall percentage
        changes are printed, a tab-separated table is written to
        ``self.simCompareRefinementTableOut`` and a box plot is saved to
        ``self.plotPrefix + '.refinements.png'``.

        Args:
            results: mapping from a simulation id of the form
                'genomeId-seqLen-comp-cont' to an indexable record. Entries
                8/9, 10/11 and 12/13 are lists holding the completeness /
                contamination values reported here as 'IM', 'MS' and 'RMS'
                respectively.
        """
        # summarize results for different CheckM refinements
        print('  Tabulating results for different refinements.')

        metadata = self.img.genomeMetadata()

        itemsProcessed = 0
        compDataDict = defaultdict(lambda: defaultdict(list))
        contDataDict = defaultdict(lambda: defaultdict(list))
        comps = set()
        conts = set()
        # never read afterwards; kept because int() still rejects malformed ids
        seqLens = set()

        ranksToProcess = 3
        taxaByRank = [set() for _ in range(ranksToProcess)]

        overallCompIM = []
        overallContIM = []

        overallCompMS = []
        overallContMS = []

        overallCompRMS = []
        overallContRMS = []

        genomeInTaxon = defaultdict(set)

        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            genomeId, seqLen, comp, cont = simId.split('-')
            taxonomy = metadata[genomeId]['taxonomy']

            if float(comp) < 0.7 or float(cont) > 0.1:
                continue

            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))

            # extend (not append): each entry is itself a list of values, and
            # a flat list keeps array() well-defined even if the per-simulation
            # lists differ in length
            overallCompIM.extend(results[simId][8])
            overallContIM.extend(results[simId][9])

            overallCompMS.extend(results[simId][10])
            overallContMS.extend(results[simId][11])

            overallCompRMS.extend(results[simId][12])
            overallContRMS.extend(results[simId][13])

            for r in range(ranksToProcess):
                taxon = taxonomy[r]

                if taxon == 'unclassified':
                    continue

                taxaByRank[r].add(taxon)

                compDataDict[taxon]['IM'] += results[simId][8]
                compDataDict[taxon]['MS'] += results[simId][10]
                compDataDict[taxon]['RMS'] += results[simId][12]

                contDataDict[taxon]['IM'] += results[simId][9]
                contDataDict[taxon]['MS'] += results[simId][11]
                contDataDict[taxon]['RMS'] += results[simId][13]

                genomeInTaxon[taxon].add(genomeId)

        sys.stdout.write('\n')

        print('Creating plots for:')
        print('  comps = ', comps)
        print('  conts = ', conts)

        meanCompIM = mean(abs(array(overallCompIM)))
        meanContIM = mean(abs(array(overallContIM)))
        meanCompMS = mean(abs(array(overallCompMS)))
        meanContMS = mean(abs(array(overallContMS)))
        meanCompRMS = mean(abs(array(overallCompRMS)))
        meanContRMS = mean(abs(array(overallContRMS)))

        print('')
        print('    There are %d taxon.' % (len(compDataDict)))
        print('')
        print('Percentage change MS-IM comp: %.4f' % ((meanCompMS - meanCompIM) * 100 / meanCompIM))
        print('Percentage change MS-IM cont: %.4f' % ((meanContMS - meanContIM) * 100 / meanContIM))
        print('')
        # NOTE(review): the RMS-MS change is expressed relative to the IM
        # mean, not the MS mean -- confirm this baseline is intended.
        print('Percentage change RMS-MS comp: %.4f' % ((meanCompRMS - meanCompMS) * 100 / meanCompIM))
        print('Percentage change RMS-MS cont: %.4f' % ((meanContRMS - meanContMS) * 100 / meanContIM))

        print('')

        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)

        # print table of results organized by class; the context manager
        # closes the file even if a write or a statistic fails
        with open(self.simCompareRefinementTableOut, 'w') as refinmentTableOut:
            for taxon in orderedTaxa:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 2:  # skip groups with only a few genomes
                    continue

                refinmentTableOut.write(taxon + '\t' + str(numGenomes))
                taxonMeanComp = {}
                taxonMeanCont = {}
                for refineStr in ['IM', 'MS']:
                    absComp = abs(array(compDataDict[taxon][refineStr]))
                    absCont = abs(array(contDataDict[taxon][refineStr]))
                    taxonMeanComp[refineStr] = mean(absComp)
                    taxonMeanCont[refineStr] = mean(absCont)

                    refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (taxonMeanComp[refineStr], std(absComp), taxonMeanCont[refineStr], std(absCont)))

                # relative reduction in mean absolute error going from IM to MS
                perCompChange = (taxonMeanComp['IM'] - taxonMeanComp['MS']) * 100 / taxonMeanComp['IM']
                perContChange = (taxonMeanCont['IM'] - taxonMeanCont['MS']) * 100 / taxonMeanCont['IM']
                refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))

        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 10:  # skip groups with only a few genomes
                continue

            for refineStr in ['RMS', 'MS', 'IM']:
                rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][refineStr])
                contData.append(contDataDict[taxon][refineStr])

        for i, rowLabel in enumerate(rowLabels):
            print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.refinements.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels,
                        r'$\Delta$' + ' % Completion', None,
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
コード例 #6
0
ファイル: simComparePlots.py プロジェクト: sufforest/SolidBin
    def markerSets(self, results):
        """Compare individual markers (IM) with marker sets (MS) per condition.

        Results are grouped by experimental condition (completeness,
        contamination, sequence length). A box plot restricted to sequence
        length 20000 and to ``self.compsToConsider`` /
        ``self.contsToConsider`` is saved to
        ``self.plotPrefix + '.markerSets.png'``, and a tab-separated table
        covering sequence lengths 5000, 20000 and 50000 is written to
        ``self.simCompareMarkerSetOut``.

        Args:
            results: mapping from a simulation id of the form
                'genomeId-seqLen-comp-cont' to an indexable record. Entries
                4/5 and 6/7 are lists holding the completeness /
                contamination values reported here as 'IM' and 'MS'
                respectively.
        """
        def absMeanStd(values):
            # mean and standard deviation of the absolute values
            magnitudes = abs(array(values))
            return mean(magnitudes), std(magnitudes)

        def flattened(rows):
            # concatenate per-condition lists so array() never sees a ragged
            # nested sequence
            return [value for row in rows for value in row]

        # summarize results from IM vs MS
        print('  Tabulating results for domain-level marker genes vs marker sets.')

        itemsProcessed = 0
        compDataDict = defaultdict(lambda: defaultdict(list))
        contDataDict = defaultdict(lambda: defaultdict(list))

        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            # normalised key so it matches the str(float)/str(int) keys built below
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

            compDataDict[expCondStr]['IM'] += results[simId][4]
            compDataDict[expCondStr]['MS'] += results[simId][6]

            contDataDict[expCondStr]['IM'] += results[simId][5]
            contDataDict[expCondStr]['MS'] += results[simId][7]

        print('  There are %d unique genomes.' % len(genomeIds))

        sys.stdout.write('\n')

        print('    There are %d experimental conditions.' % (len(compDataDict)))

        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []

        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for seqLen in [20000]:
                    for msStr in ['MS', 'IM']:
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))

                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])

        # rows alternate MS, IM, so even rows are MS and odd rows are IM
        print('MS:\t%.2f\t%.2f' % (mean(abs(array(flattened(compData[0::2])))), mean(abs(array(flattened(contData[0::2]))))))
        print('IM:\t%.2f\t%.2f' % (mean(abs(array(flattened(compData[1::2])))), mean(abs(array(flattened(contData[1::2]))))))

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.markerSets.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels,
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 2, dpi = self.dpi)

        # print table of results
        rowFormat = '\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f'
        tableSeqLens = [5000, 20000, 50000]

        # the context manager closes the file even if a write or a statistic fails
        with open(self.simCompareMarkerSetOut, 'w') as tableOut:
            # NOTE(review): each sequence length gets four cells written in
            # the order comp IM, comp MS, cont IM, cont MS, whereas the
            # header reads 'IM', '', 'MS', '' -- confirm the column order.
            tableOut.write('Comp. (%)\tCont. (%)\tIM (5kb)\t\tMS (5kb)\t\tIM (20kb)\t\tMS (20kb)\t\tIM (50kb)\t\tMS (50kb)\n')

            avgComp = defaultdict(lambda: defaultdict(list))
            avgCont = defaultdict(lambda: defaultdict(list))
            for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
                for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:

                    tableOut.write('%d\t%d' % (comp*100, cont*100))

                    for seqLen in tableSeqLens:
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)

                        meanCompIM, stdCompIM = absMeanStd(compDataDict[expCondStr]['IM'])
                        meanContIM, stdContIM = absMeanStd(contDataDict[expCondStr]['IM'])

                        avgComp[seqLen]['IM'] += compDataDict[expCondStr]['IM']
                        avgCont[seqLen]['IM'] += contDataDict[expCondStr]['IM']

                        meanCompMS, stdCompMS = absMeanStd(compDataDict[expCondStr]['MS'])
                        meanContMS, stdContMS = absMeanStd(contDataDict[expCondStr]['MS'])

                        avgComp[seqLen]['MS'] += compDataDict[expCondStr]['MS']
                        avgCont[seqLen]['MS'] += contDataDict[expCondStr]['MS']

                        tableOut.write(rowFormat % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                    tableOut.write('\n')

            # final row: statistics pooled over all tabulated conditions
            tableOut.write('\tAverage:')
            for seqLen in tableSeqLens:
                meanCompIM, stdCompIM = absMeanStd(avgComp[seqLen]['IM'])
                meanContIM, stdContIM = absMeanStd(avgCont[seqLen]['IM'])

                meanCompMS, stdCompMS = absMeanStd(avgComp[seqLen]['MS'])
                meanContMS, stdContMS = absMeanStd(avgCont[seqLen]['MS'])

                tableOut.write(rowFormat % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))

            tableOut.write('\n')
コード例 #7
0
ファイル: simComparePlots.py プロジェクト: sufforest/SolidBin
    def conditionsPlot(self, results):
        # summarize results for each experimental condition  
        print('  Tabulating results for each experimental condition using marker sets.')
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        compOutliers = defaultdict(list)
        contOutliers = defaultdict(list)
        
        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            compDataDict[expCondStr]['best'] += results[simId][2]
            compDataDict[expCondStr]['domain'] += results[simId][6]
            compDataDict[expCondStr]['selected'] += results[simId][10]
            
            for dComp in results[simId][2]:
                compOutliers[expCondStr] += [[dComp, genomeId]]
            
            contDataDict[expCondStr]['best'] += results[simId][3]
            contDataDict[expCondStr]['domain'] += results[simId][7]
            contDataDict[expCondStr]['selected'] += results[simId][11]
            
            for dCont in results[simId][3]:
                contOutliers[expCondStr] += [[dCont, genomeId]]
                
        print('  There are %d unique genomes.' % len(genomeIds))
              
        sys.stdout.write('\n')
        
        print('    There are %d experimental conditions.' % (len(compDataDict)))
                
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        
        foutComp = open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w')
        foutCont = open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w')
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]: 
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                    
                # report completenes outliers
                foutComp.write(expCondStr)

                compOutliers[expCondStr].sort()
                
                dComps = array([r[0] for r in compOutliers[expCondStr]])
                perc1 = scoreatpercentile(dComps, 1)
                perc99 = scoreatpercentile(dComps, 99)
                print(expCondStr, perc1, perc99)
                
                foutComp.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in compOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutComp.write('\t' + genomeId + ': ' + str(count))
                foutComp.write('\n')
                
                # report contamination outliers
                foutCont.write(expCondStr)

                contOutliers[expCondStr].sort()
                
                dConts = array([r[0] for r in contOutliers[expCondStr]])
                perc1 = scoreatpercentile(dConts, 1)
                perc99 = scoreatpercentile(dConts, 99)
                
                foutCont.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in contOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutCont.write('\t' + genomeId + ': ' + str(count))
                foutCont.write('\n')
                
        foutComp.close()
        foutCont.close()
                               
        print('best:\t%.2f\t%.2f' % (mean(abs(array(compData[0::3]))), mean(abs(array(contData[0::3])))))
        print('selected:\t%.2f\t%.2f' % (mean(abs(array(compData[1::3]))), mean(abs(array(contData[1::3])))))   
        print('domain:\t%.2f\t%.2f' % (mean(abs(array(compData[2::3]))), mean(abs(array(contData[2::3])))))   

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.conditions.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
        
        # print table of results 
        tableOut = open(self.simCompareConditionOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                   
                    meanCompD = mean(abs(array(compDataDict[expCondStr]['domain'])))
                    stdCompD = std(abs(array(compDataDict[expCondStr]['domain'])))
                    meanContD = mean(abs(array(contDataDict[expCondStr]['domain'])))
                    stdContD = std(abs(array(contDataDict[expCondStr]['domain'])))
                    
                    avgComp[seqLen]['domain'] += compDataDict[expCondStr]['domain']
                    avgCont[seqLen]['domain'] += contDataDict[expCondStr]['domain']
                    
                    meanCompS = mean(abs(array(compDataDict[expCondStr]['selected'])))
                    stdCompS = std(abs(array(compDataDict[expCondStr]['selected'])))
                    meanContS = mean(abs(array(contDataDict[expCondStr]['selected'])))
                    stdContS = std(abs(array(contDataDict[expCondStr]['selected'])))
                    
                    avgComp[seqLen]['selected'] += compDataDict[expCondStr]['selected']
                    avgCont[seqLen]['selected'] += contDataDict[expCondStr]['selected']
                    
                    meanCompB = mean(abs(array(compDataDict[expCondStr]['best'])))
                    stdCompB = std(abs(array(compDataDict[expCondStr]['best'])))
                    meanContB = mean(abs(array(contDataDict[expCondStr]['best'])))
                    stdContB = std(abs(array(contDataDict[expCondStr]['best'])))
                    
                    avgComp[seqLen]['best'] += compDataDict[expCondStr]['best']
                    avgCont[seqLen]['best'] += contDataDict[expCondStr]['best']
                    
                    tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompD = mean(abs(array(avgComp[seqLen]['domain'])))
            stdCompD = std(abs(array(avgComp[seqLen]['domain'])))
            meanContD = mean(abs(array(avgCont[seqLen]['domain'])))
            stdContD = std(abs(array(avgCont[seqLen]['domain'])))
            
            meanCompS = mean(abs(array(avgComp[seqLen]['selected'])))
            stdCompS = std(abs(array(avgComp[seqLen]['selected'])))
            meanContS = mean(abs(array(avgCont[seqLen]['selected'])))
            stdContS = std(abs(array(avgCont[seqLen]['selected'])))
            
            meanCompB = mean(abs(array(avgComp[seqLen]['best'])))
            stdCompB = std(abs(array(avgComp[seqLen]['best'])))
            meanContB = mean(abs(array(avgCont[seqLen]['best'])))
            stdContB = std(abs(array(avgCont[seqLen]['best'])))
            
            tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                        
        tableOut.write('\n')     
                
        tableOut.close()
コード例 #8
0
ファイル: markerSetTest.py プロジェクト: sufforest/SolidBin
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            replicates, minGenomes, maxGenomes, stepSize):
        """Report how marker set size varies with the number of genomes used.

        For every genome count in range(minGenomes, maxGenomes, stepSize),
        `replicates` random genome subsets are drawn from the lineage, the
        co-located marker sets are built for each subset and the number of
        sets is recorded. Per-count statistics are printed, then an error-bar
        plot and a box plot are written under ./images/.

        Args:
            taxonomyStr: lineage string handed to IMG.genomeIdsByTaxonomy;
                its ';' characters are replaced when building plot file
                names and titles.
            ubiquityThreshold: multiplied by the subset size to obtain the
                ubiquity cutoff passed to MarkerGenes.identify.
            singleCopyThreshold: multiplied by the subset size to obtain the
                single-copy cutoff passed to MarkerGenes.identify.
            replicates: number of random subsets drawn per genome count.
            minGenomes: first genome count evaluated; the process exits if
                the lineage contains fewer genomes than this.
            maxGenomes: exclusive upper bound on the genome count; -1 or any
                value above the number of available genomes is clamped to
                the number of available genomes.
            stepSize: increment between consecutive genome counts.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) +
              ' genomes.')
        if len(genomeIds) < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            # exit with a failure status so calling scripts can see the error
            sys.exit(1)

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []

        # -1 requests all genomes; never ask for more than are available
        if maxGenomes == -1 or maxGenomes > len(genomeIds):
            maxGenomes = len(genomeIds)

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        # random.sample() only accepts sequences on Python >= 3.11, so take a
        # list copy once in case genomeIds is a set or other non-sequence
        genomeIdPool = list(genomeIds)

        for numGenomes in range(minGenomes, maxGenomes, stepSize):
            # every subset drawn below has exactly numGenomes members, so the
            # cutoffs are the same for all replicates of this genome count
            ubiquityCutoff = ubiquityThreshold * numGenomes
            singleCopyCutoff = singleCopyThreshold * numGenomes

            markerSetSize = []
            for _ in range(0, replicates):
                genomeIdSubset = random.sample(genomeIdPool, numGenomes)

                markerGenes = markergenes.identify(genomeIdSubset, countTable,
                                                   ubiquityCutoff,
                                                   singleCopyCutoff)
                geneDistTable = img.geneDistTable(genomeIdSubset,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            meanMarkerSetSize.append(m)

            s = std(markerSetSize)
            stdMarkerSetSize.append(s)

            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' +
                  str(int(ubiquityCutoff)) + ', Single-copy > ' +
                  str(int(singleCopyCutoff)))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' %
                  (min(markerSetSize), max(markerSetSize)))

        # plot data
        plotPrefix = ('./images/markerset.' + taxonomyStr.replace(';', '_') +
                      '.' + str(ubiquityThreshold) + '-' +
                      str(singleCopyThreshold))
        title = (taxonomyStr.replace(';', '; ') + '\n' +
                 'Ubiquity = %.2f, Single-copy = %.2f' %
                 (ubiquityThreshold, singleCopyThreshold))

        errorBar = ErrorBar()
        errorBar.plot(plotPrefix + '.errorbar.png',
                      arange(minGenomes, maxGenomes, stepSize),
                      meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes',
                      'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(plotPrefix + '.boxplot.png', markerSetSizes,
                     arange(minGenomes, maxGenomes, stepSize),
                     'Number of Genomes', 'Marker Set Size', True, title)
コード例 #9
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker genes and from marker sets.

        Builds the marker genes and co-located marker sets for a lineage,
        then repeatedly samples partial genomes and records, for each
        replicate, the estimated completion minus percentCompletion using
        (a) individual marker genes and (b) co-located marker sets. The
        per-genome deviations are drawn as a box plot under ./images/.

        Args:
            taxonomyStr: lineage string handed to IMG.genomeIdsByTaxonomy;
                its ';' characters are replaced when building the plot file
                name and title.
            ubiquityThreshold: multiplied by the number of genomes to obtain
                the ubiquity cutoff passed to IMG.markerGenes.
            singleCopyThreshold: multiplied by the number of genomes to
                obtain the single-copy cutoff passed to IMG.markerGenes.
            percentCompletion: completion passed to IMG.sampleGenome and
                subtracted from every estimate (the estimates are fractions,
                so this is presumably a fraction too; confirm with caller).
            numReplicates: number of partial genomes sampled per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome in the lineage.
            contigLen: contig length passed to IMG.sampleGenome and
                IMG.containedMarkerGenes.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() only accepts sequences on Python >= 3.11, so
            # pass a list copy in case genomeIds is a set
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(0, numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)

                # calculate completion with marker set: each co-located set
                # contributes the fraction of its members that were recovered
                comp = 0.0
                for cs in colocatedSets:
                    present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                    comp += float(present) / len(cs)
                msCompletion.append(comp / len(colocatedSets) - percentCompletion)

            # two rows per genome: marker genes first, marker sets second
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # only the first of the two rows carries a label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        # NOTE(review): the file name ends in '.errorbar.png' although a box
        # plot is written; kept unchanged so existing output paths still match.
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
コード例 #10
0
ファイル: simComparePlots.py プロジェクト: Ecogenomics/CheckM
    def refinementPlots(self, results):
        """Tabulate and plot simulation results for the IM, MS and RMS refinements.

        Test cases with completeness below 0.7 or contamination above 0.1 are
        ignored. The remaining results are pooled per taxon for the first
        three taxonomic ranks, written as a table to
        self.simCompareRefinementTableOut (taxa with at least 2 genomes) and
        drawn as a box plot to self.plotPrefix + '.refinements.png' (taxa
        with at least 10 genomes).

        Args:
            results: mapping from a simulation id of the form
                'genomeId-seqLen-comp-cont' to an indexable record in which
                positions 8/9 hold the IM, 10/11 the MS and 12/13 the RMS
                completeness/contamination values, each as a list.
        """
        def meanAbs(values):
            # mean of the absolute values
            return mean(abs(array(values)))

        def stdAbs(values):
            # standard deviation of the absolute values
            return std(abs(array(values)))

        # summarize results for different CheckM refinements
        print('  Tabulating results for different refinements.')

        metadata = self.img.genomeMetadata()

        itemsProcessed = 0
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()

        ranksToProcess = 3
        taxaByRank = [set() for _ in range(0, ranksToProcess)]

        overallCompIM = []
        overallContIM = []

        overallCompMS = []
        overallContMS = []

        overallCompRMS = []
        overallContRMS = []

        genomeInTaxon = defaultdict(set)

        numResults = len(results)
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed)*100/numResults)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            genomeId, seqLen, comp, cont = simId.split('-')
            taxonomy = metadata[genomeId]['taxonomy']

            if float(comp) < 0.7 or float(cont) > 0.1:
                continue

            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))

            simResults = results[simId]

            # pool the individual values (rather than appending whole lists)
            # so the overall means are still defined when test cases hold
            # different numbers of values; this matches how the per-taxon
            # lists below are built
            overallCompIM.extend(simResults[8])
            overallContIM.extend(simResults[9])

            overallCompMS.extend(simResults[10])
            overallContMS.extend(simResults[11])

            overallCompRMS.extend(simResults[12])
            overallContRMS.extend(simResults[13])

            for r in range(0, ranksToProcess):
                taxon = taxonomy[r]

                if taxon == 'unclassified':
                    continue

                taxaByRank[r].add(taxon)

                compDataDict[taxon]['IM'] += simResults[8]
                compDataDict[taxon]['MS'] += simResults[10]
                compDataDict[taxon]['RMS'] += simResults[12]

                contDataDict[taxon]['IM'] += simResults[9]
                contDataDict[taxon]['MS'] += simResults[11]
                contDataDict[taxon]['RMS'] += simResults[13]

                genomeInTaxon[taxon].add(genomeId)

        sys.stdout.write('\n')

        print('Creating plots for:')
        print('  comps =  ' + str(comps))
        print('  conts =  ' + str(conts))

        print('')
        print('    There are %d taxon.' % (len(compDataDict)))
        print('')
        print('Percentage change MS-IM comp: %.4f' % ((meanAbs(overallCompMS) - meanAbs(overallCompIM)) * 100 / meanAbs(overallCompIM)))
        print('Percentage change MS-IM cont: %.4f' % ((meanAbs(overallContMS) - meanAbs(overallContIM)) * 100 / meanAbs(overallContIM)))
        print('')
        # NOTE(review): the RMS-MS changes are expressed relative to the IM
        # mean, not the MS mean; left as found -- confirm this is intended.
        print('Percentage change RMS-MS comp: %.4f' % ((meanAbs(overallCompRMS) - meanAbs(overallCompMS)) * 100 / meanAbs(overallCompIM)))
        print('Percentage change RMS-MS cont: %.4f' % ((meanAbs(overallContRMS) - meanAbs(overallContMS)) * 100 / meanAbs(overallContIM)))

        print('')

        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)

        # print table of results organized by class
        with open(self.simCompareRefinementTableOut, 'w') as refinmentTableOut:
            for taxon in orderedTaxa:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 2: # skip groups with only a few genomes
                    continue

                refinmentTableOut.write(taxon + '\t' + str(numGenomes))

                meanByRefinement = {}
                for refineStr in ['IM', 'MS']:
                    meanTaxonComp = meanAbs(compDataDict[taxon][refineStr])
                    stdTaxonComp = stdAbs(compDataDict[taxon][refineStr])
                    meanTaxonCont = meanAbs(contDataDict[taxon][refineStr])
                    stdTaxonCont = stdAbs(contDataDict[taxon][refineStr])
                    meanByRefinement[refineStr] = (meanTaxonComp, meanTaxonCont)

                    refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))

                # percentage change from IM to MS, relative to IM
                imComp, imCont = meanByRefinement['IM']
                msComp, msCont = meanByRefinement['MS']
                perCompChange = (imComp - msComp) * 100 / imComp
                perContChange = (imCont - msCont) * 100 / imCont
                refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))

        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 10: # skip groups with only a few genomes
                continue

            # three consecutive rows per taxon (see rowsPerCategory below)
            for refineStr in ['RMS', 'MS', 'IM']:
                rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][refineStr])
                contData.append(contDataDict[taxon][refineStr])

        for i, rowLabel in enumerate(rowLabels):
            print(rowLabel + '\t%.2f\t%.2f' % (meanAbs(compData[i]), meanAbs(contData[i])))

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.refinements.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels,
                        r'$\Delta$' + ' % Completion', None,
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
コード例 #11
0
ファイル: markerSetTest.py プロジェクト: Ecogenomics/CheckM
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
        """Report how marker set size varies with the number of genomes used.

        For every genome count in range(minGenomes, maxGenomes, stepSize),
        `replicates` random genome subsets are drawn from the lineage, the
        co-located marker sets are built for each subset and the number of
        sets is recorded. Per-count statistics are printed, then an error-bar
        plot and a box plot are written under ./images/.

        Args:
            taxonomyStr: lineage string handed to IMG.genomeIdsByTaxonomy;
                its ';' characters are replaced when building plot file
                names and titles.
            ubiquityThreshold: multiplied by the subset size to obtain the
                ubiquity cutoff passed to MarkerGenes.identify.
            singleCopyThreshold: multiplied by the subset size to obtain the
                single-copy cutoff passed to MarkerGenes.identify.
            replicates: number of random subsets drawn per genome count.
            minGenomes: first genome count evaluated; the process exits if
                the lineage contains fewer genomes than this.
            maxGenomes: exclusive upper bound on the genome count; -1 or any
                value above the number of available genomes is clamped to
                the number of available genomes.
            stepSize: increment between consecutive genome counts.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

        # single-argument print(...) behaves the same on Python 2 and 3
        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')
        if len(genomeIds) < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            # exit with a failure status so calling scripts can see the error
            sys.exit(1)

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []

        # -1 requests all genomes; never ask for more than are available
        if maxGenomes == -1 or maxGenomes > len(genomeIds):
            maxGenomes = len(genomeIds)

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        # random.sample() only accepts sequences on Python >= 3.11, so take a
        # list copy once in case genomeIds is a set or other non-sequence
        genomeIdPool = list(genomeIds)

        for numGenomes in range(minGenomes, maxGenomes, stepSize):
            # every subset drawn below has exactly numGenomes members, so the
            # cutoffs are the same for all replicates of this genome count
            ubiquityCutoff = ubiquityThreshold*numGenomes
            singleCopyCutoff = singleCopyThreshold*numGenomes

            markerSetSize = []
            for _ in range(0, replicates):
                genomeIdSubset = random.sample(genomeIdPool, numGenomes)

                markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityCutoff, singleCopyCutoff)
                geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            meanMarkerSetSize.append(m)

            s = std(markerSetSize)
            stdMarkerSetSize.append(s)

            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityCutoff)) + ', Single-copy > ' + str(int(singleCopyCutoff)))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

        # plot data
        plotPrefix = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f, Single-copy = %.2f' % (ubiquityThreshold, singleCopyThreshold)

        errorBar = ErrorBar()
        errorBar.plot(plotPrefix + '.errorbar.png', arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(plotPrefix + '.boxplot.png', markerSetSizes, arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes', 'Marker Set Size', True, title)
コード例 #12
0
ファイル: simComparePlots.py プロジェクト: Ecogenomics/CheckM
 def taxonomicPlots(self, results):
     """Tabulate and plot simulation results per taxonomic group.

     Only test cases with a sequence length of 20000, a completeness of
     0.5, 0.7, 0.8 or 0.9 and a contamination of 0.05, 0.1 or 0.15 are
     used. Their results are pooled per taxon for the first three
     taxonomic ranks, written as a table to self.simCompareTaxonomyTableOut
     (taxa with at least 2 genomes) and drawn as a box plot to
     self.plotPrefix + '.taxonomy.png' (taxa with at least 10 genomes).

     Args:
         results: mapping from a simulation id of the form
             'genomeId-seqLen-comp-cont' to an indexable record in which
             positions 2/3 hold the 'best', 6/7 the 'domain' and 10/11 the
             'selected' completeness/contamination values, each as a list.
     """
     # summarize results for different taxonomic groups
     print('  Tabulating results for taxonomic groups.')

     metadata = self.img.genomeMetadata()

     itemsProcessed = 0
     compDataDict = defaultdict(lambda : defaultdict(list))
     contDataDict = defaultdict(lambda : defaultdict(list))
     comps = set()
     conts = set()
     seqLens = set()

     ranksToProcess = 3
     taxaByRank = [set() for _ in range(0, ranksToProcess)]

     overallComp = []
     overallCont = []

     # str(float(x)) normalises e.g. '0.10' to '0.1', so a '0.10' entry
     # could never match and is not listed
     compsOfInterest = ['0.5', '0.7', '0.8', '0.9']
     contsOfInterest = ['0.05', '0.1', '0.15']

     genomeInTaxon = defaultdict(set)
     testCases = 0
     numResults = len(results)
     for simId in results:
         itemsProcessed += 1
         statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed)*100/numResults)
         sys.stdout.write('%s\r' % statusStr)
         sys.stdout.flush()

         genomeId, seqLen, comp, cont = simId.split('-')

         if seqLen != '20000':
             continue

         if str(float(comp)) in compsOfInterest and str(float(cont)) in contsOfInterest:
             print(comp + ' ' + cont)
             taxonomy = metadata[genomeId]['taxonomy']

             testCases += 1

             comps.add(float(comp))
             conts.add(float(cont))
             seqLens.add(int(seqLen))

             simResults = results[simId]

             overallComp += simResults[10]
             overallCont += simResults[11]

             for r in range(0, ranksToProcess):
                 taxon = taxonomy[r]

                 if r == 0 and taxon == 'unclassified':
                     print('*****************************Unclassified at domain-level*****************')
                     continue

                 if taxon == 'unclassified':
                     continue

                 taxon = rankPrefixes[r] + taxon

                 taxaByRank[r].add(taxon)

                 compDataDict[taxon]['best'] += simResults[2]
                 compDataDict[taxon]['domain'] += simResults[6]
                 compDataDict[taxon]['selected'] += simResults[10]

                 contDataDict[taxon]['best'] += simResults[3]
                 contDataDict[taxon]['domain'] += simResults[7]
                 contDataDict[taxon]['selected'] += simResults[11]

                 genomeInTaxon[taxon].add(genomeId)

     sys.stdout.write('\n')

     print('Test cases ' + str(testCases))

     print('')
     print('Creating plots for:')
     print('  comps =  ' + str(comps))
     print('  conts =  ' + str(conts))

     print('')
     print('    There are %d taxa.' % (len(compDataDict)))

     print('')
     print('  Overall bias:')
     print('    Selected comp: %.2f' % mean(overallComp))
     print('    Selected cont: %.2f' % mean(overallCont))

     # get list of ordered taxa by rank
     orderedTaxa = []
     for taxa in taxaByRank:
         orderedTaxa += sorted(taxa)

     # plot data
     print('  Plotting results.')
     compData = []
     contData = []
     rowLabels = []
     for taxon in orderedTaxa:
         numGenomes = len(genomeInTaxon[taxon])
         if numGenomes < 10: # skip groups with only a few genomes
             continue

         # three consecutive rows per taxon (see rowsPerCategory below)
         for msStr in ['best', 'selected', 'domain']:
             rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
             compData.append(compDataDict[taxon][msStr])
             contData.append(contDataDict[taxon][msStr])

     for i, rowLabel in enumerate(rowLabels):
         print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))

     # print taxonomic table of results organized by class
     with open(self.simCompareTaxonomyTableOut, 'w') as taxonomyTableOut:
         for taxon in orderedTaxa:
             numGenomes = len(genomeInTaxon[taxon])
             if numGenomes < 2: # skip groups with only a few genomes
                 continue

             taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
             for msStr in ['domain', 'selected']:
                 meanTaxonComp = mean(abs(array(compDataDict[taxon][msStr])))
                 stdTaxonComp = std(abs(array(compDataDict[taxon][msStr])))
                 meanTaxonCont = mean(abs(array(contDataDict[taxon][msStr])))
                 stdTaxonCont = std(abs(array(contDataDict[taxon][msStr])))

                 taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))
             taxonomyTableOut.write('\n')

     # create box plot
     boxPlot = BoxPlot()
     plotFilename = self.plotPrefix +  '.taxonomy.png'
     boxPlot.plot(plotFilename, compData, contData, rowLabels,
                     r'$\Delta$' + ' % Completion', None,
                     r'$\Delta$' + ' % Contamination', None,
                     rowsPerCategory = 3, dpi = self.dpi)
コード例 #13
0
ファイル: simComparePlots.py プロジェクト: Ecogenomics/CheckM
    def conditionsPlot(self, results):
        # summarize results for each experimental condition  
        print '  Tabulating results for each experimental condition using marker sets.'
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        compOutliers = defaultdict(list)
        contOutliers = defaultdict(list)
        
        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            compDataDict[expCondStr]['best'] += results[simId][2]
            compDataDict[expCondStr]['domain'] += results[simId][6]
            compDataDict[expCondStr]['selected'] += results[simId][10]
            
            for dComp in results[simId][2]:
                compOutliers[expCondStr] += [[dComp, genomeId]]
            
            contDataDict[expCondStr]['best'] += results[simId][3]
            contDataDict[expCondStr]['domain'] += results[simId][7]
            contDataDict[expCondStr]['selected'] += results[simId][11]
            
            for dCont in results[simId][3]:
                contOutliers[expCondStr] += [[dCont, genomeId]]
                
        print '  There are %d unique genomes.' % len(genomeIds)
              
        sys.stdout.write('\n')
        
        print '    There are %d experimental conditions.' % (len(compDataDict))
                
        # plot data
        print '  Plotting results.'
        compData = []
        contData = []
        rowLabels = []
        
        foutComp = open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w')
        foutCont = open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w')
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]: 
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                    
                # report completenes outliers
                foutComp.write(expCondStr)

                compOutliers[expCondStr].sort()
                
                dComps = array([r[0] for r in compOutliers[expCondStr]])
                perc1 = scoreatpercentile(dComps, 1)
                perc99 = scoreatpercentile(dComps, 99)
                print expCondStr, perc1, perc99
                
                foutComp.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in compOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutComp.write('\t' + genomeId + ': ' + str(count))
                foutComp.write('\n')
                
                # report contamination outliers
                foutCont.write(expCondStr)

                contOutliers[expCondStr].sort()
                
                dConts = array([r[0] for r in contOutliers[expCondStr]])
                perc1 = scoreatpercentile(dConts, 1)
                perc99 = scoreatpercentile(dConts, 99)
                
                foutCont.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in contOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutCont.write('\t' + genomeId + ': ' + str(count))
                foutCont.write('\n')
                
        foutComp.close()
        foutCont.close()
                               
        print 'best:\t%.2f\t%.2f' % (mean(abs(array(compData[0::3]))), mean(abs(array(contData[0::3]))))
        print 'selected:\t%.2f\t%.2f' % (mean(abs(array(compData[1::3]))), mean(abs(array(contData[1::3]))))   
        print 'domain:\t%.2f\t%.2f' % (mean(abs(array(compData[2::3]))), mean(abs(array(contData[2::3]))))   

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.conditions.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
        
        # print table of results 
        tableOut = open(self.simCompareConditionOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                   
                    meanCompD = mean(abs(array(compDataDict[expCondStr]['domain'])))
                    stdCompD = std(abs(array(compDataDict[expCondStr]['domain'])))
                    meanContD = mean(abs(array(contDataDict[expCondStr]['domain'])))
                    stdContD = std(abs(array(contDataDict[expCondStr]['domain'])))
                    
                    avgComp[seqLen]['domain'] += compDataDict[expCondStr]['domain']
                    avgCont[seqLen]['domain'] += contDataDict[expCondStr]['domain']
                    
                    meanCompS = mean(abs(array(compDataDict[expCondStr]['selected'])))
                    stdCompS = std(abs(array(compDataDict[expCondStr]['selected'])))
                    meanContS = mean(abs(array(contDataDict[expCondStr]['selected'])))
                    stdContS = std(abs(array(contDataDict[expCondStr]['selected'])))
                    
                    avgComp[seqLen]['selected'] += compDataDict[expCondStr]['selected']
                    avgCont[seqLen]['selected'] += contDataDict[expCondStr]['selected']
                    
                    meanCompB = mean(abs(array(compDataDict[expCondStr]['best'])))
                    stdCompB = std(abs(array(compDataDict[expCondStr]['best'])))
                    meanContB = mean(abs(array(contDataDict[expCondStr]['best'])))
                    stdContB = std(abs(array(contDataDict[expCondStr]['best'])))
                    
                    avgComp[seqLen]['best'] += compDataDict[expCondStr]['best']
                    avgCont[seqLen]['best'] += contDataDict[expCondStr]['best']
                    
                    tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompD = mean(abs(array(avgComp[seqLen]['domain'])))
            stdCompD = std(abs(array(avgComp[seqLen]['domain'])))
            meanContD = mean(abs(array(avgCont[seqLen]['domain'])))
            stdContD = std(abs(array(avgCont[seqLen]['domain'])))
            
            meanCompS = mean(abs(array(avgComp[seqLen]['selected'])))
            stdCompS = std(abs(array(avgComp[seqLen]['selected'])))
            meanContS = mean(abs(array(avgCont[seqLen]['selected'])))
            stdContS = std(abs(array(avgCont[seqLen]['selected'])))
            
            meanCompB = mean(abs(array(avgComp[seqLen]['best'])))
            stdCompB = std(abs(array(avgComp[seqLen]['best'])))
            meanContB = mean(abs(array(avgCont[seqLen]['best'])))
            stdContB = std(abs(array(avgCont[seqLen]['best'])))
            
            tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                        
        tableOut.write('\n')     
                
        tableOut.close()
コード例 #14
0
ファイル: simComparePlots.py プロジェクト: Ecogenomics/CheckM
    def markerSets(self, results):
        """Summarize simulation results for individual markers (IM) vs marker sets (MS).

        Pools the per-simulation values by experimental condition, draws a
        box plot of the 20 kb conditions selected by self.compsToConsider and
        self.contsToConsider (written to self.plotPrefix + '.markerSets.png'),
        and writes a tab-separated table of mean +/- std of the absolute
        values to self.simCompareMarkerSetOut.

        Parameters:
          results - mapping from a simulation id of the form
                    '<genomeId>-<seqLen>-<comp>-<cont>' to an indexable record.
                    Entries 4 and 5 are lists pooled as the IM completeness and
                    contamination values, entries 6 and 7 as the MS values.

        Returns nothing; output goes to stdout, the plot file and the table file.
        """
        print('  Tabulating results for domain-level marker genes vs marker sets.')

        def absMeanStd(values):
            # mean and standard deviation of the absolute values of a pool
            absValues = abs(array(values))
            return mean(absValues), std(absValues)

        methods = ['IM', 'MS']
        tableSeqLens = [5000, 20000, 50000]
        rowFormat = '\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f'

        compDataDict = defaultdict(lambda: defaultdict(list))
        contDataDict = defaultdict(lambda: defaultdict(list))

        genomeIds = set()
        numResults = len(results)
        for itemsProcessed, simId in enumerate(results, 1):
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed)*100/numResults)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)

            # normalise the numeric fields so the key matches the
            # str(float) / str(int) keys rebuilt when plotting and tabulating
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

            record = results[simId]
            compDataDict[expCondStr]['IM'] += record[4]
            compDataDict[expCondStr]['MS'] += record[6]

            contDataDict[expCondStr]['IM'] += record[5]
            contDataDict[expCondStr]['MS'] += record[7]

        # finish the '\r'-terminated status line first; otherwise the next
        # message is printed on top of it
        sys.stdout.write('\n')

        print('  There are %d unique genomes.' % len(genomeIds))
        print('    There are %d experimental conditions.' % (len(compDataDict)))

        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []

        plotSeqLen = 20000  # only the 20 kb conditions are plotted
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                expCondStr = str(comp) + '-' + str(cont) + '-' + str(plotSeqLen)

                # rows alternate MS, IM; the [0::2] / [1::2] slices below and
                # rowsPerCategory = 2 rely on this order
                for msStr in ['MS', 'IM']:
                    rowLabels.append(msStr + ': %d%%, %d%%' % (comp*100, cont*100))
                    compData.append(compDataDict[expCondStr][msStr])
                    contData.append(contDataDict[expCondStr][msStr])

        print('MS:\t%.2f\t%.2f' % (mean(abs(array(compData[0::2]))), mean(abs(array(contData[0::2])))))
        print('IM:\t%.2f\t%.2f' % (mean(abs(array(compData[1::2]))), mean(abs(array(contData[1::2])))))

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.markerSets.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels,
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 2, dpi = self.dpi)

        # print table of results; the context manager closes the file even if
        # a statistic or a write raises part-way through
        with open(self.simCompareMarkerSetOut, 'w') as tableOut:
            tableOut.write('Comp. (%)\tCont. (%)\tIM (5kb)\t\tMS (5kb)\t\tIM (20kb)\t\tMS (20kb)\t\tIM (50kb)\t\tMS (50kb)\n')

            # values pooled over every comp/cont combination, per sequence length
            avgComp = defaultdict(lambda: defaultdict(list))
            avgCont = defaultdict(lambda: defaultdict(list))
            for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
                for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:

                    tableOut.write('%d\t%d' % (comp*100, cont*100))

                    for seqLen in tableSeqLens:
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        condComp = compDataDict[expCondStr]
                        condCont = contDataDict[expCondStr]

                        meanCompIM, stdCompIM = absMeanStd(condComp['IM'])
                        meanContIM, stdContIM = absMeanStd(condCont['IM'])
                        meanCompMS, stdCompMS = absMeanStd(condComp['MS'])
                        meanContMS, stdContMS = absMeanStd(condCont['MS'])

                        for msStr in methods:
                            avgComp[seqLen][msStr] += condComp[msStr]
                            avgCont[seqLen][msStr] += condCont[msStr]

                        # NOTE(review): values are written as comp IM, comp MS,
                        # cont IM, cont MS, while the header groups the columns
                        # as IM / MS -- confirm which layout is intended
                        tableOut.write(rowFormat % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                    tableOut.write('\n')

            tableOut.write('\tAverage:')
            for seqLen in tableSeqLens:
                meanCompIM, stdCompIM = absMeanStd(avgComp[seqLen]['IM'])
                meanContIM, stdContIM = absMeanStd(avgCont[seqLen]['IM'])
                meanCompMS, stdCompMS = absMeanStd(avgComp[seqLen]['MS'])
                meanContMS, stdContMS = absMeanStd(avgCont[seqLen]['MS'])

                tableOut.write(rowFormat % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))

            tableOut.write('\n')
コード例 #15
0
ファイル: markerSetLOO.py プロジェクト: Ecogenomics/CheckM
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
        """Leave-one-out test of how much each lineage's marker set changes.

        For 'prokaryotes' plus every lineage returned by
        img.lineagesByCriteria(minGenomes, mostSpecificRank), the marker genes
        are determined once on all trusted genomes and once more with each
        genome removed in turn. The differences in marker set size are
        reported on stdout and drawn as one box per lineage.

        Parameters:
          ubiquityThreshold   - multiplied by the genome count to get the
                                ubiquity cutoff passed to img.markerGenes
          singleCopyThreshold - multiplied by the genome count to get the
                                single-copy cutoff passed to img.markerGenes
          minGenomes          - forwarded to img.lineagesByCriteria
          mostSpecificRank    - forwarded to img.lineagesByCriteria
          minMarkers          - lineages whose full marker set is smaller than
                                this are skipped
        """
        for caption, value in (('Ubiquity threshold: ', ubiquityThreshold),
                               ('Single-copy threshold: ', singleCopyThreshold),
                               ('Min. genomes: ', minGenomes),
                               ('Most specific taxonomic rank: ', mostSpecificRank)):
            print(caption + str(value))

        img = IMG()

        lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)

        deltaMarkerSetSizes = []
        boxPlotLabels = []
        for lineage in lineages:
            lineageGenomes = img.genomeIdsByTaxonomy(lineage)
            trusted = img.trustedGenomes()
            genomeIds = list(lineageGenomes.intersection(trusted))
            genomeCount = len(genomeIds)

            # number of genomes left once one has been held out
            numGenomes = genomeCount - 1

            print('')
            print('Lineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            unfilteredTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, unfilteredTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            # reference marker set on all genomes, using the held-out-size cutoffs
            fullMarkerSet = img.markerGenes(genomeIds, pfamTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)
            fullMarkerSetSize = len(fullMarkerSet)
            if fullMarkerSetSize < minMarkers:
                continue

            leafName = lineage.split(';')[-1].strip()
            boxPlotLabels.append(leafName + ' (' + str(genomeCount) + ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            for loo in range(genomeCount):
                # a slice starting past the end is empty, so the last genome
                # needs no special case
                genomeIdSubset = genomeIds[:loo] + genomeIds[loo+1:]
                subsetSize = len(genomeIdSubset)

                looMarkerSet = img.markerGenes(genomeIdSubset, pfamTable, ubiquityThreshold*subsetSize, singleCopyThreshold*subsetSize)
                looMarkerSetSize = len(looMarkerSet)
                deltaMarkerSetSize.append(fullMarkerSetSize - looMarkerSetSize)

                if looMarkerSetSize > fullMarkerSetSize:
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            deltaMean = mean(deltaMarkerSetSize)
            deltaStd = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityThreshold*numGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold*numGenomes)))
            print('  Delta Mean: %.2f +/- %.2f' % (deltaMean, deltaStd))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
        title = ('Ubiquity = %.2f' % ubiquityThreshold) + (', Single-copy = %.2f' % singleCopyThreshold)
        yLabel = r'$\Delta$' + ' Marker Set Size'

        boxPlot = BoxPlot()
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, yLabel, '', False, title)
コード例 #16
0
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker sets built at each rank of a lineage.

        A marker set is built for every prefix of taxonomyStr (for example
        'a', 'a;b', 'a;b;c'). Genomes are then repeatedly sub-sampled with
        img.sampleGenome, and for each marker set the estimated completion
        minus percentCompletion is recorded. One box per (genome, lineage)
        pair is plotted.

        Parameters:
          taxonomyStr         - ';'-separated taxonomy string
          mostSpecificRank    - not used by this method
          minGenomes          - not used by this method
          ubiquityThreshold   - multiplied by the genome count to get the
                                ubiquity cutoff passed to img.markerGenes
          singleCopyThreshold - multiplied by the genome count to get the
                                single-copy cutoff passed to img.markerGenes
          percentCompletion   - passed to img.sampleGenome and subtracted from
                                every estimate
          numReplicates       - number of sampled partial genomes per genome
          numGenomes          - number of genomes to sample at random, or -1
                                to use all of them
          contigLen           - passed to img.sampleGenome and
                                img.containedMarkerGenes
        """
        img = IMG()

        # every prefix of the taxonomy, from the first rank alone to the full string
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r+1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still holds the genomes of the last
        # lineage processed above, i.e. the full taxonomy string
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using the marker sets of every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker set
                for lineageMarkers, lineageDistTable, lineageSets, lineageDeltas in zip(markerGenes, geneDistTable, colocatedSets, completion):
                    containedMarkerGenes = img.containedMarkerGenes(lineageMarkers, lineageDistTable[genomeId], startPartialGenomeContigs, contigLen)

                    # each co-located set contributes the fraction of its members found
                    comp = 0.0
                    for colocatedSet in lineageSets:
                        present = sum(1 for markerId in colocatedSet if markerId in containedMarkerGenes)
                        comp += float(present) / len(colocatedSet)

                    lineageDeltas.append(comp / len(lineageSets) - percentCompletion)

            # exactly one label per plotted series; adding the label inside the
            # replicate loop produced numReplicates labels for every series
            for lineage, lineageDeltas in zip(lineages, completion):
                plotLabels.append(genomeId + '  - ' + lineage)
                plotData.append(lineageDeltas)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)