Beispiel #1
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
        """Tabulate marker set sizes per lineage over a sweep of thresholds.

        Writes ./data/markerSetSize.tsv: a header row listing the thresholds,
        then one row per lineage holding the lineage, its number of trusted
        genomes and the marker set size at each threshold. Progress is also
        printed to stdout.

        Args:
            minThreshold: stop value of the sweep (passed as the stop argument
                of arange; presumably numpy.arange, which excludes it).
            maxThreshold: value the sweep starts from.
            stepSize: amount subtracted between successive thresholds.
            minGenomes: lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: forwarded unchanged to IMG.lineagesSorted.
        """
        img = IMG()
        trustedGenomeIds = img.trustedGenomes()

        # Build the sweep once so the header and every row share its columns.
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        # The context manager closes the file even if an IMG call raises.
        with open("./data/markerSetSize.tsv", "w") as fout:
            fout.write("Lineage\t# genomes")
            for threshold in thresholds:
                fout.write("\t" + str(threshold))
            fout.write("\n")

            for lineage in img.lineagesSorted(mostSpecificRanks):
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))
                numGenomes = len(genomeIds)
                if numGenomes < minGenomes:
                    continue

                # Single-argument print() behaves the same on Python 2 and 3.
                print("\nLineage " + lineage + " contains " + str(numGenomes) + " genomes.")
                fout.write(lineage + "\t" + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # The same cutoff is used for both threshold arguments.
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)
                    fout.write("\t" + str(len(markerSet)))
                    print("  Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))
                fout.write("\n")
Beispiel #2
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets against the threshold.

        For every threshold in the sweep the marker genes of the lineage are
        determined and grouped into co-located sets; the number of sets is
        plotted to ./images/markerSetSize.<lineage>.png.

        Note that the progress line labelled "marker set size" prints the
        number of marker genes, while the plotted value is the number of
        co-located sets.

        Args:
            taxonomyStr: ';'-separated lineage string identifying the genomes.
            minThreshold: stop value of the sweep (stop argument of arange).
            maxThreshold: value the sweep starts from.
            stepSize: amount subtracted between successive thresholds.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        numGenomes = len(genomeIds)
        print('Lineage ' + taxonomyStr + ' contains ' + str(numGenomes) + ' genomes.')

        countTable = img.countTable(genomeIds)

        markerSetSizes = []
        for threshold in arange(maxThreshold, minThreshold, -stepSize):
            # The same cutoff is used for both threshold arguments.
            cutoff = threshold * numGenomes
            markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedSets = img.colocatedSets(img.colocatedGenes(geneDistTable), markerGenes)
            markerSetSizes.append(len(colocatedSets))

            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

        # plot data
        xValues = arange(maxThreshold, minThreshold, -stepSize)
        plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';','_') + '.png'
        title = taxonomyStr.replace(';', '; ')
        LinePlot().plot(plotFilename, xValues, markerSetSizes, 'Threshold', 'Marker Set Size', title)
Beispiel #3
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes,
            mostSpecificRanks):
        """Write a table of marker set sizes per lineage and threshold.

        Output goes to ./data/markerSetSize.tsv: a header row with the
        thresholds followed by one row per lineage containing the lineage
        name, its trusted genome count and the marker set size obtained at
        each threshold. The same information is printed to stdout.

        Args:
            minThreshold: stop value of the sweep (stop argument of arange;
                presumably numpy.arange, which excludes it).
            maxThreshold: value the sweep starts from.
            stepSize: amount subtracted between successive thresholds.
            minGenomes: lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: forwarded unchanged to IMG.lineagesSorted.
        """
        img = IMG()
        trustedGenomeIds = img.trustedGenomes()

        # Compute the sweep once; header and rows then agree by construction.
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        # 'with' guarantees the file is closed even when an IMG call raises.
        with open('./data/markerSetSize.tsv', 'w') as fout:
            fout.write('Lineage\t# genomes')
            for threshold in thresholds:
                fout.write('\t' + str(threshold))
            fout.write('\n')

            for lineage in img.lineagesSorted(mostSpecificRanks):
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))
                numGenomes = len(genomeIds)
                if numGenomes < minGenomes:
                    continue

                print('\nLineage ' + lineage + ' contains ' + str(numGenomes) +
                      ' genomes.')
                fout.write(lineage + '\t' + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # The same cutoff is used for both threshold arguments.
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable, cutoff,
                                                cutoff)
                    fout.write('\t' + str(len(markerSet)))
                    print('  Threshold = %.2f, marker set size = %d' %
                          (threshold, len(markerSet)))
                fout.write('\n')
Beispiel #4
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets against the threshold.

        For each threshold in the sweep, marker genes are identified for the
        lineage and grouped into co-located sets. The number of sets per
        threshold is plotted to ./images/markerSetSize.<lineage>.png.

        The progress line labelled "marker set size" reports the number of
        marker genes, whereas the plotted value is the number of co-located
        sets.

        Args:
            taxonomyStr: ';'-separated lineage string identifying the genomes.
            minThreshold: stop value of the sweep (stop argument of arange).
            maxThreshold: value the sweep starts from.
            stepSize: amount subtracted between successive thresholds.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        numGenomes = len(genomeIds)

        # Single-argument print() behaves the same on Python 2 and 3.
        print('Lineage ' + taxonomyStr + ' contains ' + str(numGenomes) + ' genomes.')

        countTable = img.countTable(genomeIds)

        # One sweep shared by the loop and the plot keeps x and y aligned.
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        markerSetSizes = []
        for threshold in thresholds:
            # The same cutoff is used for both threshold arguments.
            cutoff = threshold * numGenomes
            markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSizes.append(len(colocatedSets))

            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

        # plot data
        plot = LinePlot()
        plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';','_') + '.png'
        title = taxonomyStr.replace(';', '; ')
        plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Measure how much a lineage's marker set changes when genomes are withheld.

        For every lineage with at least minGenomes genomes and at least
        minMarkers marker genes, a random subset of the genomes is drawn
        numReplicates times, the marker genes are recomputed on the subset,
        and the size of the symmetric difference to the full marker set is
        recorded. Mean and standard deviation of these sizes (absolute and
        as a percentage of the full marker set) are written to
        ./data/lineage_evaluation.tsv and summarised on stdout.

        Args:
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            minGenomes: minimum number of genomes a lineage must contain.
            minMarkers: minimum number of marker genes a lineage must yield.
            mostSpecificRank: forwarded to IMG.lineagesByCriteria.
            percentGenomes: fraction (0-1) of genomes withheld per replicate.
            numReplicates: number of random subsets drawn per lineage.
        """
        img = IMG()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # The context manager closes the file even if a lineage fails mid-run.
        with open('./data/lineage_evaluation.tsv', 'w') as fout:
            fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

            for lineage in lineages:
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                if len(genomeIds) < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

                # calculate marker set for all genomes
                markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
                numMarkers = len(markerGenes)
                if numMarkers < minMarkers:
                    continue

                # Single-argument print() behaves the same on Python 2 and 3.
                print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')
                print('  Marker genes: ' + str(numMarkers))

                fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' + str(numMarkers) + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

                # withhold select percentage of genomes and calculate new marker set
                # (subset size is rounded to the nearest integer and is loop-invariant)
                subsetSize = int((1.0-percentGenomes)*len(genomeIds) + 0.5)
                changeMarkerSetSize = []
                for _ in range(numReplicates):
                    # NOTE(review): random.sample needs a sequence on Python 3.11+;
                    # confirm genomeIds is not a set.
                    subsetGenomeIds = random.sample(genomeIds, subsetSize)

                    newMarkerGenes = img.markerGenes(subsetGenomeIds, countTable, ubiquityThreshold*len(subsetGenomeIds), singleCopyThreshold*len(subsetGenomeIds))

                    changeMarkerSetSize.append(len(newMarkerGenes.symmetric_difference(markerGenes)))

                m = mean(changeMarkerSetSize)
                s = std(changeMarkerSetSize)

                print('  Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, (m + 2*s) * 100 / numMarkers))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s, m * 100 / numMarkers, (m + s) * 100 / numMarkers, (m + 2*s) * 100 / numMarkers) + '\n')
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker genes and co-located marker sets.

        Marker genes and co-located marker sets are built for the lineage.
        Each selected genome is then sampled numReplicates times via
        IMG.sampleGenome, and for each sample the deviation of the estimated
        completion from percentCompletion is recorded twice: once from the
        fraction of marker genes found, once from the mean per-set fraction.
        The deviations are drawn as a box plot under ./images/.

        Args:
            taxonomyStr: ';'-separated lineage string identifying the genomes.
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            percentCompletion: target completion passed to IMG.sampleGenome;
                subtracted from each estimate, so it is on the same 0-1 scale
                as the estimated fractions.
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to pick at random, or -1 for all.
            contigLen: contig length forwarded to the IMG sampling helpers.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        lineageSize = len(genomeIds)
        print('\nLineage ' + taxonomyStr + ' contains ' + str(lineageSize) +
              ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable,
                                      ubiquityThreshold * lineageSize,
                                      singleCopyThreshold * lineageSize)
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                          spacingBetweenContigs=1e6)
        colocatedSets = img.colocatedSets(img.colocatedGenes(geneDistTable),
                                          markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes (-1 means "use every genome")
        # NOTE(review): random.sample needs a sequence on Python 3.11+;
        # confirm genomeIds is not a set.
        if numGenomes != -1:
            rndGenomeIds = random.sample(genomeIds, numGenomes)
        else:
            rndGenomeIds = genomeIds

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            geneDeltas = []
            setDeltas = []
            for _ in range(numReplicates):
                sampledContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # completion from individual marker genes
                found = img.containedMarkerGenes(markerGenes,
                                                 geneDistTable[genomeId],
                                                 sampledContigs, contigLen)
                geneDeltas.append(
                    float(len(found)) / len(markerGenes) - percentCompletion)

                # completion from marker sets: mean fraction of each set found
                setTotal = 0.0
                for markerSet in colocatedSets:
                    hits = sum(1 for gene in markerSet if gene in found)
                    setTotal += float(hits) / len(markerSet)
                setDeltas.append(setTotal / len(colocatedSets) -
                                 percentCompletion)

            # Two series per genome; only the first carries a visible label.
            plotData.append(geneDeltas)
            plotData.append(setDeltas)

            species = ' '.join(
                metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        lineageTag = taxonomyStr.replace(';', '_')
        plotFilename = ('./images/sim.MGvsMS.' + lineageTag + '.' +
                        str(percentCompletion) + '.errorbar.png')
        title = (taxonomyStr.replace(';', '; ') + '\n' +
                 'Percent completion = %.2f' % percentCompletion)
        BoxPlot().plot(plotFilename, plotData, plotLabels,
                       r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
            singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
            contigLen):
        """Compare completion estimates from marker sets of nested lineages.

        taxonomyStr is expanded into every prefix lineage (first rank only,
        first two ranks, ...). Marker genes and co-located marker sets are
        built for each prefix. Genomes of the full lineage are then sampled
        numReplicates times and, for each prefix lineage's marker sets, the
        deviation of the estimated completion from percentCompletion is
        recorded. One box per (genome, lineage) pair is plotted under
        ./images/.

        Args:
            taxonomyStr: ';'-separated lineage string.
            mostSpecificRank: unused by this method.
            minGenomes: unused by this method.
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            percentCompletion: target completion passed to IMG.sampleGenome
                and subtracted from each estimate.
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to pick at random, or -1 for all.
            contigLen: contig length forwarded to the IMG sampling helpers.
        """
        img = IMG()

        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable,
                                 ubiquityThreshold * len(genomeIds),
                                 singleCopyThreshold * len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes
        # genomeIds still refers to the last lineage processed above, i.e. the
        # complete taxonomyStr.
        # NOTE(review): random.sample needs a sequence on Python 3.11+;
        # confirm genomeIds is not a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using the marker sets of each lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with marker set
                for mg, mdt, sets, results in zip(markerGenes, geneDistTable,
                                                  colocatedSets, completion):
                    containedMarkerGenes = img.containedMarkerGenes(
                        mg, mdt[genomeId], startPartialGenomeContigs,
                        contigLen)

                    comp = 0.0
                    for cs in sets:
                        present = sum(1 for gene in cs
                                      if gene in containedMarkerGenes)
                        comp += float(present) / len(cs)

                    results.append(comp / len(sets) - percentCompletion)

            # Exactly one label per data series. Labels were previously added
            # inside the replicate loop, producing numReplicates times too
            # many labels and misaligning them with plotData.
            for lineage, d in zip(lineages, completion):
                plotData.append(d)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Evaluate marker set stability per lineage under random genome removal.

        For each lineage with at least minGenomes genomes and at least
        minMarkers marker genes, numReplicates random genome subsets are
        drawn. Marker genes are recomputed on each subset and compared with
        the full marker set via the size of their symmetric difference. The
        mean and standard deviation of those sizes, in absolute terms and as
        percentages of the full marker set, are written to
        ./data/lineage_evaluation.tsv and summarised on stdout.

        Args:
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            minGenomes: minimum number of genomes a lineage must contain.
            minMarkers: minimum number of marker genes a lineage must yield.
            mostSpecificRank: forwarded to IMG.lineagesByCriteria.
            percentGenomes: fraction (0-1) of genomes withheld per replicate.
            numReplicates: number of random subsets drawn per lineage.
        """
        img = IMG()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # 'with' guarantees the report is closed even if a lineage fails.
        with open('./data/lineage_evaluation.tsv', 'w') as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n'
            )

            for lineage in lineages:
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                if len(genomeIds) < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                countTable = img.filterTable(genomeIds, countTable,
                                             ubiquityThreshold * 0.9,
                                             singleCopyThreshold * 0.9)

                # calculate marker set for all genomes
                markerGenes = img.markerGenes(
                    genomeIds, countTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                numMarkers = len(markerGenes)
                if numMarkers < minMarkers:
                    continue

                # Single-argument print() behaves the same on Python 2 and 3.
                print('\nLineage ' + lineage + ' contains ' +
                      str(len(genomeIds)) + ' genomes.')
                print('  Marker genes: ' + str(numMarkers))

                fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' +
                           str(numMarkers) + '\t%.2f' % percentGenomes +
                           '\t' + str(numReplicates))

                # withhold select percentage of genomes and calculate new marker set
                # (subset size is rounded to the nearest integer and is loop-invariant)
                subsetSize = int((1.0 - percentGenomes) * len(genomeIds) + 0.5)
                changeMarkerSetSize = []
                for _ in range(numReplicates):
                    # NOTE(review): random.sample needs a sequence on Python
                    # 3.11+; confirm genomeIds is not a set.
                    subsetGenomeIds = random.sample(genomeIds, subsetSize)

                    newMarkerGenes = img.markerGenes(
                        subsetGenomeIds, countTable,
                        ubiquityThreshold * len(subsetGenomeIds),
                        singleCopyThreshold * len(subsetGenomeIds))

                    changeMarkerSetSize.append(
                        len(newMarkerGenes.symmetric_difference(markerGenes)))

                m = mean(changeMarkerSetSize)
                s = std(changeMarkerSetSize)

                print('  Mean: %.2f, Std: %.2f, Per: %.2f' %
                      (m, s, (m + 2 * s) * 100 / numMarkers))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' %
                           (m, s, m * 100 / numMarkers,
                            (m + s) * 100 / numMarkers,
                            (m + 2 * s) * 100 / numMarkers) + '\n')
Beispiel #9
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers):
        """Leave-one-out test of marker set size for each lineage.

        For 'prokaryotes' plus every lineage returned by
        IMG.lineagesByCriteria, a reference marker set is computed over all
        trusted genomes, with thresholds scaled by (genome count - 1). Each
        genome is then left out in turn, the marker set is recomputed, and the
        reduction in size relative to the reference is recorded. Lineages
        whose reference set has fewer than minMarkers genes are skipped. The
        per-lineage deltas are summarised on stdout and drawn as a box plot
        under ./images/.

        Args:
            ubiquityThreshold: multiplied by a genome count to form the third
                argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by a genome count to form the
                fourth argument of IMG.markerGenes.
            minGenomes: forwarded to IMG.lineagesByCriteria.
            mostSpecificRank: forwarded to IMG.lineagesByCriteria.
            minMarkers: minimum reference marker set size for a lineage.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []
        boxPlotLabels = []

        lineages = ['prokaryotes'] + img.lineagesByCriteria(
            minGenomes, mostSpecificRank)
        for lineage in lineages:
            candidates = img.genomeIdsByTaxonomy(lineage)
            trusted = img.trustedGenomes()
            genomeIds = list(candidates.intersection(trusted))
            genomeCount = len(genomeIds)

            print('')
            print('Lineage ' + lineage + ' contains ' + str(genomeCount) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                            ubiquityThreshold * 0.9,
                                            singleCopyThreshold * 0.9)

            # Reference set: all genomes, thresholds scaled to genomeCount - 1.
            referenceSet = img.markerGenes(
                genomeIds, pfamTable, ubiquityThreshold * (genomeCount - 1),
                singleCopyThreshold * (genomeCount - 1))
            fullMarkerSetSize = len(referenceSet)
            if fullMarkerSetSize < minMarkers:
                continue

            shortName = lineage.split(';')[-1].strip()
            boxPlotLabels.append(shortName + ' (' + str(genomeCount) + ', ' +
                                 str(fullMarkerSetSize) + ')')

            looCount = genomeCount - 1
            deltaMarkerSetSize = []
            for loo in range(genomeCount):
                # Slicing past the end yields an empty list, so dropping the
                # last genome needs no special case.
                genomeIdSubset = genomeIds[0:loo] + genomeIds[loo + 1:]

                looSet = img.markerGenes(
                    genomeIdSubset, pfamTable,
                    ubiquityThreshold * len(genomeIdSubset),
                    singleCopyThreshold * len(genomeIdSubset))
                deltaMarkerSetSize.append(fullMarkerSetSize - len(looSet))

                if len(looSet) > fullMarkerSetSize:
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' +
                  str(int(ubiquityThreshold * looCount)) +
                  ', LOO Single-copy >= ' +
                  str(int(singleCopyThreshold * looCount)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' %
                  (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        plotFilename = ('./images/LOO.' + str(ubiquityThreshold) + '-' +
                        str(singleCopyThreshold) + '.boxplot.png')
        title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
        BoxPlot().plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                       r'$\Delta$' + ' Marker Set Size', '', False, title)
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker genes and co-located marker sets.

        Marker genes and co-located marker sets are built for the lineage.
        Each selected genome is sampled numReplicates times via
        IMG.sampleGenome. For every sample, the deviation of the estimated
        completion from percentCompletion is recorded using (a) the fraction
        of marker genes found and (b) the mean per-set fraction found. The
        deviations are drawn as a box plot under ./images/.

        Args:
            taxonomyStr: ';'-separated lineage string identifying the genomes.
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            percentCompletion: target completion passed to IMG.sampleGenome
                and subtracted from each estimate.
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to pick at random, or -1 for all.
            contigLen: contig length forwarded to the IMG sampling helpers.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        # Single-argument print() behaves the same on Python 2 and 3.
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes (-1 means "use every genome")
        # NOTE(review): random.sample needs a sequence on Python 3.11+;
        # confirm genomeIds is not a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)

                # calculate completion with marker set: mean fraction of each set found
                comp = 0.0
                for cs in colocatedSets:
                    present = sum(1 for gene in cs if gene in containedMarkerGenes)
                    comp += float(present) / len(cs)
                msCompletion.append(comp / len(colocatedSets) - percentCompletion)

            # Two series per genome; only the first carries a visible label.
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
Beispiel #11
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
        """Leave-one-out test of marker set size for each lineage.

        For 'prokaryotes' plus every lineage returned by
        IMG.lineagesByCriteria, a reference marker set is computed over all
        trusted genomes, with thresholds scaled by (genome count - 1). Each
        genome is then left out in turn, the marker set is recomputed, and the
        reduction in size relative to the reference is recorded. Lineages
        whose reference set has fewer than minMarkers genes are skipped. The
        per-lineage deltas are summarised on stdout and drawn as a box plot
        under ./images/.

        Args:
            ubiquityThreshold: multiplied by a genome count to form the third
                argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by a genome count to form the
                fourth argument of IMG.markerGenes.
            minGenomes: forwarded to IMG.lineagesByCriteria.
            mostSpecificRank: forwarded to IMG.lineagesByCriteria.
            minMarkers: minimum reference marker set size for a lineage.
        """
        # Single-argument print() behaves the same on Python 2 and 3.
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
        lineages = ['prokaryotes'] + lineages

        boxPlotLabels = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            trusted = img.trustedGenomes()
            genomeIds = list(genomeIds.intersection(trusted))

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            # Reference set: all genomes, thresholds scaled to len(genomeIds) - 1.
            markerSet = img.markerGenes(genomeIds, pfamTable, ubiquityThreshold*(len(genomeIds)-1), singleCopyThreshold*(len(genomeIds)-1))
            fullMarkerSetSize = len(markerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            boxPlotLabels.append(lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            numGenomes = len(genomeIds)-1

            for loo in range(len(genomeIds)):
                # Slicing past the end yields an empty list, so dropping the
                # last genome needs no special case.
                genomeIdSubset = genomeIds[0:loo] + genomeIds[loo+1:]

                markerSet = img.markerGenes(genomeIdSubset, pfamTable, ubiquityThreshold*len(genomeIdSubset), singleCopyThreshold*len(genomeIdSubset))
                deltaMarkerSetSize.append(fullMarkerSetSize - len(markerSet))

                if fullMarkerSetSize < len(markerSet):
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityThreshold*numGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold*numGenomes)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
        title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker sets of nested lineages.

        taxonomyStr is expanded into every prefix lineage (first rank only,
        first two ranks, ...). Marker genes and co-located marker sets are
        built for each prefix. Genomes of the full lineage are then sampled
        numReplicates times and, for each prefix lineage's marker sets, the
        deviation of the estimated completion from percentCompletion is
        recorded. One box per (genome, lineage) pair is plotted under
        ./images/.

        Args:
            taxonomyStr: ';'-separated lineage string.
            mostSpecificRank: unused by this method.
            minGenomes: unused by this method.
            ubiquityThreshold: multiplied by the genome count to form the
                third argument of IMG.markerGenes.
            singleCopyThreshold: multiplied by the genome count to form the
                fourth argument of IMG.markerGenes.
            percentCompletion: target completion passed to IMG.sampleGenome
                and subtracted from each estimate.
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to pick at random, or -1 for all.
            contigLen: contig length forwarded to the IMG sampling helpers.
        """
        img = IMG()

        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r+1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            # Single-argument print() behaves the same on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes
        # genomeIds still refers to the last lineage processed above, i.e. the
        # complete taxonomyStr.
        # NOTE(review): random.sample needs a sequence on Python 3.11+;
        # confirm genomeIds is not a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using the marker sets of each lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker set
                for mg, mdt, sets, results in zip(markerGenes, geneDistTable, colocatedSets, completion):
                    containedMarkerGenes = img.containedMarkerGenes(mg, mdt[genomeId], startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for cs in sets:
                        present = sum(1 for gene in cs if gene in containedMarkerGenes)
                        comp += float(present) / len(cs)

                    results.append(comp / len(sets) - percentCompletion)

            # Exactly one label per data series. Labels were previously added
            # inside the replicate loop, producing numReplicates times too
            # many labels and misaligning them with plotData.
            for lineage, d in zip(lineages, completion):
                plotData.append(d)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)