Esempio n. 1
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets against the threshold.

        For every threshold produced by
        arange(maxThreshold, minThreshold, -stepSize), marker genes are
        identified for the lineage using threshold * (number of genomes) for
        both count cut-offs handed to img.markerGenes. The marker genes are
        grouped into co-located sets and the number of sets is recorded.
        The results are drawn as a line plot saved to
        ./images/markerSetSize.<lineage>.png.

        Args:
            taxonomyStr: lineage string used to select genomes; ';' is
                replaced by '_' in the file name and by '; ' in the title.
            minThreshold: end of the scan (excluded, assuming arange is
                numpy.arange).
            maxThreshold: first threshold evaluated.
            stepSize: amount the threshold is decreased at each step.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        numGenomes = len(genomeIds)

        # single-argument print(...) behaves the same on Python 2 and 3
        print('Lineage ' + taxonomyStr + ' contains ' + str(numGenomes) + ' genomes.')

        # evaluate the scan range once so the plotted x-values are guaranteed
        # to pair up with the recorded sizes
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        markerSetSizes = []
        countTable = img.countTable(genomeIds)
        for threshold in thresholds:
            # the same cut-off is used for both count arguments
            minCount = threshold * numGenomes
            markerGenes = img.markerGenes(genomeIds, countTable, minCount, minCount)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSizes.append(len(colocatedSets))

            # report the value that is actually recorded and plotted as the
            # marker set size (previously the marker gene count was printed)
            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(colocatedSets)))

        # plot data
        plot = LinePlot()
        plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';', '_') + '.png'
        title = taxonomyStr.replace(';', '; ')
        plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
Esempio n. 2
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets against the threshold.

        For every threshold produced by
        arange(maxThreshold, minThreshold, -stepSize), marker genes are
        identified for the lineage using threshold * (number of genomes) for
        both count cut-offs handed to img.markerGenes. The marker genes are
        grouped into co-located sets and the number of sets is recorded.
        The results are drawn as a line plot saved to
        ./images/markerSetSize.<lineage>.png.

        Args:
            taxonomyStr: lineage string used to select genomes; ';' is
                replaced by '_' in the file name and by '; ' in the title.
            minThreshold: end of the scan (excluded, assuming arange is
                numpy.arange).
            maxThreshold: first threshold evaluated.
            stepSize: amount the threshold is decreased at each step.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        numGenomes = len(genomeIds)

        print('Lineage ' + taxonomyStr + ' contains ' + str(numGenomes) + ' genomes.')

        # evaluate the scan range once so the plotted x-values are guaranteed
        # to pair up with the recorded sizes
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        markerSetSizes = []
        countTable = img.countTable(genomeIds)
        for threshold in thresholds:
            # the same cut-off is used for both count arguments
            minCount = threshold * numGenomes
            markerGenes = img.markerGenes(genomeIds, countTable, minCount, minCount)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSizes.append(len(colocatedSets))

            # report the value that is actually recorded and plotted as the
            # marker set size (previously the marker gene count was printed)
            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(colocatedSets)))

        # plot data
        plot = LinePlot()
        plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';', '_') + '.png'
        title = taxonomyStr.replace(';', '; ')
        plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker genes and marker sets.

        Marker genes and co-located marker sets are built for the lineage.
        For each selected genome, numReplicates partial genomes are drawn
        with img.sampleGenome and completion is estimated two ways: as the
        fraction of marker genes contained in the sample, and as the mean
        fraction of genes present per co-located set. percentCompletion is
        subtracted from each estimate and the differences are drawn as a box
        plot saved under ./images/.

        Args:
            taxonomyStr: lineage string used to select genomes.
            ubiquityThreshold: multiplied by the number of genomes and passed
                as the third argument of img.markerGenes.
            singleCopyThreshold: multiplied by the number of genomes and
                passed as the fourth argument of img.markerGenes.
            percentCompletion: completion passed to img.sampleGenome and
                subtracted from each estimate (presumably a fraction in
                [0, 1] since it is compared with ratios -- confirm).
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome in the lineage.
            contigLen: contig length passed to img.sampleGenome and
                img.containedMarkerGenes.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) +
              ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable,
                                      ubiquityThreshold * len(genomeIds),
                                      singleCopyThreshold * len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample requires a sequence (sets are rejected since
            # Python 3.11); copying to a list is a no-op for sequences and
            # keeps this working if genomeIds is a set
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            genomeSize = metadata[genomeId]['genome size']
            mgCompletion = []
            msCompletion = []
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    genomeSize, percentCompletion, contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(
                    markerGenes, geneDistTable[genomeId],
                    startPartialGenomeContigs, contigLen)
                mgCompletion.append(
                    float(len(containedMarkerGenes)) / len(markerGenes) -
                    percentCompletion)

                # calculate completion with marker set: average, over all
                # co-located sets, of the fraction of each set's members that
                # are among the contained marker genes
                comp = 0.0
                for geneSet in colocatedSets:
                    present = sum(1 for geneId in geneSet
                                  if geneId in containedMarkerGenes)
                    comp += float(present) / len(geneSet)
                msCompletion.append(comp / len(colocatedSets) -
                                    percentCompletion)

            # two boxes per genome: marker genes first, marker sets second
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(
                metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # only the first box of each pair carries a label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
            singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
            contigLen):
        """Compare marker-set completion estimates across lineage ranks.

        Marker genes and co-located marker sets are built for every prefix
        of the ';'-separated lineage (e.g. 'a', 'a;b', 'a;b;c'). Genomes are
        taken from the last (most specific) prefix. For each selected genome,
        numReplicates partial genomes are drawn with img.sampleGenome and
        completion is estimated with each prefix's marker sets.
        percentCompletion is subtracted from each estimate and the
        differences are drawn as a box plot saved under ./images/, with one
        box per (genome, lineage prefix) pair.

        Args:
            taxonomyStr: ';'-separated lineage string.
            mostSpecificRank: not used by this method.
            minGenomes: not used by this method.
            ubiquityThreshold: multiplied by the number of genomes and passed
                as the third argument of img.markerGenes.
            singleCopyThreshold: multiplied by the number of genomes and
                passed as the fourth argument of img.markerGenes.
            percentCompletion: completion passed to img.sampleGenome and
                subtracted from each estimate (presumably a fraction in
                [0, 1] -- confirm).
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome of the most specific lineage.
            contigLen: contig length passed to img.sampleGenome and
                img.containedMarkerGenes.
        """
        img = IMG()

        # every prefix of the lineage, from the first rank to the full string
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[:r + 1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable,
                                 ubiquityThreshold * len(genomeIds),
                                 singleCopyThreshold * len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds is what the loop above left behind,
        # i.e. the genomes of the last (most specific) lineage
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample requires a sequence (sets are rejected since
            # Python 3.11); copying to a list is a no-op for sequences
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using the marker sets of every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            genomeSize = metadata[genomeId]['genome size']
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    genomeSize, percentCompletion, contigLen)

                # calculate completion with marker set
                for i, lineageSets in enumerate(colocatedSets):
                    containedMarkerGenes = img.containedMarkerGenes(
                        markerGenes[i], geneDistTable[i][genomeId],
                        startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for geneSet in lineageSets:
                        present = sum(1 for geneId in geneSet
                                      if geneId in containedMarkerGenes)
                        comp += float(present) / len(geneSet)

                    completion[i].append(comp / len(lineageSets) -
                                         percentCompletion)

            # exactly one label per data series; labelling inside the
            # replicate loop produced numReplicates labels per series and
            # left labels and data out of step
            for lineage, series in zip(lineages, completion):
                plotData.append(series)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker genes and marker sets.

        Marker genes and co-located marker sets are built for the lineage.
        For each selected genome, numReplicates partial genomes are drawn
        with img.sampleGenome and completion is estimated two ways: as the
        fraction of marker genes contained in the sample, and as the mean
        fraction of genes present per co-located set. percentCompletion is
        subtracted from each estimate and the differences are drawn as a box
        plot saved under ./images/.

        Args:
            taxonomyStr: lineage string used to select genomes.
            ubiquityThreshold: multiplied by the number of genomes and passed
                as the third argument of img.markerGenes.
            singleCopyThreshold: multiplied by the number of genomes and
                passed as the fourth argument of img.markerGenes.
            percentCompletion: completion passed to img.sampleGenome and
                subtracted from each estimate (presumably a fraction in
                [0, 1] -- confirm).
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome in the lineage.
            contigLen: contig length passed to img.sampleGenome and
                img.containedMarkerGenes.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        # single-argument print(...) behaves the same on Python 2 and 3
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample requires a sequence (sets are rejected since
            # Python 3.11); copying to a list is a no-op for sequences
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            genomeSize = metadata[genomeId]['genome size']
            mgCompletion = []
            msCompletion = []
            # range() instead of xrange() so the loop also runs on Python 3
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(genomeSize, percentCompletion, contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)

                # calculate completion with marker set: average, over all
                # co-located sets, of the fraction of each set's members that
                # are among the contained marker genes
                comp = 0.0
                for geneSet in colocatedSets:
                    present = sum(1 for geneId in geneSet if geneId in containedMarkerGenes)
                    comp += float(present) / len(geneSet)
                msCompletion.append(comp / len(colocatedSets) - percentCompletion)

            # two boxes per genome: marker genes first, marker sets second
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # only the first box of each pair carries a label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
Esempio n. 6
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            replicates, minGenomes, maxGenomes, stepSize):
        """Plot marker set size as a function of the number of genomes used.

        For each genome count in range(minGenomes, maxGenomes, stepSize),
        `replicates` random genome subsets of that size are drawn from the
        lineage. Marker genes and co-located marker sets are built for each
        subset and the number of co-located sets is recorded. The mean,
        standard deviation, minimum and maximum are printed per genome
        count, and an error-bar plot and a box plot are saved under
        ./images/.

        Args:
            taxonomyStr: lineage string used to select genomes.
            ubiquityThreshold: multiplied by the subset size and passed as
                the third argument of MarkerGenes.identify.
            singleCopyThreshold: multiplied by the subset size and passed as
                the fourth argument of MarkerGenes.identify.
            replicates: number of random subsets per genome count.
            minGenomes: smallest subset size; the process exits if the
                lineage has fewer genomes than this.
            maxGenomes: upper bound (exclusive) on subset size; -1 or any
                value above the number of genomes is clamped to the number
                of genomes.
            stepSize: increment between subset sizes.

        Raises:
            SystemExit: if the lineage contains fewer than minGenomes
                genomes.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        totalGenomes = len(genomeIds)

        print('Lineage ' + taxonomyStr + ' contains ' + str(totalGenomes) +
              ' genomes.')
        if totalGenomes < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            # non-zero status so callers can tell the run failed
            sys.exit(1)

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []

        # -1 means "use all genomes"; never ask for more than are available
        if maxGenomes == -1 or maxGenomes > totalGenomes:
            maxGenomes = totalGenomes

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        # random.sample requires a sequence (sets are rejected since
        # Python 3.11); copying to a list is a no-op for sequences
        genomePool = list(genomeIds)

        for numGenomes in range(minGenomes, maxGenomes, stepSize):
            # random.sample returns exactly numGenomes ids, so the cut-offs
            # depend only on numGenomes and can be computed once per size
            ubiquityCutoff = ubiquityThreshold * numGenomes
            singleCopyCutoff = singleCopyThreshold * numGenomes

            markerSetSize = []
            for _ in range(replicates):
                genomeIdSubset = random.sample(genomePool, numGenomes)

                markerGenes = markergenes.identify(
                    genomeIdSubset, countTable, ubiquityCutoff,
                    singleCopyCutoff)
                geneDistTable = img.geneDistTable(genomeIdSubset,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            meanMarkerSetSize.append(m)

            s = std(markerSetSize)
            stdMarkerSetSize.append(s)

            # the summary uses numGenomes rather than a variable leaked from
            # the replicate loop
            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' +
                  str(int(ubiquityCutoff)) +
                  ', Single-copy > ' +
                  str(int(singleCopyCutoff)))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' %
                  (min(markerSetSize), max(markerSetSize)))

        # plot data
        genomeCounts = arange(minGenomes, maxGenomes, stepSize)
        filePrefix = './images/markerset.' + taxonomyStr.replace(
            ';', '_') + '.' + str(ubiquityThreshold) + '-' + str(
                singleCopyThreshold)
        title = (taxonomyStr.replace(';', '; ') + '\n' +
                 'Ubiquity = %.2f' % ubiquityThreshold +
                 ', Single-copy = %.2f' % singleCopyThreshold)

        errorBar = ErrorBar()
        errorBar.plot(filePrefix + '.errorbar.png', genomeCounts,
                      meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes',
                      'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(filePrefix + '.boxplot.png', markerSetSizes,
                     genomeCounts,
                     'Number of Genomes', 'Marker Set Size', True, title)
Esempio n. 7
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
        """Plot marker set size as a function of the number of genomes used.

        For each genome count in range(minGenomes, maxGenomes, stepSize),
        `replicates` random genome subsets of that size are drawn from the
        lineage. Marker genes and co-located marker sets are built for each
        subset and the number of co-located sets is recorded. The mean,
        standard deviation, minimum and maximum are printed per genome
        count, and an error-bar plot and a box plot are saved under
        ./images/.

        Args:
            taxonomyStr: lineage string used to select genomes.
            ubiquityThreshold: multiplied by the subset size and passed as
                the third argument of MarkerGenes.identify.
            singleCopyThreshold: multiplied by the subset size and passed as
                the fourth argument of MarkerGenes.identify.
            replicates: number of random subsets per genome count.
            minGenomes: smallest subset size; the process exits if the
                lineage has fewer genomes than this.
            maxGenomes: upper bound (exclusive) on subset size; -1 or any
                value above the number of genomes is clamped to the number
                of genomes.
            stepSize: increment between subset sizes.

        Raises:
            SystemExit: if the lineage contains fewer than minGenomes
                genomes.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        totalGenomes = len(genomeIds)

        # single-argument print(...) behaves the same on Python 2 and 3
        print('Lineage ' + taxonomyStr + ' contains ' + str(totalGenomes) + ' genomes.')
        if totalGenomes < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            # non-zero status so callers can tell the run failed
            sys.exit(1)

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []

        # -1 means "use all genomes"; never ask for more than are available
        if maxGenomes == -1 or maxGenomes > totalGenomes:
            maxGenomes = totalGenomes

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        # random.sample requires a sequence (sets are rejected since
        # Python 3.11); copying to a list is a no-op for sequences
        genomePool = list(genomeIds)

        # range() instead of xrange() so the loops also run on Python 3
        for numGenomes in range(minGenomes, maxGenomes, stepSize):
            # random.sample returns exactly numGenomes ids, so the cut-offs
            # depend only on numGenomes and can be computed once per size
            ubiquityCutoff = ubiquityThreshold * numGenomes
            singleCopyCutoff = singleCopyThreshold * numGenomes

            markerSetSize = []
            for _ in range(replicates):
                genomeIdSubset = random.sample(genomePool, numGenomes)

                markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityCutoff, singleCopyCutoff)
                geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            meanMarkerSetSize.append(m)

            s = std(markerSetSize)
            stdMarkerSetSize.append(s)

            # the summary uses numGenomes rather than a variable leaked from
            # the replicate loop
            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityCutoff)) + ', Single-copy > ' + str(int(singleCopyCutoff)))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

        # plot data
        genomeCounts = arange(minGenomes, maxGenomes, stepSize)
        filePrefix = './images/markerset.' + taxonomyStr.replace(';', '_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold

        errorBar = ErrorBar()
        errorBar.plot(filePrefix + '.errorbar.png', genomeCounts, meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(filePrefix + '.boxplot.png', markerSetSizes, genomeCounts, 'Number of Genomes', 'Marker Set Size', True, title)
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare marker-set completion estimates across lineage ranks.

        Marker genes and co-located marker sets are built for every prefix
        of the ';'-separated lineage (e.g. 'a', 'a;b', 'a;b;c'). Genomes are
        taken from the last (most specific) prefix. For each selected genome,
        numReplicates partial genomes are drawn with img.sampleGenome and
        completion is estimated with each prefix's marker sets.
        percentCompletion is subtracted from each estimate and the
        differences are drawn as a box plot saved under ./images/, with one
        box per (genome, lineage prefix) pair.

        Args:
            taxonomyStr: ';'-separated lineage string.
            mostSpecificRank: not used by this method.
            minGenomes: not used by this method.
            ubiquityThreshold: multiplied by the number of genomes and passed
                as the third argument of img.markerGenes.
            singleCopyThreshold: multiplied by the number of genomes and
                passed as the fourth argument of img.markerGenes.
            percentCompletion: completion passed to img.sampleGenome and
                subtracted from each estimate (presumably a fraction in
                [0, 1] -- confirm).
            numReplicates: number of samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to use
                every genome of the most specific lineage.
            contigLen: contig length passed to img.sampleGenome and
                img.containedMarkerGenes.
        """
        img = IMG()

        # every prefix of the lineage, from the first rank to the full string
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[:r + 1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            # single-argument print(...) behaves the same on Python 2 and 3
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds is what the loop above left behind,
        # i.e. the genomes of the last (most specific) lineage
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample requires a sequence (sets are rejected since
            # Python 3.11); copying to a list is a no-op for sequences
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using the marker sets of every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            genomeSize = metadata[genomeId]['genome size']
            completion = [[] for _ in lineages]
            # range() instead of xrange() so the loop also runs on Python 3
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(genomeSize, percentCompletion, contigLen)

                # calculate completion with marker set
                for i, lineageSets in enumerate(colocatedSets):
                    containedMarkerGenes = img.containedMarkerGenes(markerGenes[i], geneDistTable[i][genomeId], startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for geneSet in lineageSets:
                        present = sum(1 for geneId in geneSet if geneId in containedMarkerGenes)
                        comp += float(present) / len(geneSet)

                    completion[i].append(comp / len(lineageSets) - percentCompletion)

            # exactly one label per data series; labelling inside the
            # replicate loop produced numReplicates labels per series and
            # left labels and data out of step
            for lineage, series in zip(lineages, completion):
                plotData.append(series)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)