コード例 #1
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare marker-gene and marker-set completion estimates for a lineage.

        Marker genes and co-located marker sets are built from every 'Final'
        genome in the lineage. Each selected genome is then repeatedly
        sub-sampled via img.sampleGenome(), and for every replicate the
        difference between the estimated completion and percentCompletion
        is recorded for both estimators. The per-genome results are handed
        to BoxPlot.plot().

        Args:
            taxonomyStr: semicolon-separated taxonomy string of the lineage.
            ubiquityThreshold: multiplied by the number of lineage genomes
                and passed to img.markerGenes().
            singleCopyThreshold: multiplied by the number of lineage genomes
                and passed to img.markerGenes().
            percentCompletion: target completion passed to
                img.sampleGenome(); it is subtracted from the estimated
                fractions, so it is presumably a fraction as well.
            numReplicates: number of sub-samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to
                use every genome in the lineage.
            contigLen: contig length passed to img.sampleGenome() and
                img.containedMarkerGenes().
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print(''.join(['\nLineage ', taxonomyStr, ' contains ',
                       str(len(genomeIds)), ' genomes.']))

        # derive the marker genes, then group them into co-located sets
        countTable = img.countTable(genomeIds)
        minUbiquity = ubiquityThreshold * len(genomeIds)
        minSingleCopy = singleCopyThreshold * len(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, minUbiquity,
                                      minSingleCopy)
        print('  Marker genes: %d' % len(markerGenes))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                          spacingBetweenContigs=1e6)
        colocatedSets = img.colocatedSets(img.colocatedGenes(geneDistTable),
                                          markerGenes)
        print('  Co-located gene sets: %d' % len(colocatedSets))

        # choose the genomes to simulate (-1 selects the whole lineage)
        # NOTE(review): random.sample() needs a sequence on Python 3.11+;
        # confirm genomeIdsByTaxonomy() does not return a set.
        rndGenomeIds = (genomeIds if numGenomes == -1 else
                        random.sample(genomeIds, numGenomes))

        # per genome: one series of marker-gene deltas followed by one
        # series of marker-set deltas
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            geneDeltas = []
            setDeltas = []
            for _ in range(numReplicates):
                contigStarts = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # marker-gene estimate: fraction of all marker genes found
                found = img.containedMarkerGenes(
                    markerGenes, geneDistTable[genomeId], contigStarts,
                    contigLen)
                geneFraction = float(len(found)) / len(markerGenes)
                geneDeltas.append(geneFraction - percentCompletion)

                # marker-set estimate: mean over the co-located sets of the
                # fraction of each set's members that were found
                setTotal = 0.0
                for geneSet in colocatedSets:
                    numPresent = sum(1 for member in geneSet
                                     if member in found)
                    setTotal += float(numPresent) / len(geneSet)
                setDeltas.append(setTotal / len(colocatedSets) -
                                 percentCompletion)

            plotData.extend([geneDeltas, setDeltas])

            # label the first series with the taxonomy from genus downwards;
            # the second series of the pair gets an empty label
            lowerRanks = metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:]
            plotLabels.extend([' '.join(lowerRanks) + ' (' + genomeId + ')',
                               ''])

        # plot data
        boxPlot = BoxPlot()
        lineageTag = taxonomyStr.replace(';', '_')
        plotFilename = ('./images/sim.MGvsMS.' + lineageTag + '.' +
                        str(percentCompletion) + '.errorbar.png')
        title = '\n'.join([taxonomyStr.replace(';', '; '),
                           'Percent completion = %.2f' % percentCompletion])
        yLabel = r'$\Delta$' + ' Percent Completion'
        boxPlot.plot(plotFilename, plotData, plotLabels, yLabel, '', False,
                     title)
コード例 #2
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare marker-gene and marker-set completion estimates for a lineage.

        Marker genes and co-located marker sets are built from every 'Final'
        genome in the lineage. Each selected genome is sub-sampled
        numReplicates times via img.sampleGenome(); for every replicate the
        estimated completion minus percentCompletion is recorded for both
        estimators and the results are passed to BoxPlot.plot().

        Written so that it parses and behaves the same on Python 2 and 3
        (single-argument print() calls, range instead of xrange).

        Args:
            taxonomyStr: semicolon-separated taxonomy string of the lineage.
            ubiquityThreshold: multiplied by the number of lineage genomes
                and passed to img.markerGenes().
            singleCopyThreshold: multiplied by the number of lineage genomes
                and passed to img.markerGenes().
            percentCompletion: target completion passed to
                img.sampleGenome(); presumably a fraction, since it is
                subtracted from the estimated fractions.
            numReplicates: number of sub-samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to
                use every genome in the lineage.
            contigLen: contig length passed to img.sampleGenome() and
                img.containedMarkerGenes().
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes (-1 selects every genome in the lineage)
        # NOTE(review): random.sample() needs a sequence on Python 3.11+;
        # confirm genomeIdsByTaxonomy() does not return a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(0, numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker genes: fraction of all
                # marker genes found in the sampled contigs
                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)

                # calculate completion with marker set: mean over the
                # co-located sets of the fraction of each set that was found
                comp = 0.0
                for cs in colocatedSets:
                    present = 0
                    for member in cs:
                        if member in containedMarkerGenes:
                            present += 1

                    comp += float(present) / len(cs)
                msCompletion.append(comp / len(colocatedSets) - percentCompletion)

            # two series per genome: marker genes first, then marker sets
            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # only the first series of each pair carries a label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
コード例 #3
0
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
            singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
            contigLen):
        """Compare marker-set completion estimates across nested lineages.

        A marker set is built for every prefix of taxonomyStr (for
        'a;b;c' these are 'a', 'a;b' and 'a;b;c'). Genomes taken from the
        last, most specific lineage are sub-sampled numReplicates times via
        img.sampleGenome(); for each replicate and each lineage the
        marker-set completion estimate minus percentCompletion is recorded.
        One data series and one label per (genome, lineage) pair are passed
        to BoxPlot.plot().

        Args:
            taxonomyStr: semicolon-separated taxonomy string.
            mostSpecificRank: accepted for interface compatibility; not
                used by this method.
            minGenomes: accepted for interface compatibility; not used by
                this method.
            ubiquityThreshold: multiplied by the number of genomes in a
                lineage and passed to img.markerGenes().
            singleCopyThreshold: multiplied by the number of genomes in a
                lineage and passed to img.markerGenes().
            percentCompletion: target completion passed to
                img.sampleGenome(); presumably a fraction, since it is
                subtracted from the estimated fractions.
            numReplicates: number of sub-samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to
                use every genome of the most specific lineage.
            contigLen: contig length passed to img.sampleGenome() and
                img.containedMarkerGenes().
        """
        img = IMG()

        # one lineage per taxonomic prefix, from least to most specific
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[:r + 1]) for r in range(len(taxon))]

        # get all marker sets (lists below are parallel to `lineages`)
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable,
                                 ubiquityThreshold * len(genomeIds),
                                 singleCopyThreshold * len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still refers to the last (most
        # specific) lineage processed above
        # NOTE(review): random.sample() needs a sequence on Python 3.11+;
        # confirm genomeIdsByTaxonomy() does not return a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using the marker set of each lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with the marker set of every lineage
                # for the same simulated partial genome
                perLineage = zip(markerGenes, geneDistTable, colocatedSets)
                for i, (lineageGenes, lineageDist, lineageSets) in enumerate(perLineage):
                    containedMarkerGenes = img.containedMarkerGenes(
                        lineageGenes, lineageDist[genomeId],
                        startPartialGenomeContigs, contigLen)

                    # mean over the co-located sets of the fraction of
                    # each set that was found
                    comp = 0.0
                    for geneSet in lineageSets:
                        present = 0
                        for member in geneSet:
                            if member in containedMarkerGenes:
                                present += 1

                        comp += float(present) / len(geneSet)

                    completion[i].append(comp / len(lineageSets) -
                                         percentCompletion)

            # Emit exactly one label per data series. Labelling inside the
            # replicate loop would yield numReplicates labels per series and
            # leave plotLabels out of step with plotData.
            for lineage, deltas in zip(lineages, completion):
                plotData.append(deltas)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
コード例 #4
0
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare marker-set completion estimates across nested lineages.

        A marker set is built for every prefix of taxonomyStr (for
        'a;b;c' these are 'a', 'a;b' and 'a;b;c'). Genomes taken from the
        last, most specific lineage are sub-sampled numReplicates times via
        img.sampleGenome(); for each replicate and each lineage the
        marker-set completion estimate minus percentCompletion is recorded.
        One data series and one label per (genome, lineage) pair are passed
        to BoxPlot.plot().

        Written so that it parses and behaves the same on Python 2 and 3
        (single-argument print() calls, range instead of xrange).

        Args:
            taxonomyStr: semicolon-separated taxonomy string.
            mostSpecificRank: accepted for interface compatibility; not
                used by this method.
            minGenomes: accepted for interface compatibility; not used by
                this method.
            ubiquityThreshold: multiplied by the number of genomes in a
                lineage and passed to img.markerGenes().
            singleCopyThreshold: multiplied by the number of genomes in a
                lineage and passed to img.markerGenes().
            percentCompletion: target completion passed to
                img.sampleGenome(); presumably a fraction, since it is
                subtracted from the estimated fractions.
            numReplicates: number of sub-samples drawn per genome.
            numGenomes: number of genomes to sample at random, or -1 to
                use every genome of the most specific lineage.
            contigLen: contig length passed to img.sampleGenome() and
                img.containedMarkerGenes().
        """
        img = IMG()

        # one lineage per taxonomic prefix, from least to most specific
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r+1]) for r in range(len(taxon))]

        # get all marker sets (lists below are parallel to `lineages`)
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still refers to the last (most
        # specific) lineage processed above
        # NOTE(review): random.sample() needs a sequence on Python 3.11+;
        # confirm genomeIdsByTaxonomy() does not return a set.
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            rndGenomeIds = random.sample(genomeIds, numGenomes)

        # estimate completion for each genome using the marker set of each lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(0, numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with the marker set of every lineage
                # for the same simulated partial genome
                for i in range(len(lineages)):
                    containedMarkerGenes = img.containedMarkerGenes(markerGenes[i], geneDistTable[i][genomeId], startPartialGenomeContigs, contigLen)

                    # mean over the co-located sets of the fraction of
                    # each set that was found
                    comp = 0.0
                    for geneSet in colocatedSets[i]:
                        present = 0
                        for member in geneSet:
                            if member in containedMarkerGenes:
                                present += 1

                        comp += float(present) / len(geneSet)

                    completion[i].append(comp / len(colocatedSets[i]) - percentCompletion)

            # Emit exactly one label per data series. Labelling inside the
            # replicate loop would yield numReplicates labels per series and
            # leave plotLabels out of step with plotData.
            for lineage, deltas in zip(lineages, completion):
                plotData.append(deltas)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)