コード例 #1
0
ファイル: markerSetSizes.py プロジェクト: HadrienG/CheckM
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes,
            mostSpecificRanks):
        """Write a table of marker set sizes per lineage and threshold.

        For every lineage returned by ``img.lineagesSorted`` that has at
        least ``minGenomes`` trusted genomes, the marker set is recomputed
        at each threshold of the sweep. Its size is written to
        ``./data/markerSetSize.tsv`` (one row per lineage, one column per
        threshold) and echoed to stdout.

        Args:
            minThreshold: Stop value of the threshold sweep (exclusive,
                assuming ``arange`` is numpy's).
            maxThreshold: First threshold of the sweep.
            stepSize: Amount subtracted between consecutive thresholds.
            minGenomes: Lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: Forwarded unchanged to ``img.lineagesSorted``.
        """
        img = IMG()
        trustedGenomeIds = img.trustedGenomes()

        # The sweep is the same for the header and for every lineage, so it
        # is built once instead of once per lineage.
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        # The context manager guarantees the file is flushed and closed even
        # if one of the IMG queries below raises part-way through.
        with open('./data/markerSetSize.tsv', 'w') as fout:
            fout.write('Lineage\t# genomes')
            for threshold in thresholds:
                fout.write('\t' + str(threshold))
            fout.write('\n')

            for lineage in img.lineagesSorted(mostSpecificRanks):
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))
                numGenomes = len(genomeIds)

                if numGenomes < minGenomes:
                    continue

                print('\nLineage ' + lineage + ' contains ' +
                      str(numGenomes) + ' genomes.')
                fout.write(lineage + '\t' + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # The same genome-count cutoff is used for both
                    # threshold arguments of markerGenes.
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable,
                                                cutoff, cutoff)
                    fout.write('\t' + str(len(markerSet)))
                    print('  Threshold = %.2f, marker set size = %d' %
                          (threshold, len(markerSet)))
                fout.write('\n')
コード例 #2
0
ファイル: markerSetSizes.py プロジェクト: IUEayhu/CheckM
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
        """Write a table of marker set sizes per lineage and threshold.

        For every lineage returned by ``img.lineagesSorted`` that has at
        least ``minGenomes`` trusted genomes, the marker set is recomputed
        at each threshold of the sweep. Its size is written to
        ``./data/markerSetSize.tsv`` (one row per lineage, one column per
        threshold) and echoed to stdout.

        Args:
            minThreshold: Stop value of the threshold sweep (exclusive,
                assuming ``arange`` is numpy's).
            maxThreshold: First threshold of the sweep.
            stepSize: Amount subtracted between consecutive thresholds.
            minGenomes: Lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: Forwarded unchanged to ``img.lineagesSorted``.
        """
        img = IMG()
        trustedGenomeIds = img.trustedGenomes()

        # The sweep is the same for the header and for every lineage, so it
        # is built once instead of once per lineage.
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        # The context manager guarantees the file is flushed and closed even
        # if one of the IMG queries below raises part-way through.
        with open("./data/markerSetSize.tsv", "w") as fout:
            fout.write("Lineage\t# genomes")
            for threshold in thresholds:
                fout.write("\t" + str(threshold))
            fout.write("\n")

            for lineage in img.lineagesSorted(mostSpecificRanks):
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))
                numGenomes = len(genomeIds)

                if numGenomes < minGenomes:
                    continue

                # Single-argument print(...) emits the same text whether it
                # is parsed as a Python 2 statement or a Python 3 call.
                print("\nLineage " + lineage + " contains " + str(numGenomes) + " genomes.")
                fout.write(lineage + "\t" + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # The same genome-count cutoff is used for both
                    # threshold arguments of markerGenes.
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)
                    fout.write("\t" + str(len(markerSet)))
                    print("  Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))
                fout.write("\n")
コード例 #3
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers):
        """Run a leave-one-out analysis of marker set size per lineage.

        For 'prokaryotes' plus every lineage from ``img.lineagesByCriteria``,
        a reference marker set is computed from all trusted genomes, then
        recomputed with each genome left out in turn. The per-genome drop in
        marker set size is summarised on stdout and all lineages are drawn
        as a box plot saved under ``./images/``.

        Args:
            ubiquityThreshold: Fraction scaled by the genome count and passed
                as the third argument of ``img.markerGenes``.
            singleCopyThreshold: Fraction scaled by the genome count and
                passed as the fourth argument of ``img.markerGenes``.
            minGenomes: Forwarded to ``img.lineagesByCriteria``.
            mostSpecificRank: Forwarded to ``img.lineagesByCriteria``.
            minMarkers: Lineages whose reference marker set is smaller than
                this are skipped.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
        lineages = ['prokaryotes'] + lineages

        # The trusted genome set does not depend on the lineage, so it is
        # fetched once rather than on every iteration.
        # NOTE(review): assumes trustedGenomes() returns the same set on
        # every call - confirm against IMG.
        trusted = img.trustedGenomes()

        boxPlotLabels = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trusted))

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove
            # PFAMs that are clearly not going to pass the ubiquity and
            # single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                            ubiquityThreshold * 0.9,
                                            singleCopyThreshold * 0.9)

            # Every leave-one-out subset has exactly this many genomes. The
            # reference set uses the same count so both are judged against
            # identical cutoffs.
            numGenomes = len(genomeIds) - 1
            ubiquityCutoff = ubiquityThreshold * numGenomes
            singleCopyCutoff = singleCopyThreshold * numGenomes

            markerSet = img.markerGenes(genomeIds, pfamTable,
                                        ubiquityCutoff, singleCopyCutoff)
            fullMarkerSetSize = len(markerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            boxPlotLabels.append(
                lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) +
                ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            for loo in range(len(genomeIds)):
                # Slicing past the end yields an empty list, so the last
                # index needs no special case.
                genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

                markerSet = img.markerGenes(genomeIdSubset, pfamTable,
                                            ubiquityCutoff, singleCopyCutoff)
                deltaMarkerSetSize.append(fullMarkerSetSize - len(markerSet))

                if fullMarkerSetSize < len(markerSet):
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityCutoff)) +
                  ', LOO Single-copy >= ' + str(int(singleCopyCutoff)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' %
                  (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(
            singleCopyThreshold) + '.boxplot.png'
        title = 'Ubiquity = %.2f, Single-copy = %.2f' % (ubiquityThreshold,
                                                         singleCopyThreshold)
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                     r'$\Delta$' + ' Marker Set Size', '', False, title)
コード例 #4
0
ファイル: markerSetLOO.py プロジェクト: Ecogenomics/CheckM
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
        """Run a leave-one-out analysis of marker set size per lineage.

        For 'prokaryotes' plus every lineage from ``img.lineagesByCriteria``,
        a reference marker set is computed from all trusted genomes, then
        recomputed with each genome left out in turn. The per-genome drop in
        marker set size is summarised on stdout and all lineages are drawn
        as a box plot saved under ``./images/``.

        Args:
            ubiquityThreshold: Fraction scaled by the genome count and passed
                as the third argument of ``img.markerGenes``.
            singleCopyThreshold: Fraction scaled by the genome count and
                passed as the fourth argument of ``img.markerGenes``.
            minGenomes: Forwarded to ``img.lineagesByCriteria``.
            mostSpecificRank: Forwarded to ``img.lineagesByCriteria``.
            minMarkers: Lineages whose reference marker set is smaller than
                this are skipped.
        """
        # Single-argument print(...) emits the same text whether it is
        # parsed as a Python 2 statement or a Python 3 call.
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
        lineages = ['prokaryotes'] + lineages

        # The trusted genome set does not depend on the lineage, so it is
        # fetched once rather than on every iteration.
        # NOTE(review): assumes trustedGenomes() returns the same set on
        # every call - confirm against IMG.
        trusted = img.trustedGenomes()

        boxPlotLabels = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trusted))

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            # Every leave-one-out subset has exactly this many genomes. The
            # reference set uses the same count so both are judged against
            # identical cutoffs.
            numGenomes = len(genomeIds)-1
            ubiquityCutoff = ubiquityThreshold*numGenomes
            singleCopyCutoff = singleCopyThreshold*numGenomes

            markerSet = img.markerGenes(genomeIds, pfamTable, ubiquityCutoff, singleCopyCutoff)
            fullMarkerSetSize = len(markerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            boxPlotLabels.append(lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            # range() rather than xrange() so the loop also runs on Python 3.
            for loo in range(len(genomeIds)):
                # Slicing past the end yields an empty list, so the last
                # index needs no special case.
                genomeIdSubset = genomeIds[:loo] + genomeIds[loo+1:]

                markerSet = img.markerGenes(genomeIdSubset, pfamTable, ubiquityCutoff, singleCopyCutoff)
                deltaMarkerSetSize.append(fullMarkerSetSize - len(markerSet))

                if fullMarkerSetSize < len(markerSet):
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityCutoff)) + ', LOO Single-copy >= ' + str(int(singleCopyCutoff)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
        title = 'Ubiquity = %.2f, Single-copy = %.2f' % (ubiquityThreshold, singleCopyThreshold)
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)