def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
        """Report genomes that fail the completeness/contamination check of their lineages.

        For every lineage returned by IMG.lineagesByCriteria(minGenomes,
        mostSpecificRank), a marker set is built from the lineage's 'Final'
        genomes and each genome is scored with MarkerSet.genomeCheck. A genome
        is recorded when its completeness is below completenessThreshold or
        its contamination is above contaminationThreshold. The results are
        written to ./data/degenerate_genomes.tsv.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: multiplied by the number of
            genomes in the lineage before being passed to
            MarkerSet.markerGenes (presumably fractions in [0, 1] - confirm
            against MarkerSet).
          minGenomes, mostSpecificRank: forwarded to IMG.lineagesByCriteria.
          minMarkers: lineages with fewer marker genes than this are skipped.
          completenessThreshold: genomes scoring below this are reported.
          contaminationThreshold: genomes scoring above this are reported.

        Returns nothing; output is the TSV file plus progress on stdout.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))
        print('Min markers: ' + str(minMarkers))
        print('Completeness threshold: ' + str(completenessThreshold))
        print('Contamination threshold: ' + str(contaminationThreshold))

        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # genome id -> one [lineage name, # genomes, # colocated sets,
        # completeness, contamination] record per lineage that flagged it
        degenerateGenomes = {}
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            numGenomes = len(genomeIds)

            print('')
            print('Lineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)
            if len(markerGenes) < minMarkers:
                continue

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

            # only the last taxonomic label of the lineage string is reported
            lineageName = lineage.split(';')[-1].strip()
            numColocatedSets = len(colocatedSets)

            for genomeId in genomeIds:
                completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                if completeness < completenessThreshold or contamination > contaminationThreshold:
                    # append in place instead of rebuilding the list on every hit
                    degenerateGenomes.setdefault(genomeId, []).append([lineageName, numGenomes, numColocatedSets, completeness, contamination])

        # write out degenerate genomes
        metadata = img.genomeMetadata('Final')

        # the context manager closes the file even if a metadata lookup or
        # write fails part-way through the report
        with open('./data/degenerate_genomes.tsv', 'w') as fout:
            # NOTE(review): the header labels genome size as Gbps, but the value
            # written below is 'genome size' / 1e6 - confirm the intended unit.
            fout.write('Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n')
            for genomeId, data in degenerateGenomes.iteritems():
                genomeMetadata = metadata[genomeId]
                fout.write(genomeId + '\t' + '; '.join(genomeMetadata['taxonomy']) + '\t%.2f' % (float(genomeMetadata['genome size']) / 1e6) + '\t' + str(genomeMetadata['scaffold count']))
                fout.write('\t' + genomeMetadata['biotic relationships'] + '\t' + genomeMetadata['status'])

                # a genome flagged in several lineages gets one extra group of
                # five columns per lineage, so rows can be wider than the header
                for d in data:
                    fout.write('\t' + d[0] + '\t' + str(d[1]) + '\t' + str(d[2]) + '\t%.3f\t%.3f' % (d[3], d[4]))
                fout.write('\n')
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Measure how much each lineage's marker set changes when genomes are withheld.

        For every lineage with at least minGenomes 'Final' genomes and at
        least minMarkers marker genes, numReplicates random subsets of the
        genomes are drawn, the marker set is recomputed for each subset, and
        the size of its symmetric difference with the full marker set is
        recorded. Mean and standard deviation of those sizes are printed and
        written to ./data/lineage_evaluation.tsv.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: multiplied by the number of
            genomes before being passed to IMG.markerGenes (presumably
            fractions in [0, 1] - confirm against IMG).
          minGenomes, mostSpecificRank: forwarded to IMG.lineagesByCriteria;
            minGenomes is also re-checked against the 'Final' genome count.
          minMarkers: lineages with fewer marker genes than this are skipped.
          percentGenomes: used as a fraction; each replicate keeps
            int((1.0 - percentGenomes) * N + 0.5) of the N genomes.
          numReplicates: number of random subsets drawn per lineage.

        Returns nothing; output is the TSV file plus progress on stdout.
        """
        img = IMG()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # the context manager closes the report even if a lineage computation
        # raises part-way through
        with open('./data/lineage_evaluation.tsv', 'w') as fout:
            fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

            for lineage in lineages:
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                numGenomes = len(genomeIds)
                if numGenomes < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

                # calculate marker set for all genomes
                markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)
                numMarkers = len(markerGenes)
                if numMarkers < minMarkers:
                    continue

                print('\nLineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')
                print('  Marker genes: ' + str(numMarkers))

                fout.write(lineage + '\t' + str(numGenomes) + '\t' + str(numMarkers) + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

                # withhold select percentage of genomes and calculate new marker set;
                # the subset size does not change between replicates, so compute it once
                subsetSize = int((1.0-percentGenomes)*numGenomes + 0.5)
                changeMarkerSetSize = []
                for _ in xrange(0, numReplicates):
                    subsetGenomeIds = random.sample(genomeIds, subsetSize)

                    newMarkerGenes = img.markerGenes(subsetGenomeIds, countTable, ubiquityThreshold*len(subsetGenomeIds), singleCopyThreshold*len(subsetGenomeIds))

                    changeMarkerSetSize.append(len(newMarkerGenes.symmetric_difference(markerGenes)))

                m = mean(changeMarkerSetSize)
                s = std(changeMarkerSetSize)

                # percentages are relative to the size of the full marker set
                print('  Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, (m + 2*s) * 100 / numMarkers))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s, m * 100 / numMarkers, (m + s) * 100 / numMarkers, (m + 2*s) * 100 / numMarkers) + '\n')
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers, completenessThreshold,
            contaminationThreshold):
        """Write a table of genomes that fail their lineage's marker-set check.

        Each lineage from IMG.lineagesByCriteria(minGenomes, mostSpecificRank)
        gets a marker set built from its 'Final' genomes; lineages with fewer
        than minMarkers marker genes are skipped. Every genome of a kept
        lineage is scored with MarkerSet.genomeCheck and recorded when
        completeness < completenessThreshold or
        contamination > contaminationThreshold.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: scaled by the lineage's
            genome count before being handed to MarkerSet.markerGenes
            (presumably fractions in [0, 1] - confirm against MarkerSet).
          minGenomes, mostSpecificRank: lineage selection criteria passed to
            IMG.lineagesByCriteria.
          minMarkers: minimum number of marker genes a lineage must have.
          completenessThreshold, contaminationThreshold: cut-offs deciding
            which genomes are reported.

        Side effects: prints progress to stdout and overwrites
        ./data/degenerate_genomes.tsv. Returns nothing.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))
        print('Min markers: ' + str(minMarkers))
        print('Completeness threshold: ' + str(completenessThreshold))
        print('Contamination threshold: ' + str(contaminationThreshold))

        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # maps a genome id to a list of per-lineage records:
        # [lineage name, # genomes, # colocated sets, completeness, contamination]
        degenerateGenomes = {}
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            genomeCount = len(genomeIds)

            print('')
            print('Lineage ' + lineage + ' contains ' + str(genomeCount) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable,
                                         ubiquityThreshold * 0.9,
                                         singleCopyThreshold * 0.9)

            markerGenes = markerset.markerGenes(
                genomeIds, countTable, ubiquityThreshold * genomeCount,
                singleCopyThreshold * genomeCount)
            if len(markerGenes) < minMarkers:
                continue

            geneDistTable = img.geneDistTable(genomeIds,
                                              markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable)
            colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                    markerGenes)

            # these values are identical for every genome of the lineage
            shortLineage = lineage.split(';')[-1].strip()
            setCount = len(colocatedSets)

            for genomeId in genomeIds:
                completeness, contamination = markerset.genomeCheck(
                    colocatedSets, genomeId, countTable)

                if completeness < completenessThreshold or contamination > contaminationThreshold:
                    # extend the existing record list rather than copying it
                    degenerateGenomes.setdefault(genomeId, []).append([
                        shortLineage, genomeCount, setCount, completeness,
                        contamination
                    ])

        # write out degenerate genomes
        metadata = img.genomeMetadata('Final')

        # 'with' guarantees the handle is released if a lookup or write fails
        with open('./data/degenerate_genomes.tsv', 'w') as fout:
            # NOTE(review): header says Gbps while the value written is
            # 'genome size' / 1e6 - confirm which unit is intended.
            fout.write(
                'Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n'
            )
            for genomeId, data in degenerateGenomes.iteritems():
                info = metadata[genomeId]
                fout.write(genomeId + '\t' + '; '.join(info['taxonomy']) +
                           '\t%.2f' % (float(info['genome size']) / 1e6) +
                           '\t' + str(info['scaffold count']))
                fout.write('\t' + info['biotic relationships'] + '\t' +
                           info['status'])

                # one group of five columns per lineage that flagged the
                # genome, so a row may extend past the header columns
                for d in data:
                    fout.write('\t' + d[0] + '\t' + str(d[1]) + '\t' +
                               str(d[2]) + '\t%.3f\t%.3f' % (d[3], d[4]))
                fout.write('\n')
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Evaluate marker-set stability per lineage under random genome withholding.

        Lineages come from IMG.lineagesByCriteria(minGenomes,
        mostSpecificRank). A lineage is evaluated only if it has at least
        minGenomes 'Final' genomes and at least minMarkers marker genes. For
        each evaluated lineage, numReplicates random genome subsets are
        sampled, the marker set is recomputed on each, and the size of the
        symmetric difference with the full marker set is collected.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: scaled by the number of
            genomes before being passed to IMG.markerGenes (presumably
            fractions in [0, 1] - confirm against IMG).
          minGenomes, mostSpecificRank: lineage selection criteria.
          minMarkers: minimum marker-gene count for a lineage to be reported.
          percentGenomes: fraction of genomes withheld; each subset holds
            int((1.0 - percentGenomes) * N + 0.5) of the N genomes.
          numReplicates: number of subsets sampled per lineage.

        Side effects: prints progress to stdout and overwrites
        ./data/lineage_evaluation.tsv. Returns nothing.
        """
        img = IMG()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # 'with' ensures the output file is closed even when a lineage
        # computation fails before the loop finishes
        with open('./data/lineage_evaluation.tsv', 'w') as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n'
            )

            for lineage in lineages:
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                genomeCount = len(genomeIds)
                if genomeCount < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                countTable = img.filterTable(genomeIds, countTable,
                                             ubiquityThreshold * 0.9,
                                             singleCopyThreshold * 0.9)

                # calculate marker set for all genomes
                markerGenes = img.markerGenes(
                    genomeIds, countTable, ubiquityThreshold * genomeCount,
                    singleCopyThreshold * genomeCount)
                markerCount = len(markerGenes)
                if markerCount < minMarkers:
                    continue

                print('\nLineage ' + lineage + ' contains ' +
                      str(genomeCount) + ' genomes.')
                print('  Marker genes: ' + str(markerCount))

                fout.write(lineage + '\t' + str(genomeCount) + '\t' +
                           str(markerCount) + '\t%.2f' % percentGenomes +
                           '\t' + str(numReplicates))

                # withhold select percentage of genomes and calculate new marker set;
                # the sample size is the same for every replicate
                sampleSize = int((1.0 - percentGenomes) * genomeCount + 0.5)
                changeMarkerSetSize = []
                for _ in xrange(0, numReplicates):
                    subsetGenomeIds = random.sample(genomeIds, sampleSize)

                    newMarkerGenes = img.markerGenes(
                        subsetGenomeIds, countTable,
                        ubiquityThreshold * len(subsetGenomeIds),
                        singleCopyThreshold * len(subsetGenomeIds))

                    changeMarkerSetSize.append(
                        len(newMarkerGenes.symmetric_difference(markerGenes)))

                m = mean(changeMarkerSetSize)
                s = std(changeMarkerSetSize)

                # percentages are expressed relative to the full marker set size
                print('  Mean: %.2f, Std: %.2f, Per: %.2f' %
                      (m, s, (m + 2 * s) * 100 / markerCount))
                fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' %
                           (m, s, m * 100 / markerCount,
                            (m + s) * 100 / markerCount,
                            (m + 2 * s) * 100 / markerCount) + '\n')
Example #5
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers):
        """Leave-one-out analysis of marker set size for each lineage.

        For 'prokaryotes' plus every lineage returned by
        IMG.lineagesByCriteria(minGenomes, mostSpecificRank), the trusted
        genomes of the lineage are used to build a marker set. Each genome is
        then removed in turn, the marker set is recomputed, and the change in
        marker set size is recorded. Per-lineage statistics are printed and
        all deltas are drawn as a box plot under ./images/.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: scaled by a genome count
            before being passed to IMG.markerGenes (presumably fractions in
            [0, 1] - confirm against IMG); also embedded in the plot file
            name and title.
          minGenomes, mostSpecificRank: lineage selection criteria.
          minMarkers: lineages whose full marker set is smaller are skipped.

        Returns nothing.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
        lineages = ['prokaryotes'] + lineages

        # the trusted genome set takes no per-lineage input, so fetch it once
        # instead of on every iteration
        trusted = img.trustedGenomes()

        boxPlotLabels = []
        for lineage in lineages:
            genomeIds = list(
                img.genomeIdsByTaxonomy(lineage).intersection(trusted))
            numLooGenomes = len(genomeIds) - 1

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                            ubiquityThreshold * 0.9,
                                            singleCopyThreshold * 0.9)

            # thresholds are scaled by N-1 so the full set uses the same
            # absolute cut-offs as the leave-one-out subsets below
            fullMarkerSet = img.markerGenes(
                genomeIds, pfamTable, ubiquityThreshold * numLooGenomes,
                singleCopyThreshold * numLooGenomes)
            fullMarkerSetSize = len(fullMarkerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            boxPlotLabels.append(
                lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) +
                ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            for loo, _ in enumerate(genomeIds):
                # slicing past the end yields an empty list, so dropping the
                # last genome needs no special case
                genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

                looMarkerSet = img.markerGenes(
                    genomeIdSubset, pfamTable,
                    ubiquityThreshold * len(genomeIdSubset),
                    singleCopyThreshold * len(genomeIdSubset))
                deltaMarkerSetSize.append(fullMarkerSetSize -
                                          len(looMarkerSet))

                if fullMarkerSetSize < len(looMarkerSet):
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' +
                  str(int(ubiquityThreshold * numLooGenomes)) +
                  ', LOO Single-copy >= ' +
                  str(int(singleCopyThreshold * numLooGenomes)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' %
                  (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(
            singleCopyThreshold) + '.boxplot.png'
        title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                     r'$\Delta$' + ' Marker Set Size', '', False, title)
Example #6
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
        """Run a leave-one-out test of marker set size for each lineage and plot it.

        The lineages examined are 'prokaryotes' followed by those returned by
        IMG.lineagesByCriteria(minGenomes, mostSpecificRank). Only trusted
        genomes of each lineage are used. A full marker set is computed, then
        recomputed with each genome left out in turn; the difference in size
        is collected, summarised on stdout, and drawn as a box plot in
        ./images/.

        Parameters:
          ubiquityThreshold, singleCopyThreshold: scaled by a genome count
            before being passed to IMG.markerGenes (presumably fractions in
            [0, 1] - confirm against IMG); also used in the plot file name
            and title.
          minGenomes, mostSpecificRank: lineage selection criteria.
          minMarkers: lineages whose full marker set is smaller are skipped.

        Returns nothing.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
        lineages = ['prokaryotes'] + lineages

        # fetched once: the call takes no lineage-specific arguments
        trusted = img.trustedGenomes()

        boxPlotLabels = []
        for lineage in lineages:
            genomeIds = list(img.genomeIdsByTaxonomy(lineage).intersection(trusted))
            numGenomes = len(genomeIds)-1

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            # scaled by N-1 so the full set is judged with the same absolute
            # cut-offs as the leave-one-out subsets
            fullMarkerSet = img.markerGenes(genomeIds, pfamTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)
            fullMarkerSetSize = len(fullMarkerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            boxPlotLabels.append(lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            for loo, _ in enumerate(genomeIds):
                # genomeIds[loo+1:] is empty for the last index, so a single
                # expression covers every position
                genomeIdSubset = genomeIds[:loo] + genomeIds[loo+1:]

                looMarkerSet = img.markerGenes(genomeIdSubset, pfamTable, ubiquityThreshold*len(genomeIdSubset), singleCopyThreshold*len(genomeIdSubset))
                deltaMarkerSetSize.append(fullMarkerSetSize - len(looMarkerSet))

                if fullMarkerSetSize < len(looMarkerSet):
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityThreshold*numGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold*numGenomes)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
        title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
        boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)