Example #1
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
        """Tabulate marker set size over a sweep of thresholds for each lineage.

        Writes one row per lineage to ./data/markerSetSize.tsv containing the
        lineage, its number of trusted genomes, and the marker set size found
        at each threshold. Progress is echoed to stdout.

        Args:
            minThreshold: Stop value handed to arange for the threshold sweep.
            maxThreshold: Start value handed to arange for the threshold sweep.
            stepSize: Step magnitude; it is negated so the sweep runs from
                maxThreshold towards minThreshold.
            minGenomes: Lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: Forwarded unchanged to img.lineagesSorted().
        """
        img = IMG()

        trustedGenomeIds = img.trustedGenomes()

        # 'with' guarantees the file is closed even if a lookup below raises.
        with open("./data/markerSetSize.tsv", "w") as fout:
            # The sweep is the same for the header and for every lineage,
            # so it is built once instead of once per lineage.
            thresholds = arange(maxThreshold, minThreshold, -stepSize)

            fout.write("Lineage\t# genomes")
            for threshold in thresholds:
                fout.write("\t" + str(threshold))
            fout.write("\n")

            for lineage in img.lineagesSorted(mostSpecificRanks):
                # restrict the lineage to genomes that are also trusted
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))

                numGenomes = len(genomeIds)
                if numGenomes < minGenomes:
                    continue

                print("\nLineage " + lineage + " contains " + str(numGenomes) + " genomes.")
                fout.write(lineage + "\t" + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # the same genome-count cutoff is used for both of the
                    # threshold arguments of markerGenes
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)

                    fout.write("\t" + str(len(markerSet)))
                    print("  Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))
                fout.write("\n")
Example #2
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes,
            mostSpecificRanks):
        """Tabulate marker set size over a sweep of thresholds for each lineage.

        Writes one row per lineage to ./data/markerSetSize.tsv containing the
        lineage, its number of trusted genomes, and the marker set size found
        at each threshold. Progress is echoed to stdout.

        Args:
            minThreshold: Stop value handed to arange for the threshold sweep.
            maxThreshold: Start value handed to arange for the threshold sweep.
            stepSize: Step magnitude; it is negated so the sweep runs from
                maxThreshold towards minThreshold.
            minGenomes: Lineages with fewer trusted genomes are skipped.
            mostSpecificRanks: Forwarded unchanged to img.lineagesSorted().
        """
        img = IMG()

        trustedGenomeIds = img.trustedGenomes()

        # 'with' guarantees the file is closed even if a lookup below raises.
        with open('./data/markerSetSize.tsv', 'w') as fout:
            # The sweep is the same for the header and for every lineage,
            # so it is built once instead of once per lineage.
            thresholds = arange(maxThreshold, minThreshold, -stepSize)

            fout.write('Lineage\t# genomes')
            for threshold in thresholds:
                fout.write('\t' + str(threshold))
            fout.write('\n')

            for lineage in img.lineagesSorted(mostSpecificRanks):
                # restrict the lineage to genomes that are also trusted
                genomeIds = img.genomeIdsByTaxonomy(lineage)
                genomeIds = list(genomeIds.intersection(trustedGenomeIds))

                numGenomes = len(genomeIds)
                if numGenomes < minGenomes:
                    continue

                print('\nLineage ' + lineage + ' contains ' +
                      str(numGenomes) + ' genomes.')
                fout.write(lineage + '\t' + str(numGenomes))

                pfamTable = img.pfamTable(genomeIds)
                for threshold in thresholds:
                    # the same genome-count cutoff is used for both of the
                    # threshold arguments of markerGenes
                    cutoff = threshold * numGenomes
                    markerSet = img.markerGenes(genomeIds, pfamTable,
                                                cutoff, cutoff)

                    fout.write('\t' + str(len(markerSet)))
                    print('  Threshold = %.2f, marker set size = %d' %
                          (threshold, len(markerSet)))
                fout.write('\n')
Example #3
0
    def run(self, ubiquityThreshold, singleCopyThreshold, rank):
        """Build marker gene lists per lineage and combine them up the taxonomy.

        Marker genes are computed for every lineage that has exactly
        ``rank + 1`` ';'-separated fields and at least 3 genomes. The lists
        are then combined for each higher rank by intersecting the lists of
        the child lineages. The resulting dictionary is local to this method;
        nothing is returned.

        Args:
            ubiquityThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            singleCopyThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            rank: Index of the most specific rank to process; also forwarded
                to img.lineagesSorted().
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata()
        print('  Genomes with metadata: ' + str(len(metadata)))

        # calculate marker set for each lineage at the specified rank
        markerGeneLists = {}
        for lineage in img.lineagesSorted(metadata, rank):
            if len(lineage.split(';')) != rank + 1:
                continue

            # Everything below belongs to the per-lineage loop: it uses the
            # loop variable and 'continue', so it cannot live at method level.
            genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
            if len(genomeIds) < 3:
                # skip before building a count table that would go unused
                continue

            countTable = img.countTable(genomeIds)

            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            markerGenes = markerset.markerGenes(
                genomeIds, countTable, ubiquityThreshold * len(genomeIds),
                singleCopyThreshold * len(genomeIds))

            print('  Marker genes: ' + str(len(markerGenes)))
            print('')

            markerGeneLists[lineage] = markerGenes

        # combine marker gene lists for higher taxonomic groups; a parent
        # keeps only the genes shared by all of its children (intersection)
        for r in range(rank - 1, -1, -1):
            print('Processing rank ' + str(r))
            rankMarkerGeneLists = {}
            # items() works on both Python 2 and 3; the dict is not modified
            # until this inner loop has finished
            for lineage, markerGenes in markerGeneLists.items():
                taxonomy = lineage.split(';')
                if len(taxonomy) != r + 2:
                    continue

                # NOTE(review): lineages are split on ';' but re-joined with
                # '; ' -- confirm the parent key format is the intended one.
                curLineage = '; '.join(taxonomy[0:r + 1])
                if curLineage not in rankMarkerGeneLists:
                    rankMarkerGeneLists[curLineage] = markerGenes
                else:
                    shared = rankMarkerGeneLists[curLineage].intersection(
                        markerGenes)
                    rankMarkerGeneLists[curLineage] = shared

            # combine marker gene list dictionaries
            markerGeneLists.update(rankMarkerGeneLists)
Example #4
0
    def run(
        self,
        ubiquityThreshold,
        singleCopyThreshold,
        minGenomes,
        minMarkers,
        mostSpecificRank,
        distThreshold,
        genomeThreshold,
    ):
        """Report co-located marker gene sets for each lineage.

        Writes one row per retained lineage to ./data/colocated.tsv with the
        lineage, genome count, marker gene count, number of co-located sets,
        and each set as a comma-separated column. Progress is echoed to
        stdout.

        Args:
            ubiquityThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            singleCopyThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            minGenomes: Lineages with fewer genomes are skipped.
            minMarkers: Lineages with fewer co-located sets are skipped.
            mostSpecificRank: Forwarded unchanged to img.lineagesSorted().
            distThreshold: Forwarded unchanged to markerset.colocatedGenes().
            genomeThreshold: Forwarded unchanged to markerset.colocatedGenes().
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        # Line-buffered (third argument 1) so rows appear as they are written;
        # 'with' guarantees the file is closed even if a lookup below raises.
        with open("./data/colocated.tsv", "w", 1) as fout:
            fout.write("Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n")

            # lineageCount is the 1-based position of the lineage, counting
            # skipped lineages as well
            for lineageCount, lineage in enumerate(lineages, 1):
                genomeIds = img.genomeIdsByTaxonomy(lineage, "Final")
                if len(genomeIds) < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                markerGenes = markerset.markerGenes(
                    genomeIds,
                    countTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds),
                )

                geneDistTable = img.geneDistTable(genomeIds, markerGenes)
                colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
                colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
                if len(colocatedSets) < minMarkers:
                    continue

                print(
                    "\nLineage " + lineage + " contains " + str(len(genomeIds)) + " genomes ("
                    + str(lineageCount) + " of " + str(len(lineages)) + ")."
                )
                print("  Marker genes: " + str(len(markerGenes)))
                print("  Co-located gene sets: " + str(len(colocatedSets)))

                # four fixed columns followed by one column per co-located set
                fields = [lineage, str(len(genomeIds)), str(len(markerGenes)), str(len(colocatedSets))]
                fields.extend(", ".join(cs) for cs in colocatedSets)
                fout.write("\t".join(fields) + "\n")
    def run(self, ubiquityThreshold, singleCopyThreshold, rank):
        """Build marker gene lists per lineage and combine them up the taxonomy.

        Marker genes are computed for every lineage that has exactly
        ``rank + 1`` ';'-separated fields and at least 3 genomes. The lists
        are then combined for each higher rank by intersecting the lists of
        the child lineages. The resulting dictionary is local to this method;
        nothing is returned.

        Args:
            ubiquityThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            singleCopyThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            rank: Index of the most specific rank to process; also forwarded
                to img.lineagesSorted().
        """
        img = IMG()
        markerset = MarkerSet()

        # single-argument print(...) behaves the same on Python 2 and 3
        print('Reading metadata.')
        metadata = img.genomeMetadata()
        print('  Genomes with metadata: ' + str(len(metadata)))

        # calculate marker set for each lineage at the specified rank
        markerGeneLists = {}
        for lineage in img.lineagesSorted(metadata, rank):
            if len(lineage.split(';')) != rank+1:
                continue

            # Everything below belongs to the per-lineage loop: it uses the
            # loop variable and 'continue', so it cannot live at method level.
            genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
            if len(genomeIds) < 3:
                # skip before building a count table that would go unused
                continue

            countTable = img.countTable(genomeIds)

            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))

            print('  Marker genes: ' + str(len(markerGenes)))
            print('')

            markerGeneLists[lineage] = markerGenes

        # combine marker gene lists for higher taxonomic groups; a parent
        # keeps only the genes shared by all of its children (intersection)
        for r in range(rank-1, -1, -1):
            print('Processing rank ' + str(r))
            rankMarkerGeneLists = {}
            # items() works on both Python 2 and 3; the dict is not modified
            # until this inner loop has finished
            for lineage, markerGenes in markerGeneLists.items():
                taxonomy = lineage.split(';')
                if len(taxonomy) != r+2:
                    continue

                # NOTE(review): lineages are split on ';' but re-joined with
                # '; ' -- confirm the parent key format is the intended one.
                curLineage = '; '.join(taxonomy[0:r+1])
                if curLineage not in rankMarkerGeneLists:
                    rankMarkerGeneLists[curLineage] = markerGenes
                else:
                    shared = rankMarkerGeneLists[curLineage].intersection(markerGenes)
                    rankMarkerGeneLists[curLineage] = shared

            # combine marker gene list dictionaries
            markerGeneLists.update(rankMarkerGeneLists)
Example #6
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
        """Report co-located marker gene sets for each lineage.

        Writes one row per retained lineage to ./data/colocated.tsv with the
        lineage, genome count, marker gene count, number of co-located sets,
        and each set as a comma-separated column. Progress is echoed to
        stdout.

        Args:
            ubiquityThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            singleCopyThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            minGenomes: Lineages with fewer genomes are skipped.
            minMarkers: Lineages with fewer co-located sets are skipped.
            mostSpecificRank: Forwarded unchanged to img.lineagesSorted().
            distThreshold: Forwarded unchanged to markerset.colocatedGenes().
            genomeThreshold: Forwarded unchanged to
                markerset.colocatedGenes().
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        # Line-buffered (third argument 1) so rows appear as they are written;
        # 'with' guarantees the file is closed even if a lookup below raises.
        with open('./data/colocated.tsv', 'w', 1) as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n'
            )

            # lineageCount is the 1-based position of the lineage, counting
            # skipped lineages as well
            for lineageCount, lineage in enumerate(lineages, 1):
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                if len(genomeIds) < minGenomes:
                    continue

                countTable = img.countTable(genomeIds)
                markerGenes = markerset.markerGenes(
                    genomeIds, countTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))

                geneDistTable = img.geneDistTable(genomeIds,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                colocatedGenes = markerset.colocatedGenes(
                    geneDistTable, distThreshold, genomeThreshold)
                colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                        markerGenes)
                if len(colocatedSets) < minMarkers:
                    continue

                # single-argument print(...) behaves the same on Python 2/3
                print('\nLineage ' + lineage + ' contains ' +
                      str(len(genomeIds)) + ' genomes (' +
                      str(lineageCount) + ' of ' + str(len(lineages)) + ').')
                print('  Marker genes: ' + str(len(markerGenes)))
                print('  Co-located gene sets: ' + str(len(colocatedSets)))

                # four fixed columns followed by one column per co-located set
                fields = [lineage, str(len(genomeIds)),
                          str(len(markerGenes)), str(len(colocatedSets))]
                fields.extend(', '.join(cs) for cs in colocatedSets)
                fout.write('\t'.join(fields) + '\n')
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
        """Report co-located marker gene sets for each lineage.

        Writes one row per retained lineage to ./data/colocated.tsv with the
        lineage, genome count, marker gene count, number of co-located sets,
        and each set as a comma-separated column. Progress is echoed to
        stdout.

        Args:
            ubiquityThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            singleCopyThreshold: Multiplied by the number of genomes in the
                lineage and passed to markerset.markerGenes().
            minGenomes: Lineages with fewer genomes are skipped.
            minMarkers: Lineages with fewer co-located sets are skipped.
            mostSpecificRank: Forwarded unchanged to img.lineagesSorted().
            distThreshold: Forwarded unchanged to markerset.colocatedGenes().
            genomeThreshold: Forwarded unchanged to markerset.colocatedGenes().
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        # Line-buffered (third argument 1) so rows appear as they are written;
        # 'with' guarantees the file is closed even if a lookup below raises.
        with open('./data/colocated.tsv', 'w', 1) as fout:
            fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

            # lineageCount is the 1-based position of the lineage, counting
            # skipped lineages as well
            for lineageCount, lineage in enumerate(lineages, 1):
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                if len(genomeIds) < minGenomes:
                    continue

                numGenomes = len(genomeIds)
                countTable = img.countTable(genomeIds)
                markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)

                geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
                colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
                if len(colocatedSets) < minMarkers:
                    continue

                # single-argument print(...) behaves the same on Python 2/3
                print('\nLineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes (' + str(lineageCount) + ' of ' + str(len(lineages)) + ').')
                print('  Marker genes: ' + str(len(markerGenes)))
                print('  Co-located gene sets: ' + str(len(colocatedSets)))

                # four fixed columns followed by one column per co-located set
                fields = [lineage, str(numGenomes), str(len(markerGenes)), str(len(colocatedSets))]
                fields.extend(', '.join(cs) for cs in colocatedSets)
                fout.write('\t'.join(fields) + '\n')
    def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
        """Split genomes into trusted and filtered sets and write summary tables.

        For each of 'Archaea' and 'Bacteria' a filtered gene count table is
        built, marker genes and co-located marker sets are derived, and every
        genome is scored with markerset.genomeCheck(). A genome is trusted
        when its completeness is >= trustedCompleteness and its contamination
        is <= trustedContamination. Trusted genomes are written to
        ./data/trusted_genomes.tsv and all others to
        ./data/filtered_genomes.tsv. Per-lineage genome counts are finally
        written to ./data/lineage_stats.tsv. Summary statistics are echoed to
        stdout.

        Args:
            ubiquityThreshold: Scaled by 0.9 for img.filterTable() and by the
                genome count for markerset.markerGenes().
            singleCopyThreshold: Scaled by 0.9 for img.filterTable() and by
                the genome count for markerset.markerGenes().
            trustedCompleteness: Minimum completeness of a trusted genome.
            trustedContamination: Maximum contamination of a trusted genome.
            genomeCompleteness: Not used by this method.
            genomeContamination: Not used by this method.
        """
        img = IMG()
        markerset = MarkerSet()

        metadata = img.genomeMetadata()

        # both genome tables share the same column layout
        header = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

        allTrustedGenomeIds = set()

        # 'with' guarantees both files are closed even if a lookup below raises
        with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
                open('./data/filtered_genomes.tsv', 'w') as filteredOut:
            trustedOut.write(header)
            filteredOut.write(header)

            for lineage in ['Archaea', 'Bacteria']:
                # get all genomes in lineage and build gene count table
                print('\nBuilding gene count table.')
                allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
                numGenomes = len(allLineageGenomeIds)
                countTable = img.countTable(allLineageGenomeIds)
                countTable = img.filterTable(allLineageGenomeIds, countTable, 0.9*ubiquityThreshold, 0.9*singleCopyThreshold)

                print('Lineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')

                # tabulate genomes from each phylum (taxonomy index 1)
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker set for genomes
                markerGenes = markerset.markerGenes(allLineageGenomeIds, countTable, ubiquityThreshold*numGenomes, singleCopyThreshold*numGenomes)
                print('  Marker genes: ' + str(len(markerGenes)))

                geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes, spacingBetweenContigs=1e6)
                # NOTE(review): metadata is passed as the second argument
                # here -- confirm this matches colocatedGenes' signature.
                colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
                colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
                print('  Marker set size: ' + str(len(colocatedSets)))

                # identifying trusted genomes (highly complete, low contamination genomes)
                trustedGenomeIds = set()
                for genomeId in allLineageGenomeIds:
                    completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                    if completeness >= trustedCompleteness and contamination <= trustedContamination:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        out = trustedOut
                    else:
                        out = filteredOut

                    # the row layout is identical for both tables; only the
                    # destination file differs
                    info = metadata[genomeId]
                    fields = [
                        genomeId,
                        '; '.join(info['taxonomy']),
                        '%.2f' % (float(info['genome size']) / 1e6),
                        str(info['scaffold count']),
                        info['biotic relationships'],
                        info['status'],
                        '%.3f' % completeness,
                        '%.3f' % contamination,
                    ]
                    out.write('\t'.join(fields) + '\n')

                print('  Trusted genomes: ' + str(len(trustedGenomeIds)))

                # tally the status field of trusted genomes
                statusBreakdown = {}
                for genomeId in trustedGenomeIds:
                    status = metadata[genomeId]['status']
                    statusBreakdown[status] = statusBreakdown.get(status, 0) + 1

                print('  Trusted genome status breakdown: ')
                for status, count in statusBreakdown.items():
                    print('    ' + status + ': ' + str(count))

                # tally the proposal name field of trusted genomes
                proposalNameBreakdown = {}
                for genomeId in trustedGenomeIds:
                    proposalName = metadata[genomeId]['proposal name']
                    proposalNameBreakdown[proposalName] = proposalNameBreakdown.get(proposalName, 0) + 1

                # only proposal names mentioning KMG, GEBA or HMP are reported
                print('  Retained genome proposal name breakdown: ')
                for pn, count in proposalNameBreakdown.items():
                    if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                        print('    ' + pn + ': ' + str(count))

                # trusted genome count per phylum, reported against the total
                print('  Filtered genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6): # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = '; '.join(data['taxonomy'][0:r+1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted()

        with open('./data/lineage_stats.tsv', 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
Example #9
0
    def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination, genomeCompleteness, genomeContamination):
        """Split genomes into trusted and filtered sets and write summary tables.

        For each of 'Archaea' and 'Bacteria' a filtered gene count table is
        built, marker genes and co-located marker sets are derived, and every
        genome is scored with markerset.genomeCheck(). A genome is trusted
        when its completeness is >= trustedCompleteness and its contamination
        is <= trustedContamination. Trusted genomes are written to
        ./data/trusted_genomes.tsv and all others to
        ./data/filtered_genomes.tsv. Per-lineage genome counts are finally
        written to ./data/lineage_stats.tsv. Summary statistics are echoed to
        stdout.

        Args:
            ubiquityThreshold: Scaled by 0.9 for img.filterTable() and by the
                genome count for markerset.markerGenes().
            singleCopyThreshold: Scaled by 0.9 for img.filterTable() and by
                the genome count for markerset.markerGenes().
            trustedCompleteness: Minimum completeness of a trusted genome.
            trustedContamination: Maximum contamination of a trusted genome.
            genomeCompleteness: Not used by this method.
            genomeContamination: Not used by this method.
        """
        img = IMG()
        markerset = MarkerSet()

        metadata = img.genomeMetadata()

        # both genome tables share the same column layout
        header = (
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'
        )

        allTrustedGenomeIds = set()

        # 'with' guarantees both files are closed even if a lookup below raises
        with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
                open('./data/filtered_genomes.tsv', 'w') as filteredOut:
            trustedOut.write(header)
            filteredOut.write(header)

            for lineage in ['Archaea', 'Bacteria']:
                # get all genomes in lineage and build gene count table
                print('\nBuilding gene count table.')
                allLineageGenomeIds = img.genomeIdsByTaxonomy(
                    lineage, metadata, 'All')
                numGenomes = len(allLineageGenomeIds)
                countTable = img.countTable(allLineageGenomeIds)
                countTable = img.filterTable(allLineageGenomeIds, countTable,
                                             0.9 * ubiquityThreshold,
                                             0.9 * singleCopyThreshold)

                print('Lineage ' + lineage + ' contains ' + str(numGenomes) +
                      ' genomes.')

                # tabulate genomes from each phylum (taxonomy index 1)
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker set for genomes
                markerGenes = markerset.markerGenes(
                    allLineageGenomeIds, countTable,
                    ubiquityThreshold * numGenomes,
                    singleCopyThreshold * numGenomes)
                print('  Marker genes: ' + str(len(markerGenes)))

                geneDistTable = img.geneDistTable(allLineageGenomeIds,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                # NOTE(review): metadata is passed as the second argument
                # here -- confirm this matches colocatedGenes' signature.
                colocatedGenes = markerset.colocatedGenes(geneDistTable,
                                                          metadata)
                colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                        markerGenes)
                print('  Marker set size: ' + str(len(colocatedSets)))

                # identifying trusted genomes (highly complete, low
                # contamination genomes)
                trustedGenomeIds = set()
                for genomeId in allLineageGenomeIds:
                    completeness, contamination = markerset.genomeCheck(
                        colocatedSets, genomeId, countTable)

                    if (completeness >= trustedCompleteness
                            and contamination <= trustedContamination):
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        out = trustedOut
                    else:
                        out = filteredOut

                    # the row layout is identical for both tables; only the
                    # destination file differs
                    info = metadata[genomeId]
                    fields = [
                        genomeId,
                        '; '.join(info['taxonomy']),
                        '%.2f' % (float(info['genome size']) / 1e6),
                        str(info['scaffold count']),
                        info['biotic relationships'],
                        info['status'],
                        '%.3f' % completeness,
                        '%.3f' % contamination,
                    ]
                    out.write('\t'.join(fields) + '\n')

                print('  Trusted genomes: ' + str(len(trustedGenomeIds)))

                # tally the status field of trusted genomes
                statusBreakdown = {}
                for genomeId in trustedGenomeIds:
                    status = metadata[genomeId]['status']
                    statusBreakdown[status] = statusBreakdown.get(status,
                                                                  0) + 1

                print('  Trusted genome status breakdown: ')
                for status, count in statusBreakdown.items():
                    print('    ' + status + ': ' + str(count))

                # tally the proposal name field of trusted genomes
                proposalNameBreakdown = {}
                for genomeId in trustedGenomeIds:
                    proposalName = metadata[genomeId]['proposal name']
                    proposalNameBreakdown[proposalName] = (
                        proposalNameBreakdown.get(proposalName, 0) + 1)

                # only proposal names mentioning KMG, GEBA or HMP are reported
                print('  Retained genome proposal name breakdown: ')
                for pn, count in proposalNameBreakdown.items():
                    if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                        print('    ' + pn + ': ' + str(count))

                # trusted genome count per phylum, reported against the total
                print('  Filtered genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(
                        taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print(phylum + ': %d of %d' %
                          (trustedPhylumCounts.get(phylum, 0), count))

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = '; '.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted()

        with open('./data/lineage_stats.tsv', 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) +
                           '\t' + str(trustedStats.get(lineage, 0)) + '\n')