Beispiel #1
0
    def run(self):
        """Run every gene-calling and HMM step on each finished genome.

        Genome IDs come from ``img.genomeIds('Finished')``. For each one the
        Prodigal, GeneMark, PFAM and TIGRFAM steps are run and the outcome is
        passed to ``compareResults`` together with the calculated PFAM and
        TIGRFAM marker genes. Progress is printed to stdout and written to
        ``./data/evaluate_prodigal.txt``.
        """
        img = IMG()

        # buffering=1 requests line buffering; the with-block guarantees the
        # report is closed even when one of the per-genome steps raises
        with open('./data/evaluate_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            # each label now reports the size of its own marker list (the
            # two counts were previously printed under each other's label)
            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            for genomeId in img.genomeIds('Finished'):
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runProdigal(genomeId)
                self.runGeneMark(genomeId)

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)
Beispiel #2
0
    def run(self):
        """Run every gene-calling and HMM step on each finished genome.

        Genome IDs come from ``img.genomeIds('Finished')``. For each one the
        Prodigal, GeneMark, PFAM and TIGRFAM steps are run and the outcome is
        passed to ``compareResults`` together with the calculated PFAM and
        TIGRFAM marker genes. Progress is printed to stdout and written to
        ``./data/evaluate_prodigal.txt``.
        """
        img = IMG()

        # buffering=1 requests line buffering; the with-block guarantees the
        # report is closed even when one of the per-genome steps raises
        with open('./data/evaluate_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            # each label now reports the size of its own marker list (the
            # two counts were previously printed under each other's label);
            # single-argument print(...) behaves the same as the statement form
            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            for genomeId in img.genomeIds('Finished'):
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runProdigal(genomeId)
                self.runGeneMark(genomeId)

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)
    def run(
        self,
        ubiquityThreshold,
        singleCopyThreshold,
        minGenomes,
        minMarkers,
        mostSpecificRank,
        distThreshold,
        genomeThreshold,
    ):
        """Tabulate co-located marker sets for every sufficiently large lineage.

        Walks the lineages returned by ``img.lineagesSorted(mostSpecificRank)``,
        skips those with fewer than ``minGenomes`` genomes or fewer than
        ``minMarkers`` co-located sets, and writes one tab-separated row per
        remaining lineage to ``./data/colocated.tsv`` while echoing a summary
        to stdout.
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        fout = open("./data/colocated.tsv", "w", 1)
        fout.write("Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n")

        # position is 1-based and counts skipped lineages as well
        for position, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, "Final")
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            ubiquityCutoff = ubiquityThreshold * len(genomeIds)
            singleCopyCutoff = singleCopyThreshold * len(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityCutoff, singleCopyCutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            progress = str(position) + " of " + str(len(lineages))
            print("\nLineage " + lineage + " contains " + str(len(genomeIds)) + " genomes (" + progress + ").")
            print("  Marker genes: " + str(len(markerGenes)))
            print("  Co-located gene sets: " + str(len(colocatedSets)))

            counts = [str(len(genomeIds)), str(len(markerGenes)), str(len(colocatedSets))]
            fout.write(lineage + "\t" + "\t".join(counts))
            for colocated in colocatedSets:
                fout.write("\t" + ", ".join(colocated))
            fout.write("\n")

        fout.close()
    def run(self, ubiquityThreshold, singleCopyThreshold, rank):
        """Build marker gene lists per lineage and merge them up the taxonomy.

        For every lineage from ``img.lineagesSorted(metadata, rank)`` whose
        ';'-separated taxonomy has exactly ``rank + 1`` fields and that has at
        least three genomes, the marker genes are computed and stored under
        the lineage string. The lists are then combined rank by rank, from
        ``rank - 1`` down to 0, by intersecting the lists of all lineages
        that share a parent.

        The result lives in a local dictionary; the visible code neither
        returns it nor writes it anywhere.
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata()
        print('  Genomes with metadata: ' + str(len(metadata)))

        # calculate marker set for each lineage at the specified rank
        sortedLineages = img.lineagesSorted(metadata, rank)
        markerGeneLists = {}
        for lineage in sortedLineages:
            taxonomy = lineage.split(';')
            if len(taxonomy) != rank+1:
                continue

            # The statements below belong to the per-lineage loop. They had
            # been dedented out of it, which left ``continue`` outside any
            # loop (a SyntaxError).
            genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
            countTable = img.countTable(genomeIds)

            if len(genomeIds) < 3:
                continue

            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))

            print('  Marker genes: ' + str(len(markerGenes)))
            print('')

            markerGeneLists[lineage] = markerGenes

        # combine marker gene lists for higher taxonomic groups
        # NOTE(review): the original comment said "union" but the code takes
        # the intersection of sibling lists -- confirm which one is intended.
        for r in xrange(rank-1, -1, -1):
            print('Processing rank ' + str(r))
            rankMarkerGeneLists = {}
            for lineage, markerGenes in markerGeneLists.iteritems():
                taxonomy = lineage.split(';')
                if len(taxonomy) != r+2:
                    continue

                curLineage = '; '.join(taxonomy[0:r+1])
                if curLineage not in rankMarkerGeneLists:
                    rankMarkerGeneLists[curLineage] = markerGenes
                else:
                    curMarkerGenes = rankMarkerGeneLists[curLineage]
                    curMarkerGenes = curMarkerGenes.intersection(markerGenes)
                    rankMarkerGeneLists[curLineage] = curMarkerGenes

            # combine marker gene list dictionaries (done after the inner
            # loop so the dictionary is not modified while it is iterated)
            markerGeneLists.update(rankMarkerGeneLists)
Beispiel #5
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets across a threshold sweep.

        The sweep runs from ``maxThreshold`` down towards ``minThreshold`` in
        steps of ``stepSize``; each value is used as both cutoffs passed to
        ``img.markerGenes``. The plot is saved under ``./images/``.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        countTable = img.countTable(genomeIds)

        # one array serves both the loop and the x-axis of the plot
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        markerSetSizes = []
        for threshold in thresholds:
            cutoff = threshold*len(genomeIds)
            markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedSets = img.colocatedSets(img.colocatedGenes(geneDistTable), markerGenes)
            markerSetSizes.append(len(colocatedSets))

            # the printed size is the marker gene count; the plotted size is
            # the co-located set count
            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

        # plot data
        lineageTag = taxonomyStr.replace(';','_')
        plotFilename = './images/markerSetSize.' + lineageTag + '.png'
        title = taxonomyStr.replace(';', '; ')
        LinePlot().plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
Beispiel #6
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes,
            mostSpecificRanks):
        """Write marker set sizes per lineage and threshold to a TSV file.

        Only trusted genomes (``img.trustedGenomes()``) are counted, and
        lineages with fewer than ``minGenomes`` of them are skipped. The
        table goes to ``./data/markerSetSize.tsv`` with one column per
        threshold of the sweep.
        """
        img = IMG()

        trustedGenomeIds = img.trustedGenomes()

        # the same sweep drives the header and every data row
        thresholds = arange(maxThreshold, minThreshold, -stepSize)

        fout = open('./data/markerSetSize.tsv', 'w')
        headerCells = ''.join('\t' + str(t) for t in thresholds)
        fout.write('Lineage\t# genomes' + headerCells + '\n')

        for lineage in img.lineagesSorted(mostSpecificRanks):
            lineageIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(lineageIds.intersection(trustedGenomeIds))

            if len(genomeIds) < minGenomes:
                continue

            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')
            fout.write(lineage + '\t' + str(len(genomeIds)))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in thresholds:
                cutoff = threshold * len(genomeIds)
                markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)
                fout.write('\t' + str(len(markerSet)))
                print('  Threshold = %.2f, marker set size = %d' %
                      (threshold, len(markerSet)))
            fout.write('\n')

        fout.close()
Beispiel #7
0
    def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
        """Plot the number of co-located marker sets across a threshold sweep.

        The sweep runs from ``maxThreshold`` down towards ``minThreshold`` in
        steps of ``stepSize``; each value is used as both cutoffs passed to
        ``img.markerGenes``. The plot is saved under ``./images/``.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        countTable = img.countTable(genomeIds)

        # one array serves both the loop and the x-axis of the plot
        sweep = arange(maxThreshold, minThreshold, -stepSize)

        markerSetSizes = []
        for threshold in sweep:
            cutoff = threshold*len(genomeIds)
            markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            markerSetSizes.append(len(img.colocatedSets(colocatedGenes, markerGenes)))

            # the printed size is the marker gene count; the plotted size is
            # the co-located set count
            print('  Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

        # plot data
        plot = LinePlot()
        plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';','_') + '.png'
        plot.plot(plotFilename, sweep, markerSetSizes, 'Threshold', 'Marker Set Size', taxonomyStr.replace(';', '; '))
Beispiel #8
0
    def run(self, minGenomes, minMarkerSets):
        """Write HMM key files for the marker genes and fetch the models.

        The PFAM id set is first widened with every PFAM id that shares a
        clan with one of the calculated PFAM marker genes. The names of the
        matching models in ``img.pfamHMMs`` are written to
        ``./hmm/pfam.keyfile.txt`` and the TIGRFAM ids to
        ``./hmm/tigr.keyfile.txt``; ``hmmfetch`` is then run on each key
        file through the shell.

        ``minGenomes`` and ``minMarkerSets`` are accepted but not used here.
        """
        img = IMG()
        pfam = PFAM()

        # get list of all marker genes
        markerset = MarkerSet()
        pfamIds, tigrIds = markerset.getCalculatedMarkerGenes()

        print('TIGR marker genes: ' + str(len(tigrIds)))
        print('PFAM marker genes: ' + str(len(pfamIds)))

        # get all PFAM HMMs that are in the same clan
        # as any of the marker genes
        pfamIdToClanId = pfam.pfamIdToClanId()
        clans = set()
        for pfamId in pfamIds:
            # index with the same normalised key the membership test uses;
            # indexing with the raw id raised KeyError whenever the two differ
            clanKey = pfamId.replace('PF', 'pfam')
            if clanKey in pfamIdToClanId:
                clans.add(pfamIdToClanId[clanKey])

        for pfamId, clanId in pfamIdToClanId.iteritems():
            if clanId in clans:
                pfamIds.add(pfamId)

        print('  PFAM HMMs require to cover marker gene clans: ' + str(
            len(pfamIds)))

        # get name of each PFAM HMM; both files are closed by the with-block
        # (the HMM file handle was previously never closed)
        pfamNames = []
        with open('./hmm/pfam.keyfile.txt', 'w') as fout:
            with open(img.pfamHMMs) as hmmFile:
                for line in hmmFile:
                    if 'NAME' in line:
                        name = line[line.find(' '):].strip()
                    elif 'ACC' in line:
                        # accession without the part after the last '.'
                        acc = line[line.find(' '):line.rfind('.')].strip()
                        if acc.replace('PF', 'pfam') in pfamIds:
                            pfamNames.append(name)
                            fout.write(name + '\n')

        print('PFAM names: ' + str(len(pfamNames)))

        # extract each PFAM HMM
        os.system('hmmfetch -f ' + img.pfamHMMs +
                  ' ./hmm/pfam.keyfile.txt > ./hmm/pfam_markers.hmm')

        # write the id of each TIGRFAM marker gene
        with open('./hmm/tigr.keyfile.txt', 'w') as fout:
            for tigrId in tigrIds:
                fout.write(tigrId + '\n')

        # extract each TIGRFAM HMM
        os.system('hmmfetch -f ' + img.tigrHMMs +
                  ' ./hmm/tigr.keyfile.txt > ./hmm/tigr_markers.hmm')
Beispiel #9
0
    def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
        """Write marker set sizes per lineage and threshold to a TSV file.

        Only trusted genomes (``img.trustedGenomes()``) are counted, and
        lineages with fewer than ``minGenomes`` of them are skipped. The
        table goes to ``./data/markerSetSize.tsv`` with one column per
        threshold of the sweep.
        """
        img = IMG()

        trustedGenomeIds = img.trustedGenomes()

        # the same sweep drives the header and every data row
        sweep = arange(maxThreshold, minThreshold, -stepSize)

        fout = open("./data/markerSetSize.tsv", "w")
        fout.write("Lineage\t# genomes" + "".join("\t" + str(t) for t in sweep) + "\n")

        for lineage in img.lineagesSorted(mostSpecificRanks):
            candidates = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(candidates.intersection(trustedGenomeIds))

            if len(genomeIds) < minGenomes:
                continue

            print("\nLineage " + lineage + " contains " + str(len(genomeIds)) + " genomes.")
            fout.write(lineage + "\t" + str(len(genomeIds)))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in sweep:
                cutoff = threshold * len(genomeIds)
                markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)
                fout.write("\t" + str(len(markerSet)))
                print("  Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))
            fout.write("\n")

        fout.close()
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
        """List genomes that fail the completeness or contamination check.

        Each genome of each lineage is checked against that lineage's
        co-located marker sets with ``markerset.genomeCheck``. Genomes whose
        completeness is below ``completenessThreshold`` or whose
        contamination is above ``contaminationThreshold`` are collected and
        written, with their metadata, to ``./data/degenerate_genomes.tsv``.
        """
        settings = (
            ('Ubiquity threshold: ', ubiquityThreshold),
            ('Single-copy threshold: ', singleCopyThreshold),
            ('Min. genomes: ', minGenomes),
            ('Most specific taxonomic rank: ', mostSpecificRank),
            ('Min markers: ', minMarkers),
            ('Completeness threshold: ', completenessThreshold),
            ('Contamination threshold: ', contaminationThreshold),
        )
        for label, value in settings:
            print(label + str(value))

        img = IMG()
        markerset = MarkerSet()

        # genome id -> list of [lineage name, # genomes, # sets, completeness, contamination]
        degenerateGenomes = {}
        for lineage in img.lineagesByCriteria(minGenomes, mostSpecificRank):
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            if len(markerGenes) < minMarkers:
                continue

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

            for genomeId in genomeIds:
                completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                passes = completeness >= completenessThreshold and contamination <= contaminationThreshold
                if not passes:
                    record = [lineage.split(';')[-1].strip(), len(genomeIds), len(colocatedSets), completeness, contamination]
                    degenerateGenomes.setdefault(genomeId, []).append(record)

        # write out degenerate genomes
        metadata = img.genomeMetadata('Final')

        fout = open('./data/degenerate_genomes.tsv', 'w')
        fout.write('Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n')
        for genomeId, records in degenerateGenomes.iteritems():
            fout.write(genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy']) + '\t%.2f' % (float(metadata[genomeId]['genome size']) / 1e6) + '\t' + str(metadata[genomeId]['scaffold count']))
            fout.write('\t' + metadata[genomeId]['biotic relationships'] + '\t' + metadata[genomeId]['status'])

            # one group of five columns per lineage in which the genome failed
            for name, numGenomes, numSets, comp, cont in records:
                fout.write('\t' + name + '\t' + str(numGenomes) + '\t' + str(numSets) + '\t%.3f\t%.3f' % (comp, cont))
            fout.write('\n')

        fout.close()
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
        """Tabulate co-located marker sets for every sufficiently large lineage.

        Walks the lineages returned by ``img.lineagesSorted(mostSpecificRank)``,
        skips those with fewer than ``minGenomes`` genomes or fewer than
        ``minMarkers`` co-located sets, and writes one tab-separated row per
        remaining lineage to ``./data/colocated.tsv`` while echoing a summary
        to stdout.
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        fout = open('./data/colocated.tsv', 'w', 1)
        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

        # position is 1-based and counts skipped lineages as well
        for position, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            ubiquityCutoff = ubiquityThreshold*len(genomeIds)
            singleCopyCutoff = singleCopyThreshold*len(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityCutoff, singleCopyCutoff)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            progress = str(position) + ' of ' + str(len(lineages))
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes (' + progress + ').')
            print('  Marker genes: ' + str(len(markerGenes)))
            print('  Co-located gene sets: ' + str(len(colocatedSets)))

            counts = [str(len(genomeIds)), str(len(markerGenes)), str(len(colocatedSets))]
            fout.write(lineage + '\t' + '\t'.join(counts))
            for colocated in colocatedSets:
                fout.write('\t' + ', '.join(colocated))
            fout.write('\n')

        fout.close()
Beispiel #12
0
    def run(self, ubiquityThreshold, singleCopyThreshold, rank):
        """Build marker gene lists per lineage and merge them up the taxonomy.

        For every lineage from ``img.lineagesSorted(metadata, rank)`` whose
        ';'-separated taxonomy has exactly ``rank + 1`` fields and that has at
        least three genomes, the marker genes are computed and stored under
        the lineage string. The lists are then combined rank by rank, from
        ``rank - 1`` down to 0, by intersecting the lists of all lineages
        that share a parent.

        The result lives in a local dictionary; the visible code neither
        returns it nor writes it anywhere.
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata()
        print('  Genomes with metadata: ' + str(len(metadata)))

        # calculate marker set for each lineage at the specified rank
        sortedLineages = img.lineagesSorted(metadata, rank)
        markerGeneLists = {}
        for lineage in sortedLineages:
            taxonomy = lineage.split(';')
            if len(taxonomy) != rank + 1:
                continue

            # The statements below belong to the per-lineage loop. They had
            # been dedented out of it, which left ``continue`` outside any
            # loop (a SyntaxError).
            genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
            countTable = img.countTable(genomeIds)

            if len(genomeIds) < 3:
                continue

            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            markerGenes = markerset.markerGenes(
                genomeIds, countTable, ubiquityThreshold * len(genomeIds),
                singleCopyThreshold * len(genomeIds))

            print('  Marker genes: ' + str(len(markerGenes)))
            print('')

            markerGeneLists[lineage] = markerGenes

        # combine marker gene lists for higher taxonomic groups
        # NOTE(review): the original comment said "union" but the code takes
        # the intersection of sibling lists -- confirm which one is intended.
        for r in range(rank - 1, -1, -1):
            print('Processing rank ' + str(r))
            rankMarkerGeneLists = {}
            # items() instead of iteritems(): the rest of this method is
            # written for Python 3, where dict.iteritems() does not exist
            for lineage, markerGenes in markerGeneLists.items():
                taxonomy = lineage.split(';')
                if len(taxonomy) != r + 2:
                    continue

                curLineage = '; '.join(taxonomy[0:r + 1])
                if curLineage not in rankMarkerGeneLists:
                    rankMarkerGeneLists[curLineage] = markerGenes
                else:
                    curMarkerGenes = rankMarkerGeneLists[curLineage]
                    curMarkerGenes = curMarkerGenes.intersection(markerGenes)
                    rankMarkerGeneLists[curLineage] = curMarkerGenes

            # combine marker gene list dictionaries (done after the inner
            # loop so the dictionary is not modified while it is iterated)
            markerGeneLists.update(rankMarkerGeneLists)
Beispiel #13
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
        """Tabulate co-located marker sets for every sufficiently large lineage.

        Walks the lineages returned by ``img.lineagesSorted(mostSpecificRank)``,
        skips those with fewer than ``minGenomes`` genomes or fewer than
        ``minMarkers`` co-located sets, and writes one tab-separated row per
        remaining lineage to ``./data/colocated.tsv`` while echoing a summary
        to stdout.
        """
        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesSorted(mostSpecificRank)

        fout = open('./data/colocated.tsv', 'w', 1)
        fout.write(
            'Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n'
        )

        # index is 1-based and counts skipped lineages as well
        for index, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            minUbiquity = ubiquityThreshold * len(genomeIds)
            minSingleCopy = singleCopyThreshold * len(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable,
                                                minUbiquity, minSingleCopy)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable,
                                                      distThreshold,
                                                      genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                    markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            numGenomesText = str(len(genomeIds))
            print('\nLineage ' + lineage + ' contains ' + numGenomesText +
                  ' genomes (' + str(index) + ' of ' + str(len(lineages)) +
                  ').')
            print('  Marker genes: ' + str(len(markerGenes)))
            print('  Co-located gene sets: ' + str(len(colocatedSets)))

            fout.write(lineage + '\t' + numGenomesText + '\t' +
                       str(len(markerGenes)) + '\t' + str(len(colocatedSets)))
            for geneSet in colocatedSets:
                fout.write('\t' + ', '.join(geneSet))
            fout.write('\n')

        fout.close()
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Measure how much a lineage's marker set changes on a genome subset.

        For each qualifying lineage the marker set of all genomes is compared
        with the marker sets of ``numReplicates`` random subsets that keep a
        ``1 - percentGenomes`` fraction of the genomes. The size of the
        symmetric difference is summarised (mean, standard deviation and
        percentages of the full marker set) in
        ``./data/lineage_evaluation.tsv``.
        """
        img = IMG()

        fout = None
        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        fout = open('./data/lineage_evaluation.tsv', 'w')
        fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            # calculate marker set for all genomes
            markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            numMarkers = len(markerGenes)
            if numMarkers < minMarkers:
                continue

            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')
            print('  Marker genes: ' + str(numMarkers))

            fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' + str(numMarkers) + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

            # withhold select percentage of genomes and calculate new marker set
            changeMarkerSetSize = []
            for _ in xrange(0, numReplicates):
                # +0.5 before int() rounds the subset size to the nearest integer
                subsetSize = int((1.0-percentGenomes)*len(genomeIds) + 0.5)
                subsetGenomeIds = random.sample(genomeIds, subsetSize)

                newMarkerGenes = img.markerGenes(subsetGenomeIds, countTable, ubiquityThreshold*len(subsetGenomeIds), singleCopyThreshold*len(subsetGenomeIds))
                changed = newMarkerGenes.symmetric_difference(markerGenes)
                changeMarkerSetSize.append(len(changed))

            m = mean(changeMarkerSetSize)
            s = std(changeMarkerSetSize)

            # express a change count as a percentage of the full marker set
            meanPer = m * 100 / numMarkers
            meanStdPer = (m + s) * 100 / numMarkers
            mean2StdPer = (m + 2*s) * 100 / numMarkers

            print('  Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, mean2StdPer))
            fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s, meanPer, meanStdPer, mean2StdPer) + '\n')

        fout.close()
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, numBins, numRndGenomes):
        """Bin marker gene positions per genome and plot them as a heatmap.

        The marker set is built from the genomes of ``taxonomyStr`` whose
        metadata status is 'Finished' and whose first taxonomy field is
        'Bacteria' or 'Archaea', minus those reported by
        ``img.genomesWithMissingData``. For each plotted genome the marker
        gene positions are counted into ``numBins`` equal-width bins over the
        genome size. The counts are written to a TSV file and drawn as a
        heatmap, both named ``geneDistribution.<lineage>.<thresholds>.*``.

        ``numRndGenomes`` limits the plot to that many randomly chosen
        genomes; -1 plots all of them.
        """
        img = IMG()
        markerSet = MarkerSet()

        metadata = img.genomeMetadata()
        lineageGenomeIds = img.genomeIdsByTaxonomy(taxonomyStr, metadata)

        # build marker set from finished prokaryotic genomes
        genomeIds = []
        for genomeId in lineageGenomeIds:
            genomeInfo = metadata[genomeId]
            if genomeInfo['status'] == 'Finished' and genomeInfo['taxonomy'][0] in ('Bacteria', 'Archaea'):
                genomeIds.append(genomeId)
        genomeIds = set(genomeIds) - img.genomesWithMissingData(genomeIds)

        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get marker set
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable, 0.9*ubiquityThreshold, 0.9*singleCopyThreshold)
        markerGenes = markerSet.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        tigrToRemove = img.identifyRedundantTIGRFAMs(markerGenes)
        markerGenes = markerGenes - tigrToRemove
        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)

        print('Number of marker genes: ' + str(len(markerGenes)))

        # randomly set genomes to plot
        if numRndGenomes != -1:
            genomeIds = random.sample(list(genomeIds), numRndGenomes)
        genomeIds = set(genomeIds)

        # the TSV and the PNG share everything but the extension
        baseName = 'geneDistribution.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)

        # plot distribution of marker genes
        matrix = []
        rowLabels = []
        # the with-block closes the file even if a genome fails mid-way
        with open(baseName + '.tsv', 'w') as fout:
            fout.write('Genome ID\tLineage\tNumber of Genes\tUniformity\tDistribution\n')
            for genomeId in genomeIds:
                genomeInfo = metadata[genomeId]
                binSize = float(genomeInfo['genome size']) / numBins

                binCounts = [0]*numBins
                pts = []
                for data in geneDistTable[genomeId].values():
                    for genePos in data:
                        # A position at or beyond the genome size maps to
                        # index numBins, which is one past the last bin;
                        # clamp it instead of raising IndexError.
                        # NOTE(review): positions may exceed the genome size
                        # because of spacingBetweenContigs -- confirm.
                        binNum = min(int(genePos[1] / binSize), numBins - 1)
                        binCounts[binNum] += 1
                        pts.append(genePos[1])
                matrix.append(binCounts)

                u = markerSet.uniformity(genomeInfo['genome size'], pts)

                fout.write(genomeId + '\t' + '; '.join(genomeInfo['taxonomy']) + '\t' + str(len(geneDistTable[genomeId])) + '\t%.3f' % u)
                for count in binCounts:
                    fout.write('\t' + str(count))
                fout.write('\n')

                rowLabels.append('%.2f' % u + ', ' + str(genomeId) + ' - ' + '; '.join(genomeInfo['taxonomy'][0:5]))

        # plot data
        heatmap = Heatmap()
        heatmap.plot(baseName + '.png', matrix, rowLabels, 0.6)
Beispiel #16
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers):
        """Leave-one-out test of marker set size for each lineage.

        For 'prokaryotes' and every lineage from ``img.lineagesByCriteria``,
        the marker set of the trusted genomes is recomputed with each genome
        left out in turn. The differences from the full marker set size are
        printed as summary statistics and drawn as one box per lineage in
        ``./images/LOO.<thresholds>.boxplot.png``.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))

        img = IMG()

        deltaMarkerSetSizes = []
        boxPlotLabels = []

        lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)
        for lineage in lineages:
            lineageIds = img.genomeIdsByTaxonomy(lineage)
            trusted = img.trustedGenomes()
            genomeIds = list(lineageIds.intersection(trusted))

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                            ubiquityThreshold * 0.9,
                                            singleCopyThreshold * 0.9)

            # the full set uses cutoffs scaled to the leave-one-out genome count
            looCount = len(genomeIds) - 1
            fullMarkerSet = img.markerGenes(genomeIds, pfamTable,
                                            ubiquityThreshold * looCount,
                                            singleCopyThreshold * looCount)
            fullMarkerSetSize = len(fullMarkerSet)

            if fullMarkerSetSize < minMarkers:
                continue

            shortName = lineage.split(';')[-1].strip()
            boxPlotLabels.append(shortName + ' (' + str(len(genomeIds)) +
                                 ', ' + str(fullMarkerSetSize) + ')')

            deltaMarkerSetSize = []
            numGenomes = len(genomeIds) - 1

            for loo in range(0, len(genomeIds)):
                # for the last index the trailing slice is empty, so one
                # expression covers every position
                genomeIdSubset = genomeIds[0:loo] + genomeIds[loo + 1:]

                looMarkerSet = img.markerGenes(
                    genomeIdSubset, pfamTable,
                    ubiquityThreshold * len(genomeIdSubset),
                    singleCopyThreshold * len(genomeIdSubset))
                deltaMarkerSetSize.append(fullMarkerSetSize - len(looMarkerSet))

                if len(looMarkerSet) > fullMarkerSetSize:
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            looUbiquity = str(int(ubiquityThreshold * numGenomes))
            looSingleCopy = str(int(singleCopyThreshold * numGenomes))
            print('  LOO Ubiquity >= ' + looUbiquity +
                  ', LOO Single-copy >= ' + looSingleCopy)
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' %
                  (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        thresholdTag = str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
        plotFilename = './images/LOO.' + thresholdTag + '.boxplot.png'
        title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
        BoxPlot().plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                       r'$\Delta$' + ' Marker Set Size', '', False, title)
Beispiel #17
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
        """Plot marker set size against the number of sampled genomes.

        For each genome count from ``minGenomes`` up to (but excluding)
        ``maxGenomes`` in steps of ``stepSize``, ``replicates`` random genome
        subsets are drawn and the number of co-located marker sets is
        recorded. The results are saved as an error-bar plot and a box plot
        under ``./images/``. ``maxGenomes`` of -1, or any value above the
        number of available genomes, is replaced by that number.

        Writes an error to stderr and exits the process when the lineage has
        fewer than ``minGenomes`` genomes.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')
        if len(genomeIds) < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            sys.exit()

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []

        # -1 means "use every genome"; larger requests are capped as well
        if maxGenomes == -1 or maxGenomes > len(genomeIds):
            maxGenomes = len(genomeIds)

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        for numGenomes in xrange(minGenomes, maxGenomes, stepSize):
            markerSetSize = []
            for _ in xrange(0, replicates):
                genomeIdSubset = random.sample(genomeIds, numGenomes)

                ubiquityCutoff = ubiquityThreshold*len(genomeIdSubset)
                singleCopyCutoff = singleCopyThreshold*len(genomeIdSubset)
                markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityCutoff, singleCopyCutoff)
                geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            s = std(markerSetSize)
            meanMarkerSetSize.append(m)
            stdMarkerSetSize.append(s)

            # the cutoffs shown are those of the last subset drawn above
            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityThreshold*len(genomeIdSubset))) + ', Single-copy > ' + str(int(singleCopyThreshold*len(genomeIdSubset))))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' %(min(markerSetSize), max(markerSetSize)))

        # plot data
        filePrefix = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold

        errorBar = ErrorBar()
        errorBar.plot(filePrefix + '.errorbar.png', arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(filePrefix + '.boxplot.png', markerSetSizes, arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes', 'Marker Set Size', True, title)
Beispiel #18
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            replicates, minGenomes, maxGenomes, stepSize):
        """Report and plot marker set size against the number of genomes sampled.

        For each genome count in range(minGenomes, maxGenomes, stepSize),
        `replicates` random subsets of the lineage's genomes are drawn, a
        co-located marker set is built for each subset and the number of
        sets is recorded. Mean, standard deviation, minimum and maximum are
        printed per genome count; an error-bar plot and a box plot are then
        written under ./images/.

        taxonomyStr: semicolon-separated lineage passed to
            IMG.genomeIdsByTaxonomy.
        ubiquityThreshold, singleCopyThreshold: fractions that are multiplied
            by the subset size before being handed to MarkerGenes.identify.
        replicates: number of random subsets drawn per genome count.
        minGenomes: smallest subset size; the process exits with status 1
            when the lineage holds fewer genomes than this.
        maxGenomes: exclusive upper bound on the subset size; -1, or any
            value above the lineage size, is replaced by the lineage size.
        stepSize: increment between successive subset sizes.
        """
        img = IMG()
        markergenes = MarkerGenes()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        totalGenomes = len(genomeIds)

        print('Lineage ' + taxonomyStr + ' contains ' + str(totalGenomes) +
              ' genomes.')
        if totalGenomes < minGenomes:
            sys.stderr.write('[Error] Insufficent number of genomes.\n')
            # signal the failure through a non-zero exit status
            sys.exit(1)

        print('')
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))

        # -1 means "use every genome"; larger requests are capped
        if maxGenomes == -1 or maxGenomes > totalGenomes:
            maxGenomes = totalGenomes

        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable)

        # random.sample() only accepts sequences from Python 3.11 on, so the
        # ids are materialised once as a list before sampling
        samplePool = list(genomeIds)

        meanMarkerSetSize = []
        stdMarkerSetSize = []
        markerSetSizes = []
        for numGenomes in range(minGenomes, maxGenomes, stepSize):
            # each subset has numGenomes members, so these do not vary
            # between replicates
            minUbiquity = ubiquityThreshold * numGenomes
            minSingleCopy = singleCopyThreshold * numGenomes

            markerSetSize = []
            for _ in range(0, replicates):
                genomeIdSubset = random.sample(samplePool, numGenomes)

                markerGenes = markergenes.identify(genomeIdSubset, countTable,
                                                   minUbiquity, minSingleCopy)
                geneDistTable = img.geneDistTable(genomeIdSubset,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                colocatedGenes = img.colocatedGenes(geneDistTable)
                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

                markerSetSize.append(len(colocatedSets))

            markerSetSizes.append(markerSetSize)

            m = mean(markerSetSize)
            meanMarkerSetSize.append(m)

            s = std(markerSetSize)
            stdMarkerSetSize.append(s)

            print('')
            print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' +
                  str(int(minUbiquity)) + ', Single-copy > ' +
                  str(int(minSingleCopy)))
            print('Mean: %.2f +/- %.2f' % (m, s))
            print('Min: %d, Max: %d' %
                  (min(markerSetSize), max(markerSetSize)))

        # plot data
        plotPrefix = ('./images/markerset.' + taxonomyStr.replace(';', '_') +
                      '.' + str(ubiquityThreshold) + '-' +
                      str(singleCopyThreshold))
        title = (taxonomyStr.replace(';', '; ') + '\n' +
                 'Ubiquity = %.2f' % ubiquityThreshold +
                 ', Single-copy = %.2f' % singleCopyThreshold)

        errorBar = ErrorBar()
        errorBar.plot(plotPrefix + '.errorbar.png',
                      arange(minGenomes, maxGenomes, stepSize),
                      meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes',
                      'Marker Set Size', title)

        boxPlot = BoxPlot()
        boxPlot.plot(plotPrefix + '.boxplot.png', markerSetSizes,
                     arange(minGenomes, maxGenomes, stepSize),
                     'Number of Genomes', 'Marker Set Size', True, title)
    def run(self):
        """Run Prodigal on every genome with sequence data and collect per-group characteristics.

        Genomes are read from the IMG metadata file and the IMG virus
        metadata file (tab-separated, first line is a header, genome id in
        column 0 and domain in column 1). A genome is kept only if
        <genomeDir><id>/<id>.fna exists; otherwise it is counted as missing
        under its domain. Bacteria, Archaea and viruses are grouped by
        domain ('d__' prefix); eukaryotes are grouped by phylum ('p__'
        prefix, read from column 6) and only for a fixed list of phyla.
        Each kept genome is passed to self.runProdigal and
        self.getCharacteristics, and the accumulated results are handed to
        self.writeResults.
        """
        img = IMG()

        # eukaryotic phyla that are assigned a group; all others are ignored
        eukaryoticPhyla = ('Apicomplexa', 'Arthropoda', 'Ascomycota',
                           'Chlorophyta', 'Chordata', 'Streptophyta')

        genomeIdToGroup = {}
        missingGenomeData = {}

        # get genome ids of all prokaryotes and euks
        print('Reading Proks and Euks metadata.')
        with open(img.metadataFile) as metadataIn:
            next(metadataIn, None)  # skip header line
            for line in metadataIn:
                lineSplit = line.split('\t')
                genomeId = lineSplit[0].strip()
                domain = lineSplit[1].strip()

                if not os.path.exists(IMG.genomeDir + genomeId + '/' +
                                      genomeId + '.fna'):
                    missingGenomeData[domain] = missingGenomeData.get(
                        domain, 0) + 1
                    continue

                if domain in ('Bacteria', 'Archaea'):
                    genomeIdToGroup[genomeId] = 'd__' + domain
                elif domain == 'Eukaryota':
                    phylum = lineSplit[6].strip()
                    if phylum in eukaryoticPhyla:
                        genomeIdToGroup[genomeId] = 'p__' + phylum

        # get genome ids of all viruses
        print('Reading Virus metadata.')
        with open(img.virusMetadataFile) as virusIn:
            next(virusIn, None)  # skip header line
            for line in virusIn:
                lineSplit = line.split('\t')
                genomeId = lineSplit[0].strip()
                domain = lineSplit[1].strip()

                if os.path.exists(IMG.genomeDir + genomeId + '/' + genomeId +
                                  '.fna'):
                    genomeIdToGroup[genomeId] = 'd__' + domain
                else:
                    missingGenomeData[domain] = missingGenomeData.get(
                        domain, 0) + 1

        # report results
        print('Number of valid genomes: ' + str(len(genomeIdToGroup)))
        print('Number of genomes missing genomic data: ')
        for domain, count in missingGenomeData.items():
            print('  ' + domain + ': ' + str(count))

        # process all genomes
        charByGroup = {}
        for genomeId, groupName in genomeIdToGroup.items():
            print(genomeId)
            self.runProdigal(genomeId)
            self.getCharacteristics(genomeId, groupName, charByGroup)

        # write out results
        self.writeResults(charByGroup)
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            mostSpecificRank, minMarkers, completenessThreshold,
            contaminationThreshold):
        """Find genomes that look incomplete or contaminated under their lineage marker sets.

        For every lineage returned by IMG.lineagesByCriteria, a co-located
        marker set is built from the lineage's genomes and each genome is
        scored with MarkerSet.genomeCheck. A genome whose completeness is
        below completenessThreshold or whose contamination is above
        contaminationThreshold is recorded together with the lineage it
        failed in. Lineages with fewer than minMarkers marker genes are
        skipped. The results are written to ./data/degenerate_genomes.tsv,
        one row per genome, with five extra columns appended for every
        lineage in which the genome failed.

        ubiquityThreshold, singleCopyThreshold: fractions that are scaled by
            the number of genomes in the lineage when selecting marker genes.
        minGenomes, mostSpecificRank: forwarded to IMG.lineagesByCriteria.
        """
        print('Ubiquity threshold: ' + str(ubiquityThreshold))
        print('Single-copy threshold: ' + str(singleCopyThreshold))
        print('Min. genomes: ' + str(minGenomes))
        print('Most specific taxonomic rank: ' + str(mostSpecificRank))
        print('Min markers: ' + str(minMarkers))
        print('Completeness threshold: ' + str(completenessThreshold))
        print('Contamination threshold: ' + str(contaminationThreshold))

        img = IMG()
        markerset = MarkerSet()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # genome id -> list of [lineage, # genomes, marker set size,
        # completeness, contamination] records, one per failing lineage
        degenerateGenomes = {}
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')

            print('')
            print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # get table of PFAMs and do some initial filtering to remove PFAMs that are
            # clearly not going to pass the ubiquity and single-copy thresholds
            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable,
                                         ubiquityThreshold * 0.9,
                                         singleCopyThreshold * 0.9)

            markerGenes = markerset.markerGenes(
                genomeIds, countTable, ubiquityThreshold * len(genomeIds),
                singleCopyThreshold * len(genomeIds))
            if len(markerGenes) < minMarkers:
                continue

            geneDistTable = img.geneDistTable(genomeIds,
                                              markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable)
            colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                    markerGenes)

            lineageName = lineage.split(';')[-1].strip()
            for genomeId in genomeIds:
                completeness, contamination = markerset.genomeCheck(
                    colocatedSets, genomeId, countTable)

                if completeness < completenessThreshold or contamination > contaminationThreshold:
                    degenerateGenomes.setdefault(genomeId, []).append([
                        lineageName,
                        len(genomeIds),
                        len(colocatedSets), completeness, contamination
                    ])

        # write out degenerate genomes
        metadata = img.genomeMetadata('Final')

        with open('./data/degenerate_genomes.tsv', 'w') as fout:
            # NOTE(review): the column is labelled Gbps but the value written
            # below is the genome size divided by 1e6 - confirm the intended unit
            fout.write(
                'Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n'
            )
            for genomeId, data in degenerateGenomes.items():
                info = metadata[genomeId]
                row = (genomeId + '\t' + '; '.join(info['taxonomy']) +
                       '\t%.2f' % (float(info['genome size']) / 1e6) + '\t' +
                       str(info['scaffold count']) + '\t' +
                       info['biotic relationships'] + '\t' + info['status'])

                for d in data:
                    row += ('\t' + d[0] + '\t' + str(d[1]) + '\t' +
                            str(d[2]) + '\t%.3f\t%.3f' % (d[3], d[4]))

                fout.write(row + '\n')
    def run(self):
        """Measure how well marker gene order is conserved between archaeal genomes.

        Each genome is described by a vector holding, for every archaeal
        marker gene, the first value of the gene's first geneDistTable
        entry divided by the genome size, or -1 with a mask flag when the
        gene is absent from the genome. For every pair of genomes the
        second vector is rotated through all len(markerGenes) offsets; at
        each offset the Spearman and Pearson correlations are computed over
        the positions present in both genomes, and the largest absolute
        value of each is kept. The mean and standard deviation over all
        pairs are printed.
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata('Final')

        print('Getting marker genes.')
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print('  Marker genes: ' + str(len(markerGenes)))

        print('Getting genomes of interest.')
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print('  Genomes: ' + str(len(genomeIds)))

        print('Getting position of each marker gene.')
        geneDistTable = img.geneDistTable(genomeIds, markerGenes)

        genomeIds = list(genomeIds)

        # Build the (relative positions, mask) vectors once per genome rather
        # than once per genome pair; mask is 1 where the marker is missing.
        profiles = {}
        for genomeId in genomeIds:
            geneOrder = []
            mask = []
            for markerGenesId in markerGenes:
                if markerGenesId in geneDistTable[genomeId]:
                    geneOrder.append(float(geneDistTable[genomeId][markerGenesId][0][0]) / metadata[genomeId]['genome size'])
                    mask.append(0)
                else:
                    geneOrder.append(-1)
                    mask.append(1)
            profiles[genomeId] = (geneOrder, mask)

        spearmanValues = []
        pearsonValues = []
        for i in range(0, len(genomeIds)):
            print(str(i + 1) + ' of ' + str(len(genomeIds)))

            geneOrderI, maskI = profiles[genomeIds[i]]

            for j in range(i + 1, len(genomeIds)):
                # the rotations below rebind these names to new lists, so the
                # cached vectors are never modified
                geneOrderJ, maskJ = profiles[genomeIds[j]]

                # test all translations
                bestSpearman = 0
                bestPearson = 0
                for _ in range(0, len(markerGenes)):
                    maskedI = []
                    maskedJ = []
                    for posI, missingI, posJ, missingJ in zip(geneOrderI, maskI, geneOrderJ, maskJ):
                        if missingI == 0 and missingJ == 0:
                            maskedI.append(posI)
                            maskedJ.append(posJ)

                    r = spearmanr(maskedI, maskedJ)[0]
                    if abs(r) > bestSpearman:
                        bestSpearman = abs(r)

                    r = pearsonr(maskedI, maskedJ)[0]
                    if abs(r) > bestPearson:
                        bestPearson = abs(r)

                    # rotate genome J by one marker position
                    geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                    maskJ = maskJ[1:] + [maskJ[0]]

                spearmanValues.append(bestSpearman)
                pearsonValues.append(bestPearson)

        print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
        print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
            singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
            contigLen):
        """Compare completion estimates from marker sets built at each rank of a lineage.

        taxonomyStr is split on ';' and a marker set is built for every
        prefix of it (first taxon, first two taxa, ...). Genomes are then
        taken from the last, most specific, lineage; for each one
        numReplicates partial genomes are drawn with IMG.sampleGenome and
        the completion estimated from every lineage's co-located marker
        sets. The difference between estimate and percentCompletion is
        collected per (genome, lineage) and shown as a box plot written
        under ./images/.

        mostSpecificRank, minGenomes: accepted but not used by this method.
        ubiquityThreshold, singleCopyThreshold: fractions scaled by the
            number of genomes in each lineage when selecting marker genes.
        numGenomes: number of genomes to sample at random; -1 uses all
            genomes of the most specific lineage.
        contigLen: forwarded to IMG.sampleGenome and
            IMG.containedMarkerGenes.
        """
        img = IMG()

        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

        # get all marker sets
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) +
                  ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable,
                                 ubiquityThreshold * len(genomeIds),
                                 singleCopyThreshold * len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still refers to the last lineage
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() needs a sequence; sets are rejected from
            # Python 3.11 on
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using the marker sets of every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in range(len(lineages))]
            for _ in range(0, numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with marker set
                for i in range(len(lineages)):
                    containedMarkerGenes = img.containedMarkerGenes(
                        markerGenes[i], geneDistTable[i][genomeId],
                        startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for cs in colocatedSets[i]:
                        present = sum(1 for geneId in cs
                                      if geneId in containedMarkerGenes)
                        comp += float(present) / len(cs)

                    completion[i].append(comp / len(colocatedSets[i]) -
                                         percentCompletion)

            # One label per data series. Labels used to be appended inside
            # the replicate loop, giving numReplicates labels per series.
            for i, deltas in enumerate(completion):
                plotData.append(deltas)
                plotLabels.append(genomeId + '  - ' + lineages[i])

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self):
        """Measure how well marker gene order is conserved between archaeal genomes.

        Every genome is turned into a vector with one entry per archaeal
        marker gene: the first value of the gene's first geneDistTable entry
        divided by the genome size, or -1 (flagged in a parallel mask) when
        the gene is absent. For each genome pair the second vector is
        rotated through every offset, the Spearman and Pearson correlations
        are computed over entries present in both genomes, and the largest
        absolute value of each is kept. Mean and standard deviation over all
        pairs are printed at the end.
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata('Final')

        print('Getting marker genes.')
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print('  Marker genes: ' + str(len(markerGenes)))

        print('Getting genomes of interest.')
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print('  Genomes: ' + str(len(genomeIds)))

        print('Getting position of each marker gene.')
        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)

        def relativePositions(genomeId):
            # one entry per marker gene: relative position and a 0/1 flag
            # that is 1 when the marker is missing from this genome
            positions = []
            missing = []
            for markerId in markerGenes:
                if markerId in geneDistTable[genomeId]:
                    start = geneDistTable[genomeId][markerId][0][0]
                    positions.append(
                        float(start) / metadata[genomeId]['genome size'])
                    missing.append(0)
                else:
                    positions.append(-1)
                    missing.append(1)
            return positions, missing

        spearmanValues = []
        pearsonValues = []
        genomeIds = list(genomeIds)
        numGenomes = len(genomeIds)
        for i, genomeIdI in enumerate(genomeIds):
            print(str(i + 1) + ' of ' + str(numGenomes))

            geneOrderI, maskI = relativePositions(genomeIdI)

            for genomeIdJ in genomeIds[i + 1:]:
                geneOrderJ, maskJ = relativePositions(genomeIdJ)

                # test all translations
                bestSpearman = 0
                bestPearson = 0
                for _shift in range(len(markerGenes)):
                    maskedI = []
                    maskedJ = []
                    for k, missingI in enumerate(maskI):
                        # skip markers absent from either genome
                        if missingI != 0 or maskJ[k] != 0:
                            continue
                        maskedI.append(geneOrderI[k])
                        maskedJ.append(geneOrderJ[k])

                    rho = abs(spearmanr(maskedI, maskedJ)[0])
                    if rho > bestSpearman:
                        bestSpearman = rho

                    rho = abs(pearsonr(maskedI, maskedJ)[0])
                    if rho > bestPearson:
                        bestPearson = rho

                    # rotate genome J by one marker position
                    geneOrderJ.append(geneOrderJ.pop(0))
                    maskJ.append(maskJ.pop(0))

                spearmanValues.append(bestSpearman)
                pearsonValues.append(bestPearson)

        spearmanSummary = (mean(spearmanValues), std(spearmanValues))
        print('Spearman: %.2f +/- %.2f: ' % spearmanSummary)
        pearsonSummary = (mean(pearsonValues), std(pearsonValues))
        print('Pearson: %.2f +/- %.2f: ' % pearsonSummary)
    def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
        """Split archaeal and bacterial genomes into trusted and filtered sets.

        For each of the two domains a co-located marker set is built from
        all of the domain's genomes and every genome is scored with
        MarkerSet.genomeCheck. Genomes with completeness >=
        trustedCompleteness and contamination <= trustedContamination are
        written to ./data/trusted_genomes.tsv; all others go to
        ./data/filtered_genomes.tsv. Breakdowns of the trusted genomes by
        status, proposal name and phylum are printed per domain. Finally,
        per-lineage counts of all genomes with metadata and of trusted
        genomes are written to ./data/lineage_stats.tsv.

        ubiquityThreshold, singleCopyThreshold: fractions scaled by the
            number of genomes in the domain when selecting marker genes.
        genomeCompleteness, genomeContamination: accepted but not used by
            this method.
        """
        img = IMG()
        markerset = MarkerSet()

        metadata = img.genomeMetadata()

        header = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

        allTrustedGenomeIds = set()
        with open('./data/trusted_genomes.tsv', 'w') as trustedOut, open('./data/filtered_genomes.tsv', 'w') as filteredOut:
            trustedOut.write(header)
            filteredOut.write(header)

            for lineage in ['Archaea', 'Bacteria']:
                # get all genomes in lineage and build gene count table
                print('\nBuilding gene count table.')
                allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
                countTable = img.countTable(allLineageGenomeIds)
                countTable = img.filterTable(allLineageGenomeIds, countTable, 0.9*ubiquityThreshold, 0.9*singleCopyThreshold)

                print('Lineage ' + lineage + ' contains ' + str(len(allLineageGenomeIds)) + ' genomes.')

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker set for genomes
                markerGenes = markerset.markerGenes(allLineageGenomeIds, countTable, ubiquityThreshold*len(allLineageGenomeIds), singleCopyThreshold*len(allLineageGenomeIds))
                print('  Marker genes: ' + str(len(markerGenes)))

                geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
                colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
                print('  Marker set size: ' + str(len(colocatedSets)))

                # identifying trusted genomes (highly complete, low contamination genomes)
                trustedGenomeIds = set()
                for genomeId in allLineageGenomeIds:
                    completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                    if completeness >= trustedCompleteness and contamination <= trustedContamination:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        out = trustedOut
                    else:
                        out = filteredOut

                    # both files share the same row layout
                    info = metadata[genomeId]
                    out.write(genomeId + '\t' + '; '.join(info['taxonomy'])
                              + '\t%.2f' % (float(info['genome size']) / 1e6)
                              + '\t' + str(info['scaffold count'])
                              + '\t' + info['biotic relationships']
                              + '\t' + info['status']
                              + '\t%.3f\t%.3f' % (completeness, contamination) + '\n')

                print('  Trusted genomes: ' + str(len(trustedGenomeIds)))

                # determine status of trusted genomes
                statusBreakdown = {}
                for genomeId in trustedGenomeIds:
                    status = metadata[genomeId]['status']
                    statusBreakdown[status] = statusBreakdown.get(status, 0) + 1

                print('  Trusted genome status breakdown: ')
                for status, count in statusBreakdown.items():
                    print('    ' + status + ': ' + str(count))

                # determine proposal name of trusted genomes
                proposalNameBreakdown = {}
                for genomeId in trustedGenomeIds:
                    proposalName = metadata[genomeId]['proposal name']
                    proposalNameBreakdown[proposalName] = proposalNameBreakdown.get(proposalName, 0) + 1

                # only proposal names containing one of these tags are reported
                print('  Retained genome proposal name breakdown: ')
                for pn, count in proposalNameBreakdown.items():
                    if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                        print('    ' + pn + ': ' + str(count))

                # trusted genomes per phylum, reported against the phylum total
                print('  Filtered genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6): # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = '; '.join(data['taxonomy'][0:r+1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted()

        with open('./data/lineage_stats.tsv', 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
Beispiel #25
0
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, numBins,
            numRndGenomes):
        """Tabulate and plot how marker genes are distributed along each genome.

        Marker genes are selected from the finished bacterial and archaeal
        genomes of the lineage (minus those reported by
        IMG.genomesWithMissingData), with redundant TIGRFAMs removed. For
        each genome, the second value of every marker gene entry in the
        gene distance table is binned into numBins equal-width bins over
        the genome size, and MarkerSet.uniformity is computed from the same
        values. One row per genome is written to a
        geneDistribution.<lineage>.<thresholds>.tsv file in the working
        directory and the bin counts are drawn as a heatmap in a .png file
        with the same base name.

        taxonomyStr: semicolon-separated lineage.
        ubiquityThreshold, singleCopyThreshold: fractions scaled by the
            number of genomes when selecting marker genes.
        numBins: number of bins per genome.
        numRndGenomes: number of genomes to report, chosen at random; -1
            reports all of them.
        """
        img = IMG()
        markerSet = MarkerSet()

        metadata = img.genomeMetadata()
        lineageGenomeIds = img.genomeIdsByTaxonomy(taxonomyStr, metadata)

        # build marker set from finished prokaryotic genomes
        genomeIds = [
            genomeId for genomeId in lineageGenomeIds
            if metadata[genomeId]['status'] == 'Finished'
            and metadata[genomeId]['taxonomy'][0] in ('Bacteria', 'Archaea')
        ]
        genomeIds = set(genomeIds) - img.genomesWithMissingData(genomeIds)

        print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) +
              ' genomes.')

        # get marker set
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable,
                                     0.9 * ubiquityThreshold,
                                     0.9 * singleCopyThreshold)
        markerGenes = markerSet.markerGenes(
            genomeIds, countTable, ubiquityThreshold * len(genomeIds),
            singleCopyThreshold * len(genomeIds))
        tigrToRemove = img.identifyRedundantTIGRFAMs(markerGenes)
        markerGenes = markerGenes - tigrToRemove
        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)

        print('Number of marker genes: ' + str(len(markerGenes)))

        # randomly set genomes to plot
        if numRndGenomes != -1:
            genomeIds = random.sample(list(genomeIds), numRndGenomes)
        genomeIds = set(genomeIds)

        # the table and the plot share the same base name
        baseName = 'geneDistribution.' + taxonomyStr.replace(
            ';', '_') + '.' + str(ubiquityThreshold) + '-' + str(
                singleCopyThreshold)

        # tabulate distribution of marker genes
        matrix = []
        rowLabels = []
        with open(baseName + '.tsv', 'w') as fout:
            fout.write(
                'Genome ID\tLineage\tNumber of Genes\tUniformity\tDistribution\n'
            )
            for genomeId in genomeIds:
                genomeSize = metadata[genomeId]['genome size']
                binSize = float(genomeSize) / numBins

                binCounts = [0] * numBins
                pts = []
                for data in geneDistTable[genomeId].values():
                    for genePos in data:
                        # NOTE(review): a position equal to or beyond the
                        # genome size gives an index >= numBins and raises
                        # IndexError - confirm positions stay below it
                        binNum = int(genePos[1] / binSize)
                        binCounts[binNum] += 1
                        pts.append(genePos[1])
                matrix.append(binCounts)

                u = markerSet.uniformity(genomeSize, pts)

                fout.write(genomeId + '\t' +
                           '; '.join(metadata[genomeId]['taxonomy']) + '\t' +
                           str(len(geneDistTable[genomeId])) + '\t%.3f' % u +
                           ''.join('\t' + str(count) for count in binCounts) +
                           '\n')

                rowLabels.append('%.2f' % u + ', ' + str(genomeId) + ' - ' +
                                 '; '.join(metadata[genomeId]['taxonomy'][0:5]))

        # plot data
        heatmap = Heatmap()
        heatmap.plot(baseName + '.png', matrix, rowLabels, 0.6)
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from individual marker genes and from co-located marker sets.

        Marker genes and co-located marker sets are built from the genomes
        of the lineage. For each selected genome, numReplicates partial
        genomes are drawn with IMG.sampleGenome; completion is estimated
        once as the fraction of marker genes contained in the sample and
        once as the mean per-set fraction over the co-located sets. The
        difference between each estimate and percentCompletion is shown as
        a box plot written under ./images/, two boxes per genome (marker
        genes first, marker sets second).

        taxonomyStr: semicolon-separated lineage.
        ubiquityThreshold, singleCopyThreshold: fractions scaled by the
            number of genomes when selecting marker genes.
        numGenomes: number of genomes to sample at random; -1 uses all.
        contigLen: forwarded to IMG.sampleGenome and
            IMG.containedMarkerGenes.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() needs a sequence; sets are rejected from Python 3.11 on
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes and marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(0, numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)

                # calculate completion with marker set
                comp = 0.0
                for cs in colocatedSets:
                    present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                    comp += float(present) / len(cs)
                msCompletion.append(comp / len(colocatedSets) - percentCompletion)

            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # the second box of each pair is left unlabelled
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes,
            minMarkers, mostSpecificRank, percentGenomes, numReplicates):
        """Measure how much a lineage's marker genes change when genomes are withheld.

        For every lineage returned by ``img.lineagesByCriteria`` that has at
        least ``minGenomes`` 'Final' genomes and at least ``minMarkers``
        marker genes, the marker genes are recomputed ``numReplicates`` times
        on a random subset of the genomes. The size of the symmetric
        difference to the full marker gene set is collected, and its mean and
        standard deviation are printed and appended as one row to
        ./data/lineage_evaluation.tsv.

        Parameters:
            ubiquityThreshold, singleCopyThreshold: scaled by the number of
                genomes in use before being passed to ``img.markerGenes``;
                90% of each is used for the initial ``img.filterTable`` pass.
            minGenomes: lineages with fewer genomes are skipped.
            minMarkers: lineages with fewer marker genes are skipped.
            mostSpecificRank: forwarded to ``img.lineagesByCriteria``.
            percentGenomes: fraction of genomes withheld in each replicate;
                the subset keeps ``round((1 - percentGenomes) * n)`` genomes.
            numReplicates: number of random subsets drawn per lineage.
        """
        img = IMG()

        lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

        # the context manager closes the report even when a lineage raises
        with open('./data/lineage_evaluation.tsv', 'w') as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n'
            )

            for lineage in lineages:
                genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
                if len(genomeIds) < minGenomes:
                    continue

                # cheap pre-filter with slightly relaxed thresholds
                countTable = img.countTable(genomeIds)
                countTable = img.filterTable(genomeIds, countTable,
                                             ubiquityThreshold * 0.9,
                                             singleCopyThreshold * 0.9)

                # marker genes over all genomes of the lineage
                markerGenes = img.markerGenes(
                    genomeIds, countTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                numMarkers = len(markerGenes)
                if numMarkers < minMarkers:
                    continue

                print('\nLineage ' + lineage + ' contains '
                      + str(len(genomeIds)) + ' genomes.')
                print('  Marker genes: ' + str(numMarkers))

                # random.sample() only accepts sequences on Python >= 3.11,
                # and the return type of genomeIdsByTaxonomy is not visible
                # here, so materialise the ids once
                genomePool = list(genomeIds)
                numRetained = int(
                    (1.0 - percentGenomes) * len(genomePool) + 0.5)

                # withhold the chosen share of genomes and recompute markers
                changeMarkerSetSize = []
                for _ in range(numReplicates):
                    subsetGenomeIds = random.sample(genomePool, numRetained)

                    newMarkerGenes = img.markerGenes(
                        subsetGenomeIds, countTable,
                        ubiquityThreshold * len(subsetGenomeIds),
                        singleCopyThreshold * len(subsetGenomeIds))

                    changeMarkerSetSize.append(
                        len(newMarkerGenes.symmetric_difference(markerGenes)))

                m = mean(changeMarkerSetSize)
                s = std(changeMarkerSetSize)

                print('  Mean: %.2f, Std: %.2f, Per: %.2f'
                      % (m, s, (m + 2 * s) * 100 / numMarkers))

                # emit the row in one write so a failure above can never
                # leave a truncated line in the report
                fout.write(
                    lineage + '\t' + str(len(genomeIds)) + '\t'
                    + str(numMarkers) + '\t%.2f' % percentGenomes
                    + '\t' + str(numReplicates)
                    + '\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f'
                    % (m, s, m * 100 / numMarkers,
                       (m + s) * 100 / numMarkers,
                       (m + 2 * s) * 100 / numMarkers)
                    + '\n')
Beispiel #28
0
    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
        """Leave-one-out test of marker set size for each lineage.

        For 'prokaryotes' plus every lineage from ``img.lineagesByCriteria``,
        the marker set is computed on the trusted genomes of the lineage and
        then recomputed with each genome removed in turn. The drop in marker
        set size per removal is summarised on stdout and all lineages are
        drawn together as a box plot under ./images/.

        Parameters:
            ubiquityThreshold, singleCopyThreshold: scaled by the genome
                count before being passed to ``img.markerGenes``. The full
                set is computed with thresholds scaled by (genome count - 1),
                the same scale the leave-one-out subsets use.
            minGenomes, mostSpecificRank: forwarded to
                ``img.lineagesByCriteria``.
            minMarkers: lineages whose full marker set is smaller are skipped.
        """
        settings = (
            ('Ubiquity threshold: ', ubiquityThreshold),
            ('Single-copy threshold: ', singleCopyThreshold),
            ('Min. genomes: ', minGenomes),
            ('Most specific taxonomic rank: ', mostSpecificRank),
        )
        for caption, value in settings:
            print(caption + str(value))

        img = IMG()

        deltaMarkerSetSizes = []
        boxPlotLabels = []

        lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)
        for lineage in lineages:
            lineageGenomes = img.genomeIdsByTaxonomy(lineage)
            # NOTE(review): trustedGenomes() is queried again for every
            # lineage; it looks loop-invariant -- confirm before hoisting
            trusted = img.trustedGenomes()
            genomeIds = list(lineageGenomes.intersection(trusted))

            print('')
            print('Lineage ' + lineage + ' contains %d genomes.' % len(genomeIds))

            # initial filtering drops PFAMs that clearly cannot pass the
            # ubiquity and single-copy thresholds
            pfamTable = img.pfamTable(genomeIds)
            pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold*0.9, singleCopyThreshold*0.9)

            fullMarkerSet = img.markerGenes(
                genomeIds,
                pfamTable,
                ubiquityThreshold*(len(genomeIds)-1),
                singleCopyThreshold*(len(genomeIds)-1),
            )
            fullMarkerSetSize = len(fullMarkerSet)
            if fullMarkerSetSize < minMarkers:
                continue

            leaf = lineage.split(';')[-1].strip()
            boxPlotLabels.append(leaf + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

            numGenomes = len(genomeIds)-1

            deltaMarkerSetSize = []
            for heldOut in range(len(genomeIds)):
                # slicing past the end yields [], so the last index needs
                # no special handling
                genomeIdSubset = genomeIds[:heldOut] + genomeIds[heldOut+1:]

                looMarkerSet = img.markerGenes(
                    genomeIdSubset,
                    pfamTable,
                    ubiquityThreshold*len(genomeIdSubset),
                    singleCopyThreshold*len(genomeIdSubset),
                )
                deltaMarkerSetSize.append(fullMarkerSetSize - len(looMarkerSet))

                if len(looMarkerSet) > fullMarkerSetSize:
                    print('[Warning] Unexpected!')

            deltaMarkerSetSizes.append(deltaMarkerSetSize)

            m = mean(deltaMarkerSetSize)
            s = std(deltaMarkerSetSize)

            print('  LOO Ubiquity >= ' + str(int(ubiquityThreshold*numGenomes))
                  + ', LOO Single-copy >= ' + str(int(singleCopyThreshold*numGenomes)))
            print('  Delta Mean: %.2f +/- %.2f' % (m, s))
            print('  Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

        # plot data
        plotFilename = ('./images/LOO.' + str(ubiquityThreshold) + '-'
                        + str(singleCopyThreshold) + '.boxplot.png')
        title = ('Ubiquity = %.2f' % ubiquityThreshold) + (', Single-copy = %.2f' % singleCopyThreshold)
        BoxPlot().plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)
Beispiel #29
0
    def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination, genomeCompleteness, genomeContamination):
        """Split Archaea and Bacteria genomes into trusted and filtered sets.

        For each of the two domains a marker set is built from all genomes of
        the domain, every genome is scored with ``markerset.genomeCheck``, and
        the genome is written to ./data/trusted_genomes.tsv when it meets both
        limits or to ./data/filtered_genomes.tsv otherwise. Breakdowns by
        status, proposal name and phylum are printed, and per-lineage genome
        counts (all vs. trusted) are written to ./data/lineage_stats.tsv.

        Parameters:
            ubiquityThreshold, singleCopyThreshold: scaled by the genome
                count before being passed to ``markerset.markerGenes``; 90% of
                each is used for the initial ``img.filterTable`` pass.
            trustedCompleteness: minimum completeness for a trusted genome.
            trustedContamination: maximum contamination for a trusted genome.
            genomeCompleteness, genomeContamination: accepted for interface
                compatibility; not used by this method.
        """
        img = IMG()
        markerset = MarkerSet()

        metadata = img.genomeMetadata()

        reportHeader = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

        def genomeRow(genomeId, completeness, contamination):
            """Format the tab-separated report line shared by both reports."""
            info = metadata[genomeId]
            return (genomeId + '\t' + '; '.join(info['taxonomy'])
                    + '\t%.2f' % (float(info['genome size']) / 1e6)
                    + '\t' + str(info['scaffold count'])
                    + '\t' + info['biotic relationships']
                    + '\t' + info['status']
                    + '\t%.3f\t%.3f' % (completeness, contamination) + '\n')

        allTrustedGenomeIds = set()

        # both reports are closed on exit, including when a lineage raises
        with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
                open('./data/filtered_genomes.tsv', 'w') as filteredOut:
            trustedOut.write(reportHeader)
            filteredOut.write(reportHeader)

            for lineage in ['Archaea', 'Bacteria']:
                # get all genomes in lineage and build gene count table
                print('\nBuilding gene count table.')
                allLineageGenomeIds = img.genomeIdsByTaxonomy(
                    lineage, metadata, 'All')
                countTable = img.countTable(allLineageGenomeIds)
                countTable = img.filterTable(allLineageGenomeIds, countTable,
                                             0.9 * ubiquityThreshold,
                                             0.9 * singleCopyThreshold)

                print('Lineage ' + lineage + ' contains '
                      + str(len(allLineageGenomeIds)) + ' genomes.')

                # tabulate genomes from each phylum (taxonomy index 1)
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker set for genomes
                markerGenes = markerset.markerGenes(
                    allLineageGenomeIds, countTable,
                    ubiquityThreshold * len(allLineageGenomeIds),
                    singleCopyThreshold * len(allLineageGenomeIds))
                print('  Marker genes: ' + str(len(markerGenes)))

                geneDistTable = img.geneDistTable(allLineageGenomeIds,
                                                  markerGenes,
                                                  spacingBetweenContigs=1e6)
                colocatedGenes = markerset.colocatedGenes(geneDistTable,
                                                          metadata)
                colocatedSets = markerset.colocatedSets(colocatedGenes,
                                                        markerGenes)
                print('  Marker set size: ' + str(len(colocatedSets)))

                # trusted genomes are highly complete with low contamination
                trustedGenomeIds = set()
                for genomeId in allLineageGenomeIds:
                    completeness, contamination = markerset.genomeCheck(
                        colocatedSets, genomeId, countTable)
                    row = genomeRow(genomeId, completeness, contamination)

                    if (completeness >= trustedCompleteness
                            and contamination <= trustedContamination):
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        trustedOut.write(row)
                    else:
                        filteredOut.write(row)

                print('  Trusted genomes: ' + str(len(trustedGenomeIds)))

                # sequencing status of the trusted genomes
                statusBreakdown = {}
                for genomeId in trustedGenomeIds:
                    status = metadata[genomeId]['status']
                    statusBreakdown[status] = statusBreakdown.get(status,
                                                                  0) + 1

                print('  Trusted genome status breakdown: ')
                for status, count in statusBreakdown.items():
                    print('    ' + status + ': ' + str(count))

                # proposal names of the trusted genomes; only those that
                # mention KMG, GEBA or HMP are reported
                proposalNameBreakdown = {}
                for genomeId in trustedGenomeIds:
                    proposal = metadata[genomeId]['proposal name']
                    proposalNameBreakdown[proposal] = (
                        proposalNameBreakdown.get(proposal, 0) + 1)

                print('  Retained genome proposal name breakdown: ')
                for pn, count in proposalNameBreakdown.items():
                    if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                        print('    ' + pn + ': ' + str(count))

                # trusted vs. total genome counts for every phylum
                print('  Filtered genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(
                        taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print(phylum + ': %d of %d'
                          % (trustedPhylumCounts.get(phylum, 0), count))

        # lineage statistics for genome distribution: count each genome once
        # at every rank prefix of its taxonomy
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = '; '.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted()

        with open('./data/lineage_stats.tsv', 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0))
                           + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
            percentCompletion, numReplicates, numGenomes, contigLen):
        """Simulate partial genomes and plot the error of two completion estimates.

        Marker genes and co-located marker sets are built from the 'Final'
        genomes of ``taxonomyStr``. For every selected genome,
        ``numReplicates`` partial genomes are drawn with ``img.sampleGenome``
        and two values are recorded per draw, each minus ``percentCompletion``:
        the fraction of marker genes found, and the mean per-set fraction over
        the co-located sets. The values are drawn as a box plot written
        under ./images/.

        Parameters:
            taxonomyStr: semicolon-separated lineage string.
            ubiquityThreshold, singleCopyThreshold: multiplied by the number
                of genomes before being passed to ``img.markerGenes``.
            percentCompletion: handed to ``img.sampleGenome`` and subtracted
                from each estimate (presumably a fraction in [0, 1]; confirm).
            numReplicates: number of simulated draws per genome.
            numGenomes: how many genomes to pick at random; -1 uses them all.
            contigLen: passed to ``img.sampleGenome`` and
                ``img.containedMarkerGenes``.
        """
        img = IMG()

        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
        print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) +
              ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable,
                                      ubiquityThreshold * len(genomeIds),
                                      singleCopyThreshold * len(genomeIds))
        print('  Marker genes: ' + str(len(markerGenes)))

        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
        print('  Co-located gene sets: ' + str(len(colocatedSets)))

        # random sample genomes
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() rejects non-sequences (e.g. sets) from
            # Python 3.11 on; the return type of genomeIdsByTaxonomy is not
            # visible here, so convert defensively
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using both the marker genes
        # and the marker sets
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            mgCompletion = []
            msCompletion = []
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(
                    metadata[genomeId]['genome size'], percentCompletion,
                    contigLen)

                # calculate completion with marker genes
                containedMarkerGenes = img.containedMarkerGenes(
                    markerGenes, geneDistTable[genomeId],
                    startPartialGenomeContigs, contigLen)
                mgCompletion.append(
                    float(len(containedMarkerGenes)) / len(markerGenes) -
                    percentCompletion)

                # calculate completion with marker sets: average the
                # fraction of each co-located set that was recovered
                comp = 0.0
                for cs in colocatedSets:
                    present = sum(1 for member in cs
                                  if member in containedMarkerGenes)
                    comp += float(present) / len(cs)
                msCompletion.append(comp / len(colocatedSets) -
                                    percentCompletion)

            plotData.append(mgCompletion)
            plotData.append(msCompletion)

            species = ' '.join(
                metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

            # two series per genome, so the second one gets a blank label
            plotLabels.append(species + ' (' + genomeId + ')')
            plotLabels.append('')

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(
            ';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(
            ';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels,
                     r'$\Delta$' + ' Percent Completion', '', False, title)
    def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
        """Compare completion estimates from marker sets of nested lineages.

        A marker set is built for every prefix of ``taxonomyStr`` (first rank
        only, first two ranks, ... the full string). Genomes are taken from
        the last, most specific lineage; for each one, ``numReplicates``
        partial genomes are drawn with ``img.sampleGenome`` and the marker-set
        completion estimate of every lineage, minus ``percentCompletion``, is
        recorded. The result is a box plot under ./images/ with one box per
        genome and lineage.

        Parameters:
            taxonomyStr: semicolon-separated lineage string.
            mostSpecificRank, minGenomes: accepted for interface
                compatibility; not used by this method.
            ubiquityThreshold, singleCopyThreshold: multiplied by the number
                of genomes before being passed to ``img.markerGenes``.
            percentCompletion: handed to ``img.sampleGenome`` and subtracted
                from each estimate (presumably a fraction in [0, 1]; confirm).
            numReplicates: number of simulated draws per genome.
            numGenomes: how many genomes to pick at random; -1 uses them all.
            contigLen: passed to ``img.sampleGenome`` and
                ``img.containedMarkerGenes``.
        """
        img = IMG()

        # every prefix of the taxonomy string, broadest rank first
        taxon = taxonomyStr.split(';')
        lineages = [';'.join(taxon[0:r+1]) for r in range(len(taxon))]

        # get all marker sets (parallel lists, indexed like ``lineages``)
        markerGenes = []
        geneDistTable = []
        colocatedSets = []
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

            # build marker genes and colocated marker sets
            countTable = img.countTable(genomeIds)
            mg = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
            print('  Marker genes: ' + str(len(mg)))

            mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(mdt)
            cs = img.colocatedSets(colocatedGenes, mg)
            print('  Co-located gene sets: ' + str(len(cs)))

            markerGenes.append(mg)
            geneDistTable.append(mdt)
            colocatedSets.append(cs)

        # random sample genomes; genomeIds still holds the genomes of the
        # last (most specific) lineage processed above
        if numGenomes == -1:
            rndGenomeIds = genomeIds
        else:
            # random.sample() rejects non-sequences (e.g. sets) from
            # Python 3.11 on; the return type of genomeIdsByTaxonomy is not
            # visible here, so convert defensively
            rndGenomeIds = random.sample(list(genomeIds), numGenomes)

        # estimate completion for each genome using the marker set of
        # every lineage
        metadata = img.genomeMetadata('Final')
        plotLabels = []
        plotData = []
        for genomeId in rndGenomeIds:
            completion = [[] for _ in lineages]
            for _ in range(numReplicates):
                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

                # calculate completion with the marker set of each lineage
                for i, (mg, mdt, sets) in enumerate(zip(markerGenes, geneDistTable, colocatedSets)):
                    containedMarkerGenes = img.containedMarkerGenes(mg, mdt[genomeId], startPartialGenomeContigs, contigLen)

                    comp = 0.0
                    for cs in sets:
                        present = sum(1 for member in cs if member in containedMarkerGenes)
                        comp += float(present) / len(cs)

                    completion[i].append(comp / len(sets) - percentCompletion)

            # exactly one label per data series: labels used to be appended
            # inside the replicate loop, yielding numReplicates times too
            # many labels and misaligning them with the boxes
            for lineage, deltas in zip(lineages, completion):
                plotData.append(deltas)
                plotLabels.append(genomeId + '  - ' + lineage)

        # plot data
        boxPlot = BoxPlot()
        plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)