def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
    """Tabulate marker set size per lineage over a sweep of thresholds.

    Each lineage returned by ``IMG.lineagesSorted(mostSpecificRanks)`` is
    restricted to the trusted genomes; lineages left with fewer than
    ``minGenomes`` genomes are skipped. For every remaining lineage the
    marker genes are computed at each threshold, and the marker set sizes
    are written as one row of ./data/markerSetSize.tsv and echoed to stdout.

    Args:
        minThreshold: stop value of the sweep (handed to ``arange`` as stop).
        maxThreshold: start value of the sweep.
        stepSize: step magnitude; negated so the sweep descends.
        minGenomes: minimum number of trusted genomes a lineage must have.
        mostSpecificRanks: forwarded unchanged to ``IMG.lineagesSorted``.
    """
    img = IMG()
    trustedGenomeIds = img.trustedGenomes()

    # The sweep is the same for the header and for every lineage: build once.
    thresholds = arange(maxThreshold, minThreshold, -stepSize)

    # Context manager so the file is closed even if an IMG call raises.
    with open('./data/markerSetSize.tsv', 'w') as fout:
        fout.write('Lineage\t# genomes')
        for threshold in thresholds:
            fout.write('\t' + str(threshold))
        fout.write('\n')

        lineages = img.lineagesSorted(mostSpecificRanks)
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trustedGenomeIds))
            numGenomes = len(genomeIds)
            if numGenomes < minGenomes:
                continue

            print('\nLineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')
            fout.write(lineage + '\t' + str(numGenomes))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in thresholds:
                # The same genome-count cutoff is used for both trailing
                # arguments of markerGenes.
                cutoff = threshold * numGenomes
                markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)

                fout.write('\t' + str(len(markerSet)))
                print(' Threshold = %.2f, marker set size = %d' % (threshold, len(markerSet)))

            fout.write('\n')
def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
    """Plot the number of co-located marker sets against the threshold.

    Marker genes for the 'Final' genomes of ``taxonomyStr`` are recomputed
    at every threshold of the sweep, grouped into co-located sets, and the
    number of sets per threshold is drawn with ``LinePlot`` into
    ./images/markerSetSize.<lineage>.png.

    Args:
        taxonomyStr: ';'-separated lineage string.
        minThreshold: stop value of the sweep (handed to ``arange`` as stop).
        maxThreshold: start value of the sweep.
        stepSize: step magnitude; negated so the sweep descends.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    numGenomes = len(genomeIds)
    print('Lineage {0} contains {1} genomes.'.format(taxonomyStr, str(numGenomes)))

    countTable = img.countTable(genomeIds)
    thresholds = arange(maxThreshold, minThreshold, -stepSize)

    markerSetSizes = []
    for threshold in thresholds:
        # One cutoff serves as both trailing arguments of markerGenes.
        cutoff = threshold * numGenomes
        markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

        distances = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedSets = img.colocatedSets(img.colocatedGenes(distances), markerGenes)
        markerSetSizes.append(len(colocatedSets))

        # NOTE(review): the message reports len(markerGenes) while the plot
        # uses len(colocatedSets) -- confirm which one is intended.
        print(' Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

    # plot data
    lineageTag = taxonomyStr.replace(';', '_')
    plotFilename = './images/markerSetSize.' + lineageTag + '.png'
    title = taxonomyStr.replace(';', '; ')

    plot = LinePlot()
    plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
    """Tabulate marker set size per lineage over a sweep of thresholds.

    Lineages come from ``IMG.lineagesSorted(mostSpecificRanks)``; each one is
    reduced to its trusted genomes and skipped when fewer than ``minGenomes``
    remain. One row per lineage (genome count followed by the marker set
    size at each threshold) is written to ./data/markerSetSize.tsv, and the
    same numbers are printed to stdout.

    Args:
        minThreshold: stop value of the sweep (handed to ``arange`` as stop).
        maxThreshold: start value of the sweep.
        stepSize: step magnitude; negated so the sweep descends.
        minGenomes: minimum number of trusted genomes a lineage must have.
        mostSpecificRanks: forwarded unchanged to ``IMG.lineagesSorted``.
    """
    img = IMG()
    trustedGenomeIds = img.trustedGenomes()

    # Built once: the header and every lineage row share the same sweep.
    thresholds = arange(maxThreshold, minThreshold, -stepSize)

    # 'with' guarantees the file is closed if any IMG call raises.
    with open("./data/markerSetSize.tsv", "w") as fout:
        fout.write("Lineage\t# genomes")
        for threshold in thresholds:
            fout.write("\t" + str(threshold))
        fout.write("\n")

        lineages = img.lineagesSorted(mostSpecificRanks)
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trustedGenomeIds))
            if len(genomeIds) < minGenomes:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("\nLineage " + lineage + " contains " + str(len(genomeIds)) + " genomes.")
            fout.write(lineage + "\t" + str(len(genomeIds)))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in thresholds:
                markerSet = img.markerGenes(
                    genomeIds, pfamTable, threshold * len(genomeIds), threshold * len(genomeIds)
                )

                fout.write("\t" + str(len(markerSet)))
                print(" Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))

            fout.write("\n")
def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
    """Plot the number of co-located marker sets against the threshold.

    For the 'Final' genomes of ``taxonomyStr``, marker genes are computed at
    each threshold of the sweep and grouped into co-located sets. The number
    of sets per threshold is plotted with ``LinePlot`` to
    ./images/markerSetSize.<lineage>.png.

    Args:
        taxonomyStr: ';'-separated lineage string.
        minThreshold: stop value of the sweep (handed to ``arange`` as stop).
        maxThreshold: start value of the sweep.
        stepSize: step magnitude; negated so the sweep descends.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # The sweep is needed for both the loop and the x-axis: build it once.
    thresholds = arange(maxThreshold, minThreshold, -stepSize)

    markerSetSizes = []
    countTable = img.countTable(genomeIds)
    for threshold in thresholds:
        markerGenes = img.markerGenes(genomeIds, countTable,
                                      threshold * len(genomeIds),
                                      threshold * len(genomeIds))

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

        markerSetSizes.append(len(colocatedSets))

        # NOTE(review): this reports len(markerGenes) although the plotted
        # value is len(colocatedSets) -- confirm which one is intended.
        print(' Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

    # plot data
    plot = LinePlot()
    plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';', '_') + '.png'
    title = taxonomyStr.replace(';', '; ')
    plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
    """Report genomes that look incomplete or contaminated within a lineage.

    For each lineage from ``IMG.lineagesByCriteria`` a marker set is built;
    lineages with fewer than ``minMarkers`` marker genes are skipped. Every
    genome of the lineage is then scored with ``MarkerSet.genomeCheck`` and
    recorded when its completeness is below ``completenessThreshold`` or its
    contamination is above ``contaminationThreshold``. The flagged genomes
    are written to ./data/degenerate_genomes.tsv.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``markerGenes``; 90% of it is used for the initial table filter.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: minimum number of marker genes a lineage must yield.
        completenessThreshold: genomes scoring below this are reported.
        contaminationThreshold: genomes scoring above this are reported.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))
    print('Min markers: ' + str(minMarkers))
    print('Completeness threshold: ' + str(completenessThreshold))
    print('Contamination threshold: ' + str(contaminationThreshold))

    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # genome id -> list of [lineage name, # genomes, # sets, completeness, contamination]
    degenerateGenomes = {}
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')

        print('')
        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

        markerGenes = markerset.markerGenes(genomeIds, countTable,
                                            ubiquityThreshold * len(genomeIds),
                                            singleCopyThreshold * len(genomeIds))
        if len(markerGenes) < minMarkers:
            continue

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = markerset.colocatedGenes(geneDistTable)
        colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

        lineageName = lineage.split(';')[-1].strip()
        for genomeId in genomeIds:
            completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

            if completeness < completenessThreshold or contamination > contaminationThreshold:
                degenerateGenomes.setdefault(genomeId, []).append(
                    [lineageName, len(genomeIds), len(colocatedSets), completeness, contamination])

    # write out degenerate genomes
    metadata = img.genomeMetadata('Final')

    # 'with' guarantees the file is closed even if a metadata lookup raises.
    with open('./data/degenerate_genomes.tsv', 'w') as fout:
        # NOTE(review): the size column is labelled 'Gbps' but the value is
        # genome size / 1e6 -- confirm the intended unit.
        fout.write('Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n')

        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for genomeId, data in degenerateGenomes.items():
            info = metadata[genomeId]
            fout.write(genomeId + '\t' + '; '.join(info['taxonomy'])
                       + '\t%.2f' % (float(info['genome size']) / 1e6)
                       + '\t' + str(info['scaffold count']))
            fout.write('\t' + info['biotic relationships'] + '\t' + info['status'])

            # A genome flagged in several lineages gets one column group per lineage.
            for d in data:
                fout.write('\t' + d[0] + '\t' + str(d[1]) + '\t' + str(d[2]) + '\t%.3f\t%.3f' % (d[3], d[4]))
            fout.write('\n')
def run(self, ubiquityThreshold, singleCopyThreshold, rank):
    """Build marker gene lists at ``rank`` and propagate them to higher ranks.

    Marker genes are computed for every lineage whose ';'-split taxonomy has
    exactly ``rank + 1`` fields and at least 3 'Final' genomes. Moving up
    one rank at a time, each parent lineage then receives the intersection
    of its children's marker genes.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        rank: zero-based index of the most specific rank to process.
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata()
    print(' Genomes with metadata: ' + str(len(metadata)))

    # calculate marker set for each lineage at the specified rank
    sortedLineages = img.lineagesSorted(metadata, rank)
    markerGeneLists = {}
    for lineage in sortedLineages:
        taxonomy = lineage.split(';')
        if len(taxonomy) != rank + 1:
            continue

        genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
        countTable = img.countTable(genomeIds)

        if len(genomeIds) < 3:
            continue

        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        markerGenes = markerset.markerGenes(
            genomeIds, countTable,
            ubiquityThreshold * len(genomeIds),
            singleCopyThreshold * len(genomeIds))

        print(' Marker genes: ' + str(len(markerGenes)))
        print('')

        markerGeneLists[lineage] = markerGenes

    # Combine the children's marker genes for each higher taxonomic group.
    # NOTE(review): the original comment called this a "union", but the code
    # takes the intersection -- confirm which one is intended.
    for r in range(rank - 1, -1, -1):
        print('Processing rank ' + str(r))
        rankMarkerGeneLists = {}

        # dict.iteritems() does not exist on Python 3; items() is portable.
        for lineage, markerGenes in markerGeneLists.items():
            taxonomy = lineage.split(';')
            if len(taxonomy) != r + 2:
                continue

            curLineage = '; '.join(taxonomy[0:r + 1])
            if curLineage not in rankMarkerGeneLists:
                rankMarkerGeneLists[curLineage] = markerGenes
            else:
                rankMarkerGeneLists[curLineage] = rankMarkerGeneLists[curLineage].intersection(markerGenes)

        # Merge only after the scan so the dict is not resized while iterating.
        markerGeneLists.update(rankMarkerGeneLists)
def run(
    self,
    ubiquityThreshold,
    singleCopyThreshold,
    minGenomes,
    minMarkers,
    mostSpecificRank,
    distThreshold,
    genomeThreshold,
):
    """Write the co-located marker sets of every sufficiently large lineage.

    Lineages with fewer than ``minGenomes`` 'Final' genomes, or with fewer
    than ``minMarkers`` co-located sets, are skipped. For the others a row
    holding the genome count, marker gene count, set count and the members
    of each set is written to ./data/colocated.tsv.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: minimum number of genomes a lineage must contain.
        minMarkers: minimum number of co-located sets a lineage must yield.
        mostSpecificRank: forwarded to ``IMG.lineagesSorted``.
        distThreshold: forwarded to ``MarkerSet.colocatedGenes``.
        genomeThreshold: forwarded to ``MarkerSet.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)

    # buffering=1 keeps the file line-buffered; 'with' closes it on any exit.
    with open("./data/colocated.tsv", "w", 1) as fout:
        fout.write("Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n")

        # enumerate replaces the hand-maintained counter (counts skipped lineages too).
        for lineageCount, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, "Final")
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(
                genomeIds, countTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds)
            )

            geneDistTable = img.geneDistTable(genomeIds, markerGenes)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(
                "\nLineage " + lineage + " contains " + str(len(genomeIds)) + " genomes ("
                + str(lineageCount) + " of " + str(len(lineages)) + ")."
            )
            print(" Marker genes: " + str(len(markerGenes)))
            print(" Co-located gene sets: " + str(len(colocatedSets)))

            fout.write(
                lineage + "\t" + str(len(genomeIds)) + "\t" + str(len(markerGenes)) + "\t" + str(len(colocatedSets))
            )
            for cs in colocatedSets:
                fout.write("\t" + ", ".join(cs))
            fout.write("\n")
def run(self, ubiquityThreshold, singleCopyThreshold, rank):
    """Build marker gene lists at ``rank`` and propagate them to higher ranks.

    Lineages whose ';'-split taxonomy has exactly ``rank + 1`` fields and
    that contain at least 3 'Final' genomes get a marker gene list. Then,
    one rank at a time towards the root, each parent lineage is assigned the
    intersection of its children's marker genes.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        rank: zero-based index of the most specific rank to process.
    """
    img = IMG()
    markerset = MarkerSet()

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Reading metadata.')
    metadata = img.genomeMetadata()
    print(' Genomes with metadata: ' + str(len(metadata)))

    # calculate marker set for each lineage at the specified rank
    sortedLineages = img.lineagesSorted(metadata, rank)
    markerGeneLists = {}
    for lineage in sortedLineages:
        taxonomy = lineage.split(';')
        if len(taxonomy) != rank + 1:
            continue

        genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
        countTable = img.countTable(genomeIds)

        if len(genomeIds) < 3:
            continue

        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        markerGenes = markerset.markerGenes(genomeIds, countTable,
                                            ubiquityThreshold * len(genomeIds),
                                            singleCopyThreshold * len(genomeIds))

        print(' Marker genes: ' + str(len(markerGenes)))
        print('')

        markerGeneLists[lineage] = markerGenes

    # Combine the children's marker genes for each higher taxonomic group.
    # NOTE(review): the original comment said "union" while the code
    # intersects -- confirm which one is intended.
    for r in range(rank - 1, -1, -1):  # range instead of the py2-only xrange
        print('Processing rank ' + str(r))
        rankMarkerGeneLists = {}

        # items() instead of the py2-only iteritems().
        for lineage, markerGenes in markerGeneLists.items():
            taxonomy = lineage.split(';')
            if len(taxonomy) != r + 2:
                continue

            curLineage = '; '.join(taxonomy[0:r + 1])
            if curLineage not in rankMarkerGeneLists:
                rankMarkerGeneLists[curLineage] = markerGenes
            else:
                curMarkerGenes = rankMarkerGeneLists[curLineage]
                rankMarkerGeneLists[curLineage] = curMarkerGenes.intersection(markerGenes)

        # combine marker gene list dictionaries (after the scan, never during it)
        markerGeneLists.update(rankMarkerGeneLists)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
    """Write the co-located marker sets of every sufficiently large lineage.

    Lineages with fewer than ``minGenomes`` 'Final' genomes, or yielding
    fewer than ``minMarkers`` co-located sets, are skipped. Each remaining
    lineage contributes one row to ./data/colocated.tsv.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: minimum number of genomes a lineage must contain.
        minMarkers: minimum number of co-located sets a lineage must yield.
        mostSpecificRank: forwarded to ``IMG.lineagesSorted``.
        distThreshold: forwarded to ``MarkerSet.colocatedGenes``.
        genomeThreshold: forwarded to ``MarkerSet.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)
    numLineages = len(lineages)

    # Line-buffered (buffering=1); 'with' closes the file on any exit path.
    with open('./data/colocated.tsv', 'w', 1) as fout:
        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

        lineageCount = 0
        for lineage in lineages:
            lineageCount += 1  # counts skipped lineages as well

            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            numGenomes = len(genomeIds)
            if numGenomes < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable,
                                                ubiquityThreshold * numGenomes,
                                                singleCopyThreshold * numGenomes)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes ('
                  + str(lineageCount) + ' of ' + str(numLineages) + ').')
            print(' Marker genes: ' + str(len(markerGenes)))
            print(' Co-located gene sets: ' + str(len(colocatedSets)))

            fout.write(lineage + '\t' + str(numGenomes) + '\t' + str(len(markerGenes))
                       + '\t' + str(len(colocatedSets)))
            for cs in colocatedSets:
                fout.write('\t' + ', '.join(cs))
            fout.write('\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
    """Measure how much a lineage's marker set changes when genomes are withheld.

    For every lineage with at least ``minGenomes`` 'Final' genomes and at
    least ``minMarkers`` marker genes, a random subset holding
    ``1 - percentGenomes`` of the genomes is drawn ``numReplicates`` times.
    The size of the symmetric difference between the subset's marker genes
    and the full marker set is recorded each time; its mean and standard
    deviation are written to ./data/lineage_evaluation.tsv.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``IMG.markerGenes``; 90% of it is used for the table filter.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: minimum number of genomes a lineage must contain.
        minMarkers: minimum number of marker genes a lineage must yield.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        percentGenomes: fraction of genomes withheld in each replicate.
        numReplicates: number of random subsets drawn per lineage.
    """
    img = IMG()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # 'with' guarantees the file is closed even if an IMG call raises.
    with open('./data/lineage_evaluation.tsv', 'w') as fout:
        fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

            # calculate marker set for all genomes
            markerGenes = img.markerGenes(genomeIds, countTable,
                                          ubiquityThreshold * len(genomeIds),
                                          singleCopyThreshold * len(genomeIds))
            numMarkers = len(markerGenes)
            if numMarkers < minMarkers:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')
            print(' Marker genes: ' + str(numMarkers))

            fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' + str(numMarkers)
                       + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

            # withhold select percentage of genomes and calculate new marker set
            # random.sample needs a sequence (sets are rejected on Python >= 3.11).
            # genomeIds is presumably a set -- other callers in this file
            # intersect it -- so take a list copy for sampling.
            genomePool = list(genomeIds)
            subsetSize = int((1.0 - percentGenomes) * len(genomeIds) + 0.5)  # round to nearest

            changeMarkerSetSize = []
            for _ in range(numReplicates):
                subsetGenomeIds = random.sample(genomePool, subsetSize)

                newMarkerGenes = img.markerGenes(subsetGenomeIds, countTable,
                                                 ubiquityThreshold * len(subsetGenomeIds),
                                                 singleCopyThreshold * len(subsetGenomeIds))

                changeMarkerSetSize.append(len(newMarkerGenes.symmetric_difference(markerGenes)))

            m = mean(changeMarkerSetSize)
            s = std(changeMarkerSetSize)

            print(' Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, (m + 2 * s) * 100 / numMarkers))
            fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s,
                                                           m * 100 / numMarkers,
                                                           (m + s) * 100 / numMarkers,
                                                           (m + 2 * s) * 100 / numMarkers) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
    """Write the co-located marker sets of every sufficiently large lineage.

    A lineage is reported only when it has at least ``minGenomes`` 'Final'
    genomes and at least ``minMarkers`` co-located sets. Its genome count,
    marker gene count, set count and the members of each set form one row
    of ./data/colocated.tsv; a short summary is also printed.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: minimum number of genomes a lineage must contain.
        minMarkers: minimum number of co-located sets a lineage must yield.
        mostSpecificRank: forwarded to ``IMG.lineagesSorted``.
        distThreshold: forwarded to ``MarkerSet.colocatedGenes``.
        genomeThreshold: forwarded to ``MarkerSet.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)

    # Line-buffered output file; the context manager closes it on any exit.
    with open('./data/colocated.tsv', 'w', 1) as fout:
        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

        # The 1-based position counts every lineage, including skipped ones.
        for lineageCount, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable,
                                                ubiquityThreshold * len(genomeIds),
                                                singleCopyThreshold * len(genomeIds))

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes ('
                  + str(lineageCount) + ' of ' + str(len(lineages)) + ').')
            print(' Marker genes: ' + str(len(markerGenes)))
            print(' Co-located gene sets: ' + str(len(colocatedSets)))

            row = [lineage, str(len(genomeIds)), str(len(markerGenes)), str(len(colocatedSets))]
            row.extend(', '.join(cs) for cs in colocatedSets)
            fout.write('\t'.join(row) + '\n')
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
    """Plot marker set size as a function of the number of genomes sampled.

    For each genome count in ``range(minGenomes, maxGenomes, stepSize)``,
    ``replicates`` random subsets of the lineage's 'Final' genomes are drawn
    and the number of co-located marker sets is recorded for each. The
    results are drawn as an error-bar plot and a box plot under ./images/.

    Writes an error to stderr and exits when the lineage has fewer than
    ``minGenomes`` genomes.

    Args:
        taxonomyStr: ';'-separated lineage string.
        ubiquityThreshold: fraction scaled by the subset size and passed to
            ``MarkerGenes.identify``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        replicates: number of random subsets per genome count.
        minGenomes: smallest subset size.
        maxGenomes: exclusive upper bound on subset size; -1 or any value
            above the lineage size is replaced by the lineage size.
        stepSize: increment between subset sizes.
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # NOTE(review): exits with status 0 on this error path -- confirm.
        sys.exit()

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []

    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample needs a sequence (sets are rejected on Python >= 3.11).
    # genomeIds is presumably a set, so sample from a list copy.
    genomePool = list(genomeIds)

    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        # Every subset has exactly numGenomes members, so the cutoffs are
        # constant across replicates.
        ubiquityCutoff = ubiquityThreshold * numGenomes
        singleCopyCutoff = singleCopyThreshold * numGenomes

        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomePool, numGenomes)

            markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityCutoff, singleCopyCutoff)
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        print('')
        print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityCutoff))
              + ', Single-copy > ' + str(int(singleCopyCutoff)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data
    plotPrefix = ('./images/markerset.' + taxonomyStr.replace(';', '_') + '.'
                  + str(ubiquityThreshold) + '-' + str(singleCopyThreshold))
    title = (taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold
             + ', Single-copy = %.2f' % singleCopyThreshold)

    errorBar = ErrorBar()
    plotFilename = plotPrefix + '.errorbar.png'
    errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize,
                  stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

    boxPlot = BoxPlot()
    plotFilename = plotPrefix + '.boxplot.png'
    boxPlot.plot(plotFilename, markerSetSizes, arange(minGenomes, maxGenomes, stepSize),
                 'Number of Genomes', 'Marker Set Size', True, title)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
    """Report genomes that look incomplete or contaminated within a lineage.

    A marker set is built for each lineage from ``IMG.lineagesByCriteria``;
    lineages with fewer than ``minMarkers`` marker genes are skipped. Each
    genome of the lineage is scored with ``MarkerSet.genomeCheck`` and kept
    when completeness < ``completenessThreshold`` or contamination >
    ``contaminationThreshold``. The kept genomes, with their metadata, are
    written to ./data/degenerate_genomes.tsv.

    Args:
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``markerGenes``; 90% of it is used for the initial table filter.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: minimum number of marker genes a lineage must yield.
        completenessThreshold: genomes scoring below this are reported.
        contaminationThreshold: genomes scoring above this are reported.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    for label, value in (('Ubiquity threshold: ', ubiquityThreshold),
                         ('Single-copy threshold: ', singleCopyThreshold),
                         ('Min. genomes: ', minGenomes),
                         ('Most specific taxonomic rank: ', mostSpecificRank),
                         ('Min markers: ', minMarkers),
                         ('Completeness threshold: ', completenessThreshold),
                         ('Contamination threshold: ', contaminationThreshold)):
        print(label + str(value))

    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # genome id -> one record per lineage in which the genome was flagged
    degenerateGenomes = {}
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')

        print('')
        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable,
                                     ubiquityThreshold * 0.9,
                                     singleCopyThreshold * 0.9)

        markerGenes = markerset.markerGenes(
            genomeIds, countTable,
            ubiquityThreshold * len(genomeIds),
            singleCopyThreshold * len(genomeIds))
        if len(markerGenes) < minMarkers:
            continue

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = markerset.colocatedGenes(geneDistTable)
        colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

        for genomeId in genomeIds:
            completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)
            if completeness >= completenessThreshold and contamination <= contaminationThreshold:
                continue

            degenerateGenomes.setdefault(genomeId, []).append([
                lineage.split(';')[-1].strip(),
                len(genomeIds),
                len(colocatedSets),
                completeness,
                contamination,
            ])

    # write out degenerate genomes
    metadata = img.genomeMetadata('Final')

    # 'with' guarantees the file is closed even if a metadata lookup raises.
    with open('./data/degenerate_genomes.tsv', 'w') as fout:
        # NOTE(review): the size column is labelled 'Gbps' but the value is
        # genome size / 1e6 -- confirm the intended unit.
        fout.write(
            'Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n'
        )

        # items() instead of the py2-only iteritems().
        for genomeId, data in degenerateGenomes.items():
            fout.write(genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy'])
                       + '\t%.2f' % (float(metadata[genomeId]['genome size']) / 1e6)
                       + '\t' + str(metadata[genomeId]['scaffold count']))
            fout.write('\t' + metadata[genomeId]['biotic relationships']
                       + '\t' + metadata[genomeId]['status'])

            # One column group per lineage in which the genome was flagged.
            for d in data:
                fout.write('\t' + d[0] + '\t' + str(d[1]) + '\t' + str(d[2])
                           + '\t%.3f\t%.3f' % (d[3], d[4]))
            fout.write('\n')
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
    """Plot marker set size as a function of the number of genomes sampled.

    For every subset size in ``range(minGenomes, maxGenomes, stepSize)`` the
    lineage's 'Final' genomes are randomly subsampled ``replicates`` times,
    and the number of co-located marker sets found in each subsample is
    recorded. An error-bar plot (mean and std) and a box plot of the counts
    are written under ./images/.

    Writes an error to stderr and exits when the lineage has fewer than
    ``minGenomes`` genomes.

    Args:
        taxonomyStr: ';'-separated lineage string.
        ubiquityThreshold: fraction scaled by the subset size and passed to
            ``MarkerGenes.identify``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        replicates: number of random subsets per subset size.
        minGenomes: smallest subset size.
        maxGenomes: exclusive upper bound on subset size; -1 or any value
            above the lineage size is replaced by the lineage size.
        stepSize: increment between subset sizes.
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # NOTE(review): exits with status 0 on this error path -- confirm.
        sys.exit()

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []

    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample rejects sets on Python >= 3.11. genomeIds is presumably
    # a set (other callers in this file intersect it), so sample from a list.
    genomePool = list(genomeIds)

    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        # Subsets always contain numGenomes ids, so the cutoffs do not
        # depend on the replicate (nor on a variable leaked from its loop).
        ubiquityCutoff = ubiquityThreshold * numGenomes
        singleCopyCutoff = singleCopyThreshold * numGenomes

        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomePool, numGenomes)

            markerGenes = markergenes.identify(
                genomeIdSubset, countTable, ubiquityCutoff, singleCopyCutoff)
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        print('')
        print('Genomes: ' + str(numGenomes) + ', Ubiquity > '
              + str(int(ubiquityCutoff)) + ', Single-copy > '
              + str(int(singleCopyCutoff)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data
    errorBar = ErrorBar()
    plotFilename = ('./images/markerset.' + taxonomyStr.replace(';', '_') + '.'
                    + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
                    + '.errorbar.png')
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Ubiquity = %.2f' % ubiquityThreshold
             + ', Single-copy = %.2f' % singleCopyThreshold)
    errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize),
                  meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes',
                  'Marker Set Size', title)

    boxPlot = BoxPlot()
    plotFilename = ('./images/markerset.' + taxonomyStr.replace(';', '_') + '.'
                    + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
                    + '.boxplot.png')
    boxPlot.plot(plotFilename, markerSetSizes,
                 arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes',
                 'Marker Set Size', True, title)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completion estimates from marker sets of nested lineages.

    A marker set is built for every prefix of ``taxonomyStr`` (first rank
    only, first two ranks, and so on). Genomes of the most specific lineage
    are then repeatedly subsampled with ``IMG.sampleGenome``, and for each
    lineage's marker set the estimated completion minus ``percentCompletion``
    is recorded. The deviations are drawn as a box plot under ./images/.

    ``mostSpecificRank``, ``minGenomes`` are accepted but not used here.

    Args:
        taxonomyStr: ';'-separated lineage string.
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``IMG.markerGenes``.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        percentCompletion: forwarded to ``IMG.sampleGenome`` and subtracted
            from each completion estimate.
        numReplicates: number of simulated partial genomes per genome.
        numGenomes: number of genomes to sample at random, or -1 for all.
        contigLen: forwarded to ``IMG.sampleGenome`` and
            ``IMG.containedMarkerGenes``.
    """
    img = IMG()

    # Every prefix of the taxonomy string is treated as its own lineage.
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

    # get all marker sets (parallel lists indexed like `lineages`)
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds now holds the most specific lineage
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample rejects sets on Python >= 3.11, so sample from a list.
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in lineages]
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker set
            for i in range(len(lineages)):
                containedMarkerGenes = img.containedMarkerGenes(
                    markerGenes[i], geneDistTable[i][genomeId],
                    startPartialGenomeContigs, contigLen)

                # Each co-located set contributes the fraction of its
                # members that were recovered.
                comp = 0.0
                for cs in colocatedSets[i]:
                    present = sum(1 for contigId in cs if contigId in containedMarkerGenes)
                    comp += float(present) / len(cs)

                completion[i].append(comp / len(colocatedSets[i]) - percentCompletion)

        # Emit exactly one label per data series. The label used to be added
        # once per replicate, so labels outnumbered series whenever
        # numReplicates != 1. Assumes BoxPlot.plot pairs labels with series
        # one-to-one, as the other box plot call in this file does.
        for i, d in enumerate(completion):
            plotData.append(d)
            plotLabels.append(genomeId + ' - ' + lineages[i])

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Percent completion = %.2f' % percentCompletion)
    boxPlot.plot(plotFilename, plotData, plotLabels,
                 r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self):
    """Report how well marker gene order is conserved between Archaea genomes.

    For every pair of 'Final' Archaea genomes, the position of each marker
    gene (divided by genome size) is compared across the two genomes. The
    second genome's ordering is rotated through every cyclic shift, and the
    largest absolute Spearman and Pearson correlations over all shifts are
    kept. The mean and standard deviation of those best values are printed.
    """
    img = IMG()
    markerset = MarkerSet()

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds, markerGenes)

    spearmanValues = []
    pearsonValues = []
    genomeIds = list(genomeIds)  # fixed order so genomes can be paired by index
    for i in range(0, len(genomeIds)):
        print(str(i + 1) + ' of ' + str(len(genomeIds)))

        # Relative position of each marker in genome i; mask 1 = marker absent.
        geneOrderI = []
        maskI = []
        for markerGenesId in markerGenes:
            if markerGenesId in geneDistTable[genomeIds[i]]:
                geneOrderI.append(float(geneDistTable[genomeIds[i]][markerGenesId][0][0])
                                  / metadata[genomeIds[i]]['genome size'])
                maskI.append(0)
            else:
                geneOrderI.append(-1)
                maskI.append(1)

        for j in range(i + 1, len(genomeIds)):
            geneOrderJ = []
            maskJ = []
            for markerGenesId in markerGenes:
                if markerGenesId in geneDistTable[genomeIds[j]]:
                    geneOrderJ.append(float(geneDistTable[genomeIds[j]][markerGenesId][0][0])
                                      / metadata[genomeIds[j]]['genome size'])
                    maskJ.append(0)
                else:
                    geneOrderJ.append(-1)
                    maskJ.append(1)

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _ in range(0, len(markerGenes)):
                # Keep only slots where both genomes have a marker.
                maskedI = []
                maskedJ = []
                for k in range(0, len(maskI)):
                    if maskI[k] == 0 and maskJ[k] == 0:
                        maskedI.append(geneOrderI[k])
                        maskedJ.append(geneOrderJ[k])

                r, _pvalue = spearmanr(maskedI, maskedJ)
                if abs(r) > bestSpearman:
                    bestSpearman = abs(r)

                r, _pvalue = pearsonr(maskedI, maskedJ)
                if abs(r) > bestPearson:
                    bestPearson = abs(r)

                # Rotate genome j's ordering (and its mask) left by one slot.
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, numBins, numRndGenomes):
    """Tabulate and plot where marker genes fall along each genome.

    A marker set is built from the lineage's finished Bacteria/Archaea
    genomes (minus those reported by ``IMG.genomesWithMissingData``). For
    each genome, or a random sample of ``numRndGenomes`` of them, marker
    gene positions are counted in ``numBins`` equal-width bins over the
    genome size. Counts and a uniformity score are written to a
    geneDistribution.*.tsv file and drawn as a heatmap.

    Args:
        taxonomyStr: ';'-separated lineage string.
        ubiquityThreshold: fraction scaled by the genome count and passed to
            ``MarkerSet.markerGenes``; 90% of it is used for the filter.
        singleCopyThreshold: handled like ``ubiquityThreshold``.
        numBins: number of bins per genome.
        numRndGenomes: number of genomes to sample at random, or -1 for all.
    """
    img = IMG()
    markerSet = MarkerSet()

    metadata = img.genomeMetadata()
    lineageGenomeIds = img.genomeIdsByTaxonomy(taxonomyStr, metadata)

    # build marker set from finished prokaryotic genomes
    genomeIds = [genomeId for genomeId in lineageGenomeIds
                 if metadata[genomeId]['status'] == 'Finished'
                 and metadata[genomeId]['taxonomy'][0] in ('Bacteria', 'Archaea')]
    genomeIds = set(genomeIds) - img.genomesWithMissingData(genomeIds)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # get marker set
    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable,
                                 0.9 * ubiquityThreshold,
                                 0.9 * singleCopyThreshold)
    markerGenes = markerSet.markerGenes(
        genomeIds, countTable,
        ubiquityThreshold * len(genomeIds),
        singleCopyThreshold * len(genomeIds))
    tigrToRemove = img.identifyRedundantTIGRFAMs(markerGenes)
    markerGenes = markerGenes - tigrToRemove
    geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)

    print('Number of marker genes: ' + str(len(markerGenes)))

    # randomly set genomes to plot
    if numRndGenomes != -1:
        genomeIds = random.sample(list(genomeIds), numRndGenomes)
    genomeIds = set(genomeIds)

    # plot distribution of marker genes
    filePrefix = ('geneDistribution.' + taxonomyStr.replace(';', '_') + '.'
                  + str(ubiquityThreshold) + '-' + str(singleCopyThreshold))

    matrix = []
    rowLabels = []

    # 'with' guarantees the file is closed even if a lookup below raises.
    with open(filePrefix + '.tsv', 'w') as fout:
        fout.write('Genome ID\tLineage\tNumber of Genes\tUniformity\tDistribution\n')

        for genomeId in genomeIds:
            binSize = float(metadata[genomeId]['genome size']) / numBins
            binCounts = [0] * numBins
            pts = []

            # values() instead of the py2-only iteritems(); keys were unused.
            for data in geneDistTable[genomeId].values():
                for genePos in data:
                    # NOTE(review): a position >= genome size would index
                    # past the last bin -- confirm positions stay in range.
                    binNum = int(genePos[1] / binSize)
                    binCounts[binNum] += 1
                    pts.append(genePos[1])
            matrix.append(binCounts)

            u = markerSet.uniformity(metadata[genomeId]['genome size'], pts)

            fout.write(genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy'])
                       + '\t' + str(len(geneDistTable[genomeId])) + '\t%.3f' % u)
            for count in binCounts:
                fout.write('\t' + str(count))
            fout.write('\n')

            rowLabels.append('%.2f' % u + ', ' + str(genomeId) + ' - '
                             + '; '.join(metadata[genomeId]['taxonomy'][0:5]))

    # plot data
    heatmap = Heatmap()
    plotFilename = filePrefix + '.png'
    heatmap.plot(plotFilename, matrix, rowLabels, 0.6)
def run(self):
    """Report how well marker gene positions correlate between archaeal genomes.

    For every pair of 'Archaea' genomes, the normalized position of each
    marker gene (first recorded position divided by genome size) is compared
    using Spearman and Pearson correlation over the genes present in both
    genomes. The second genome's vector is rotated through every offset and
    the largest absolute correlation is kept. The mean and standard
    deviation over all pairs are printed.
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                      spacingBetweenContigs=1e6)

    genomeIds = list(genomeIds)
    # fix one iteration order so all per-genome vectors line up gene by gene
    markerGeneIds = list(markerGenes)

    def positionProfile(genomeId):
        """Return (positions, mask); mask is 1 where the gene is absent."""
        positions = []
        mask = []
        genes = geneDistTable[genomeId]
        for markerGeneId in markerGeneIds:
            if markerGeneId in genes:
                positions.append(float(genes[markerGeneId][0][0])
                                 / metadata[genomeId]['genome size'])
                mask.append(0)
            else:
                positions.append(-1)
                mask.append(1)
        return positions, mask

    # each profile depends on a single genome, so build it once instead of
    # rebuilding it for every pair the genome takes part in
    profiles = [positionProfile(genomeId) for genomeId in genomeIds]

    spearmanValues = []
    pearsonValues = []
    for i in range(0, len(genomeIds)):
        print(str(i + 1) + ' of ' + str(len(genomeIds)))
        geneOrderI, maskI = profiles[i]

        for j in range(i + 1, len(genomeIds)):
            # rotation below rebinds these names; the cached lists stay intact
            geneOrderJ, maskJ = profiles[j]

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _ in range(0, len(markerGeneIds)):
                maskedI = []
                maskedJ = []
                for posI, absentI, posJ, absentJ in zip(geneOrderI, maskI,
                                                        geneOrderJ, maskJ):
                    if absentI == 0 and absentJ == 0:
                        maskedI.append(posI)
                        maskedJ.append(posJ)

                # a correlation needs at least two points; with fewer the
                # result is undefined and newer SciPy raises in pearsonr
                if len(maskedI) >= 2:
                    r, _pvalue = spearmanr(maskedI, maskedJ)
                    if abs(r) > bestSpearman:
                        bestSpearman = abs(r)

                    r, _pvalue = pearsonr(maskedI, maskedJ)
                    if abs(r) > bestPearson:
                        bestPearson = abs(r)

                # rotate the second genome's vectors by one gene
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
    """Measure how much a lineage's marker genes change when genomes are withheld.

    For each lineage with at least minGenomes genomes and minMarkers marker
    genes, a random subset of the genomes is drawn numReplicates times, the
    marker genes are recomputed, and the size of the symmetric difference to
    the full marker gene set is recorded. Summary statistics are printed and
    written to './data/lineage_evaluation.tsv'.

    Args:
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        minGenomes: minimum number of genomes a lineage must contain.
        minMarkers: minimum number of marker genes a lineage must have.
        mostSpecificRank: passed to IMG.lineagesByCriteria.
        percentGenomes: fraction of genomes to withhold in each replicate.
        numReplicates: number of random subsets drawn per lineage.
    """
    img = IMG()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # 'with' guarantees the report is flushed and closed on any exit path
    with open('./data/lineage_evaluation.tsv', 'w') as fout:
        fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            # pre-filter the table with thresholds relaxed by 10%
            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable,
                                         ubiquityThreshold * 0.9,
                                         singleCopyThreshold * 0.9)

            # calculate marker set for all genomes
            markerGenes = img.markerGenes(genomeIds, countTable,
                                          ubiquityThreshold * len(genomeIds),
                                          singleCopyThreshold * len(genomeIds))
            if len(markerGenes) < minMarkers:
                continue

            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')
            print(' Marker genes: ' + str(len(markerGenes)))

            fout.write(lineage + '\t' + str(len(genomeIds)) + '\t'
                       + str(len(markerGenes)) + '\t%.2f' % percentGenomes
                       + '\t' + str(numReplicates))

            # random.sample requires a sequence on Python >= 3.11, so
            # materialize the population once (a no-op for lists)
            population = list(genomeIds)
            subsetSize = int((1.0 - percentGenomes) * len(genomeIds) + 0.5)

            # withhold select percentage of genomes and calculate new marker set
            changeMarkerSetSize = []
            for _ in range(0, numReplicates):
                subsetGenomeIds = random.sample(population, subsetSize)

                newMarkerGenes = img.markerGenes(
                    subsetGenomeIds, countTable,
                    ubiquityThreshold * len(subsetGenomeIds),
                    singleCopyThreshold * len(subsetGenomeIds))

                changeMarkerSetSize.append(
                    len(newMarkerGenes.symmetric_difference(markerGenes)))

            m = mean(changeMarkerSetSize)
            s = std(changeMarkerSetSize)
            numMarkers = len(markerGenes)

            print(' Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, (m + 2 * s) * 100 / numMarkers))
            fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s,
                                                           m * 100 / numMarkers,
                                                           (m + s) * 100 / numMarkers,
                                                           (m + 2 * s) * 100 / numMarkers) + '\n')
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completeness estimates from marker genes and co-located marker sets.

    Partial genomes are simulated with IMG.sampleGenome. For each replicate
    the estimated completion minus percentCompletion is recorded, once from
    the fraction of marker genes found and once from the mean fraction
    found per co-located set. The two error distributions per genome are
    drawn as a box plot.

    Args:
        taxonomyStr: semicolon-separated lineage; also used in the plot
            file name and title.
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        percentCompletion: target completion passed to IMG.sampleGenome and
            subtracted from each estimate.
        numReplicates: number of simulated partial genomes per genome.
        numGenomes: number of genomes to sample at random, or -1 for all.
        contigLen: contig length passed to the sampling helpers.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # build marker genes and colocated marker sets
    countTable = img.countTable(genomeIds)
    markerGenes = img.markerGenes(genomeIds, countTable,
                                  ubiquityThreshold * len(genomeIds),
                                  singleCopyThreshold * len(genomeIds))
    print(' Marker genes: ' + str(len(markerGenes)))

    geneDistTable = img.geneDistTable(genomeIds, markerGenes)
    colocatedGenes = img.colocatedGenes(geneDistTable)
    colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
    print(' Co-located gene sets: ' + str(len(colocatedSets)))

    # random sample genomes
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample requires a sequence on Python >= 3.11; list() is a
        # no-op copy when genomeIds is already a list
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        mgCompletion = []
        msCompletion = []
        for _ in range(0, numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker genes
            containedMarkerGenes = img.containedMarkerGenes(
                markerGenes, geneDistTable[genomeId],
                startPartialGenomeContigs, contigLen)
            mgCompletion.append(
                float(len(containedMarkerGenes)) / len(markerGenes) - percentCompletion)

            # calculate completion with marker set
            comp = 0.0
            for cs in colocatedSets:
                present = 0
                for markerGeneId in cs:
                    if markerGeneId in containedMarkerGenes:
                        present += 1
                comp += float(present) / len(cs)
            msCompletion.append(comp / len(colocatedSets) - percentCompletion)

        # two data series per genome; the second one gets an empty label
        plotData.append(mgCompletion)
        plotData.append(msCompletion)

        species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])
        plotLabels.append(species + ' (' + genomeId + ')')
        plotLabels.append('')

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Percent completion = %.2f' % percentCompletion)
    boxPlot.plot(plotFilename, plotData, plotLabels,
                 r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, numBins, numRndGenomes):
    """Bin marker gene positions per genome, save them as TSV and plot a heatmap.

    Only genomes of the lineage whose status is 'Finished' and whose domain
    is 'Bacteria' or 'Archaea' (minus those reported by
    IMG.genomesWithMissingData) are used to determine the marker genes.

    Args:
        taxonomyStr: semicolon-separated lineage; ';' is replaced by '_'
            when building the output file names.
        ubiquityThreshold: fraction multiplied by the number of genomes and
            passed to MarkerSet.markerGenes.
        singleCopyThreshold: fraction multiplied by the number of genomes
            and passed to MarkerSet.markerGenes.
        numBins: number of equal-width bins per genome.
        numRndGenomes: how many genomes to pick at random for the output;
            -1 keeps every genome.
    """
    img = IMG()
    markerSet = MarkerSet()

    metadata = img.genomeMetadata()
    lineageGenomeIds = img.genomeIdsByTaxonomy(taxonomyStr, metadata)

    # build marker set from finished prokaryotic genomes
    genomeIds = []
    for genomeId in lineageGenomeIds:
        entry = metadata[genomeId]
        isProkaryote = (entry['taxonomy'][0] == 'Bacteria'
                        or entry['taxonomy'][0] == 'Archaea')
        if entry['status'] == 'Finished' and isProkaryote:
            genomeIds.append(genomeId)
    genomeIds = set(genomeIds) - img.genomesWithMissingData(genomeIds)

    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # get marker set
    numGenomes = len(genomeIds)
    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable,
                                 0.9 * ubiquityThreshold,
                                 0.9 * singleCopyThreshold)
    markerGenes = markerSet.markerGenes(genomeIds, countTable,
                                        ubiquityThreshold * numGenomes,
                                        singleCopyThreshold * numGenomes)

    tigrToRemove = img.identifyRedundantTIGRFAMs(markerGenes)
    markerGenes = markerGenes - tigrToRemove
    geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                      spacingBetweenContigs=1e6)

    print('Number of marker genes: ' + str(len(markerGenes)))

    # randomly set genomes to plot
    if numRndGenomes != -1:
        genomeIds = random.sample(list(genomeIds), numRndGenomes)
        genomeIds = set(genomeIds)

    thresholdTag = str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
    lineageTag = taxonomyStr.replace(';', '_')

    # plot distribution of marker genes
    filename = 'geneDistribution.' + lineageTag + '.' + thresholdTag + '.tsv'
    matrix = []
    rowLabels = []
    # the context manager closes the file even when a genome raises below
    with open(filename, 'w') as fout:
        fout.write('Genome ID\tLineage\tNumber of Genes\tUniformity\tDistribution\n')

        for genomeId in genomeIds:
            binSize = float(metadata[genomeId]['genome size']) / numBins
            binCounts = [0] * numBins
            pts = []
            for data in geneDistTable[genomeId].values():
                for genePos in data:
                    # NOTE(review): raises IndexError if genePos[1] is not
                    # strictly below the genome size -- confirm the range of
                    # positions produced by IMG.geneDistTable.
                    binCounts[int(genePos[1] / binSize)] += 1
                    pts.append(genePos[1])
            matrix.append(binCounts)

            u = markerSet.uniformity(metadata[genomeId]['genome size'], pts)

            fout.write(genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy'])
                       + '\t' + str(len(geneDistTable[genomeId])) + '\t%.3f' % u)
            for b in range(0, numBins):
                fout.write('\t' + str(binCounts[b]))
            fout.write('\n')

            rowLabels.append('%.2f' % u + ', ' + str(genomeId) + ' - '
                             + '; '.join(metadata[genomeId]['taxonomy'][0:5]))

    # plot data
    heatmap = Heatmap()
    plotFilename = 'geneDistribution.' + lineageTag + '.' + thresholdTag + '.png'
    heatmap.plot(plotFilename, matrix, rowLabels, 0.6)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Leave-one-out test of marker set stability for each lineage.

    For 'prokaryotes' and every lineage returned by IMG.lineagesByCriteria,
    the marker set of the trusted genomes is compared to the marker sets
    obtained with each single genome left out. The per-lineage size
    differences are printed and drawn as a box plot.

    Args:
        ubiquityThreshold: fraction multiplied by a genome count and passed
            to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by a genome count and
            passed to IMG.markerGenes.
        minGenomes: passed to IMG.lineagesByCriteria.
        mostSpecificRank: passed to IMG.lineagesByCriteria.
        minMarkers: lineages with a smaller full marker set are skipped.
    """
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))

    img = IMG()

    deltaMarkerSetSizes = []

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
    lineages = ['prokaryotes'] + lineages

    # the trusted genome set does not depend on the lineage; fetch it once
    trusted = img.trustedGenomes()

    boxPlotLabels = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage)
        genomeIds = list(genomeIds.intersection(trusted))

        print('')
        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.pfamTable(genomeIds)
        pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                        ubiquityThreshold * 0.9,
                                        singleCopyThreshold * 0.9)

        # the full set is computed with thresholds scaled for N-1 genomes,
        # i.e. the same absolute cut-offs used by the leave-one-out runs
        numGenomes = len(genomeIds) - 1
        markerSet = img.markerGenes(genomeIds, pfamTable,
                                    ubiquityThreshold * numGenomes,
                                    singleCopyThreshold * numGenomes)
        fullMarkerSetSize = len(markerSet)

        if fullMarkerSetSize < minMarkers:
            continue

        boxPlotLabels.append(lineage.split(';')[-1].strip() + ' ('
                             + str(len(genomeIds)) + ', '
                             + str(fullMarkerSetSize) + ')')

        deltaMarkerSetSize = []
        for loo in range(0, len(genomeIds)):
            # slicing past the end yields [], so the last index needs no
            # special case
            genomeIdSubset = genomeIds[0:loo] + genomeIds[loo + 1:]

            markerSet = img.markerGenes(genomeIdSubset, pfamTable,
                                        ubiquityThreshold * len(genomeIdSubset),
                                        singleCopyThreshold * len(genomeIdSubset))
            deltaMarkerSetSize.append(fullMarkerSetSize - len(markerSet))

            if fullMarkerSetSize < len(markerSet):
                print('[Warning] Unexpected!')

        deltaMarkerSetSizes.append(deltaMarkerSetSize)

        m = mean(deltaMarkerSetSize)
        s = std(deltaMarkerSetSize)

        print(' LOO Ubiquity >= ' + str(int(ubiquityThreshold * numGenomes))
              + ', LOO Single-copy >= ' + str(int(singleCopyThreshold * numGenomes)))
        print(' Delta Mean: %.2f +/- %.2f' % (m, s))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/LOO.' + str(ubiquityThreshold) + '-'
                    + str(singleCopyThreshold) + '.boxplot.png')
    title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
    boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                 r'$\Delta$' + ' Marker Set Size', '', False, title)
def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
    """Classify genomes as trusted or filtered and write summary tables.

    For 'Archaea' and 'Bacteria' separately, co-located marker sets are
    built from all genomes of the domain and every genome is scored with
    MarkerSet.genomeCheck. Genomes meeting both trusted thresholds go to
    './data/trusted_genomes.tsv', all others to
    './data/filtered_genomes.tsv'. Per-lineage counts are written to
    './data/lineage_stats.tsv'.

    Args:
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to MarkerSet.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to MarkerSet.markerGenes.
        trustedCompleteness: minimum completeness of a trusted genome.
        trustedContamination: maximum contamination of a trusted genome.
        genomeCompleteness: not used by this method.
        genomeContamination: not used by this method.
    """
    img = IMG()
    markerset = MarkerSet()

    metadata = img.genomeMetadata()

    header = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

    def writeGenomeRow(out, genomeId, completeness, contamination):
        """Write one genome's metadata and quality scores as a TSV row."""
        info = metadata[genomeId]
        out.write(genomeId + '\t' + '; '.join(info['taxonomy']))
        out.write('\t%.2f' % (float(info['genome size']) / 1e6))
        out.write('\t' + str(info['scaffold count']))
        out.write('\t' + info['biotic relationships'])
        out.write('\t' + info['status'])
        out.write('\t%.3f\t%.3f' % (completeness, contamination) + '\n')

    allTrustedGenomeIds = set()

    # 'with' closes both reports even if a lineage fails part-way through
    with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
            open('./data/filtered_genomes.tsv', 'w') as filteredOut:
        trustedOut.write(header)
        filteredOut.write(header)

        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage and build gene count table
            print('\nBuilding gene count table.')
            allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
            countTable = img.countTable(allLineageGenomeIds)
            countTable = img.filterTable(allLineageGenomeIds, countTable,
                                         0.9 * ubiquityThreshold,
                                         0.9 * singleCopyThreshold)

            print('Lineage ' + lineage + ' contains ' + str(len(allLineageGenomeIds)) + ' genomes.')

            # tabulate genomes from each phylum
            allPhylumCounts = {}
            for genomeId in allLineageGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

            # identify marker set for genomes
            markerGenes = markerset.markerGenes(
                allLineageGenomeIds, countTable,
                ubiquityThreshold * len(allLineageGenomeIds),
                singleCopyThreshold * len(allLineageGenomeIds))
            print(' Marker genes: ' + str(len(markerGenes)))

            geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            print(' Marker set size: ' + str(len(colocatedSets)))

            # identifying trusted genomes (highly complete, low contamination genomes)
            trustedGenomeIds = set()
            for genomeId in allLineageGenomeIds:
                completeness, contamination = markerset.genomeCheck(
                    colocatedSets, genomeId, countTable)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    writeGenomeRow(trustedOut, genomeId, completeness, contamination)
                else:
                    writeGenomeRow(filteredOut, genomeId, completeness, contamination)

            print(' Trusted genomes: ' + str(len(trustedGenomeIds)))

            # determine status of trusted genomes
            statusBreakdown = {}
            for genomeId in trustedGenomeIds:
                status = metadata[genomeId]['status']
                statusBreakdown[status] = statusBreakdown.get(status, 0) + 1

            print(' Trusted genome status breakdown: ')
            for status, count in statusBreakdown.items():
                print(' ' + status + ': ' + str(count))

            # determine status of retained genomes
            proposalNameBreakdown = {}
            for genomeId in trustedGenomeIds:
                proposalName = metadata[genomeId]['proposal name']
                proposalNameBreakdown[proposalName] = proposalNameBreakdown.get(proposalName, 0) + 1

            print(' Retained genome proposal name breakdown: ')
            for pn, count in proposalNameBreakdown.items():
                # only proposals mentioning these projects are reported
                if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                    print(' ' + pn + ': ' + str(count))

            print(' Filtered genomes by phylum:')
            trustedPhylumCounts = {}
            for genomeId in trustedGenomeIds:
                taxon = metadata[genomeId]['taxonomy'][1]
                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

            for phylum, count in allPhylumCounts.items():
                print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = '; '.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted()

    with open('./data/lineage_stats.tsv', 'w') as fout:
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t'
                       + str(trustedStats.get(lineage, 0)) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Evaluate marker set stability by leaving out one genome at a time.

    The lineages examined are 'prokaryotes' followed by those returned by
    IMG.lineagesByCriteria. Within each lineage only trusted genomes are
    used. For every genome left out, the difference between the full marker
    set size and the reduced marker set size is recorded; the differences
    are summarized on stdout and plotted as one box per lineage.

    Args:
        ubiquityThreshold: fraction multiplied by a genome count and passed
            to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by a genome count and
            passed to IMG.markerGenes.
        minGenomes: passed to IMG.lineagesByCriteria.
        mostSpecificRank: passed to IMG.lineagesByCriteria.
        minMarkers: minimum full marker set size for a lineage to be
            evaluated.
    """
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))

    img = IMG()

    lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # loop-invariant: previously re-fetched for every lineage
    trusted = img.trustedGenomes()

    deltaMarkerSetSizes = []
    boxPlotLabels = []
    for lineage in lineages:
        genomeIds = list(img.genomeIdsByTaxonomy(lineage).intersection(trusted))
        genomeCount = len(genomeIds)

        print('')
        print('Lineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.pfamTable(genomeIds)
        pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                        ubiquityThreshold * 0.9,
                                        singleCopyThreshold * 0.9)

        # thresholds for the full set are scaled for one genome fewer, the
        # size of every leave-one-out subset
        markerSet = img.markerGenes(genomeIds, pfamTable,
                                    ubiquityThreshold * (genomeCount - 1),
                                    singleCopyThreshold * (genomeCount - 1))
        fullMarkerSetSize = len(markerSet)

        if fullMarkerSetSize < minMarkers:
            continue

        shortName = lineage.split(';')[-1].strip()
        boxPlotLabels.append(shortName + ' (' + str(genomeCount) + ', '
                             + str(fullMarkerSetSize) + ')')

        deltaMarkerSetSize = []
        numGenomes = genomeCount - 1
        for loo in range(genomeCount):
            # the slice after the last index is empty, so no end-of-list
            # branch is needed
            genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

            looMarkerSet = img.markerGenes(
                genomeIdSubset, pfamTable,
                ubiquityThreshold * len(genomeIdSubset),
                singleCopyThreshold * len(genomeIdSubset))
            deltaMarkerSetSize.append(fullMarkerSetSize - len(looMarkerSet))

            if fullMarkerSetSize < len(looMarkerSet):
                print('[Warning] Unexpected!')

        deltaMarkerSetSizes.append(deltaMarkerSetSize)

        m = mean(deltaMarkerSetSize)
        s = std(deltaMarkerSetSize)

        print(' LOO Ubiquity >= ' + str(int(ubiquityThreshold * numGenomes))
              + ', LOO Single-copy >= ' + str(int(singleCopyThreshold * numGenomes)))
        print(' Delta Mean: %.2f +/- %.2f' % (m, s))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/LOO.' + str(ubiquityThreshold) + '-'
                    + str(singleCopyThreshold) + '.boxplot.png')
    title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
    boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels,
                 r'$\Delta$' + ' Marker Set Size', '', False, title)
def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
    """Split genomes into trusted and filtered sets and report statistics.

    Each of 'Archaea' and 'Bacteria' is processed independently: marker
    genes and co-located marker sets are derived from all genomes of the
    domain, then every genome is scored with MarkerSet.genomeCheck and
    written to './data/trusted_genomes.tsv' or
    './data/filtered_genomes.tsv'. Finally the number of genomes and of
    trusted genomes per lineage is written to './data/lineage_stats.tsv'.

    Args:
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to MarkerSet.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to MarkerSet.markerGenes.
        trustedCompleteness: a trusted genome has completeness >= this.
        trustedContamination: a trusted genome has contamination <= this.
        genomeCompleteness: not used by this method.
        genomeContamination: not used by this method.
    """
    img = IMG()
    markerset = MarkerSet()

    metadata = img.genomeMetadata()

    def genomeRow(genomeId, completeness, contamination):
        """Format one genome's metadata and quality scores as a TSV line."""
        entry = metadata[genomeId]
        return (genomeId + '\t' + '; '.join(entry['taxonomy'])
                + '\t%.2f' % (float(entry['genome size']) / 1e6)
                + '\t' + str(entry['scaffold count'])
                + '\t' + entry['biotic relationships']
                + '\t' + entry['status']
                + '\t%.3f\t%.3f' % (completeness, contamination) + '\n')

    def tally(genomeIdsToCount, keyOf):
        """Count genomes grouped by keyOf(metadata entry)."""
        counts = {}
        for genomeId in genomeIdsToCount:
            key = keyOf(metadata[genomeId])
            counts[key] = counts.get(key, 0) + 1
        return counts

    allTrustedGenomeIds = set()

    # both reports are closed automatically, including on errors
    with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
            open('./data/filtered_genomes.tsv', 'w') as filteredOut:
        trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n')
        filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n')

        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage and build gene count table
            print('\nBuilding gene count table.')
            allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
            countTable = img.countTable(allLineageGenomeIds)
            countTable = img.filterTable(allLineageGenomeIds, countTable,
                                         0.9 * ubiquityThreshold,
                                         0.9 * singleCopyThreshold)

            print('Lineage ' + lineage + ' contains ' + str(len(allLineageGenomeIds)) + ' genomes.')

            # tabulate genomes from each phylum
            allPhylumCounts = tally(allLineageGenomeIds,
                                    lambda entry: entry['taxonomy'][1])

            # identify marker set for genomes
            numLineageGenomes = len(allLineageGenomeIds)
            markerGenes = markerset.markerGenes(allLineageGenomeIds, countTable,
                                                ubiquityThreshold * numLineageGenomes,
                                                singleCopyThreshold * numLineageGenomes)
            print(' Marker genes: ' + str(len(markerGenes)))

            geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes,
                                              spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            print(' Marker set size: ' + str(len(colocatedSets)))

            # identifying trusted genomes (highly complete, low contamination genomes)
            trustedGenomeIds = set()
            for genomeId in allLineageGenomeIds:
                completeness, contamination = markerset.genomeCheck(
                    colocatedSets, genomeId, countTable)
                row = genomeRow(genomeId, completeness, contamination)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    trustedOut.write(row)
                else:
                    filteredOut.write(row)

            print(' Trusted genomes: ' + str(len(trustedGenomeIds)))

            # determine status of trusted genomes
            statusBreakdown = tally(trustedGenomeIds,
                                    lambda entry: entry['status'])
            print(' Trusted genome status breakdown: ')
            for status, count in statusBreakdown.items():
                print(' ' + status + ': ' + str(count))

            # determine status of retained genomes
            proposalNameBreakdown = tally(trustedGenomeIds,
                                          lambda entry: entry['proposal name'])
            print(' Retained genome proposal name breakdown: ')
            for pn, count in proposalNameBreakdown.items():
                # only proposals mentioning these projects are reported
                if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                    print(' ' + pn + ': ' + str(count))

            print(' Filtered genomes by phylum:')
            trustedPhylumCounts = tally(trustedGenomeIds,
                                        lambda entry: entry['taxonomy'][1])
            for phylum, count in allPhylumCounts.items():
                print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = '; '.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted()

    with open('./data/lineage_stats.tsv', 'w') as fout:
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t'
                       + str(trustedStats.get(lineage, 0)) + '\n')
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Plot the completeness estimation error of marker genes vs. marker sets.

    For each selected genome, numReplicates partial genomes are simulated
    with IMG.sampleGenome. Completion is estimated from the fraction of
    marker genes contained in the sample and from the average per-set
    fraction over the co-located marker sets; percentCompletion is
    subtracted from both. Each genome contributes two boxes to the plot.

    Args:
        taxonomyStr: semicolon-separated lineage; also used for the plot
            file name and title.
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        percentCompletion: target completion passed to IMG.sampleGenome and
            subtracted from every estimate.
        numReplicates: number of simulated samples per genome.
        numGenomes: number of genomes to sample at random, or -1 for all.
        contigLen: contig length passed to the sampling helpers.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    numLineageGenomes = len(genomeIds)
    print('\nLineage ' + taxonomyStr + ' contains ' + str(numLineageGenomes) + ' genomes.')

    # build marker genes and colocated marker sets
    countTable = img.countTable(genomeIds)
    markerGenes = img.markerGenes(genomeIds, countTable,
                                  ubiquityThreshold * numLineageGenomes,
                                  singleCopyThreshold * numLineageGenomes)
    print(' Marker genes: ' + str(len(markerGenes)))

    geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                      spacingBetweenContigs=1e6)
    colocatedGenes = img.colocatedGenes(geneDistTable)
    colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
    print(' Co-located gene sets: ' + str(len(colocatedSets)))

    # random sample genomes
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # Python >= 3.11 rejects non-sequence populations (e.g. sets) in
        # random.sample, so hand it a list
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        genomeSize = metadata[genomeId]['genome size']
        mgCompletion = []
        msCompletion = []
        for _ in range(0, numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                genomeSize, percentCompletion, contigLen)

            # calculate completion with marker genes
            containedMarkerGenes = img.containedMarkerGenes(
                markerGenes, geneDistTable[genomeId],
                startPartialGenomeContigs, contigLen)
            mgCompletion.append(
                float(len(containedMarkerGenes)) / len(markerGenes) - percentCompletion)

            # calculate completion with marker set
            comp = 0.0
            for cs in colocatedSets:
                present = sum(1 for markerGeneId in cs
                              if markerGeneId in containedMarkerGenes)
                comp += float(present) / len(cs)
            msCompletion.append(comp / len(colocatedSets) - percentCompletion)

        # marker gene series first, then marker set series with a blank label
        plotData.append(mgCompletion)
        plotData.append(msCompletion)

        species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])
        plotLabels.append(species + ' (' + genomeId + ')')
        plotLabels.append('')

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Percent completion = %.2f' % percentCompletion)
    boxPlot.plot(plotFilename, plotData, plotLabels,
                 r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completeness estimates from marker sets of nested lineages.

    A marker set is built for every prefix of taxonomyStr (domain, domain;
    phylum, ...). Genomes from the most specific lineage are then sampled
    with IMG.sampleGenome, and for each lineage's marker set the estimated
    completion minus percentCompletion is recorded per replicate. One box
    per genome and lineage is plotted.

    Args:
        taxonomyStr: semicolon-separated lineage; also used for the plot
            file name and title.
        mostSpecificRank: not used by this method.
        minGenomes: not used by this method.
        ubiquityThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        singleCopyThreshold: fraction multiplied by the genome count and
            passed to IMG.markerGenes.
        percentCompletion: target completion passed to IMG.sampleGenome and
            subtracted from every estimate.
        numReplicates: number of simulated samples per genome.
        numGenomes: number of genomes to sample at random, or -1 for all.
        contigLen: contig length passed to the sampling helpers.
    """
    img = IMG()

    # every prefix of the taxonomy string is evaluated as its own lineage
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[0:r + 1]) for r in range(0, len(taxon))]

    # get all marker sets
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds now holds the genomes of the last
    # (most specific) lineage
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample requires a sequence on Python >= 3.11
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in range(len(lineages))]
        for _ in range(0, numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker set
            for i in range(len(lineages)):
                containedMarkerGenes = img.containedMarkerGenes(
                    markerGenes[i], geneDistTable[i][genomeId],
                    startPartialGenomeContigs, contigLen)

                comp = 0.0
                for cs in colocatedSets[i]:
                    present = 0
                    for markerGeneId in cs:
                        if markerGeneId in containedMarkerGenes:
                            present += 1
                    comp += float(present) / len(cs)
                completion[i].append(comp / len(colocatedSets[i]) - percentCompletion)

        # Exactly one label per data series. The label used to be appended
        # inside the replicate loop, so the labels no longer matched the
        # series handed to the box plot.
        for i, d in enumerate(completion):
            plotData.append(d)
            plotLabels.append(genomeId + ' - ' + lineages[i])

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Percent completion = %.2f' % percentCompletion)
    boxPlot.plot(plotFilename, plotData, plotLabels,
                 r'$\Delta$' + ' Percent Completion', '', False, title)