def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
    """Write a report of genomes that fail their lineage's marker-set checks.

    For each lineage returned by ``IMG.lineagesByCriteria`` a marker set is
    derived from the lineage's 'Final' genomes, and every genome of that
    lineage is scored with ``MarkerSet.genomeCheck``. A genome is recorded
    when its completeness is below ``completenessThreshold`` or its
    contamination is above ``contaminationThreshold``. The result is written
    to ``./data/degenerate_genomes.tsv``: one row per genome, followed by one
    group of five lineage columns for every lineage in which it failed.

    Args:
        ubiquityThreshold: scaled by the lineage's genome count and passed to
            ``markerGenes`` (and, at 90%, to the pre-filter).
        singleCopyThreshold: scaled and used the same way as
            ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: lineages yielding fewer marker genes are skipped.
        completenessThreshold: genomes scoring below this are reported.
        contaminationThreshold: genomes scoring above this are reported.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))
    print('Min markers: ' + str(minMarkers))
    print('Completeness threshold: ' + str(completenessThreshold))
    print('Contamination threshold: ' + str(contaminationThreshold))

    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # genome id -> list of [lineage name, # genomes, # colocated sets, completeness, contamination]
    degenerateGenomes = {}
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        numGenomes = len(genomeIds)

        print('')
        print('Lineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

        markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold * numGenomes, singleCopyThreshold * numGenomes)
        if len(markerGenes) < minMarkers:
            continue

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = markerset.colocatedGenes(geneDistTable)
        colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

        # only the most specific (last) rank of the lineage string is reported
        lineageName = lineage.split(';')[-1].strip()
        for genomeId in genomeIds:
            completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

            if completeness < completenessThreshold or contamination > contaminationThreshold:
                degenerateGenomes.setdefault(genomeId, []).append(
                    [lineageName, numGenomes, len(colocatedSets), completeness, contamination])

    # write out degenerate genomes
    metadata = img.genomeMetadata('Final')

    # 'with' guarantees the file is flushed and closed even if a metadata
    # lookup or a format operation below raises.
    with open('./data/degenerate_genomes.tsv', 'w') as fout:
        # NOTE(review): the size column is divided by 1e6 below, which does not
        # look like the 'Gbps' unit in the header -- confirm before relabelling.
        fout.write('Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n')
        for genomeId, data in degenerateGenomes.items():
            genomeInfo = metadata[genomeId]
            fields = [
                genomeId,
                '; '.join(genomeInfo['taxonomy']),
                '%.2f' % (float(genomeInfo['genome size']) / 1e6),
                str(genomeInfo['scaffold count']),
                genomeInfo['biotic relationships'],
                genomeInfo['status'],
            ]

            # one group of lineage columns per lineage in which the genome failed
            for d in data:
                fields.extend([d[0], str(d[1]), str(d[2]), '%.3f' % d[3], '%.3f' % d[4]])

            fout.write('\t'.join(fields) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
    """Measure how much each lineage's marker set changes when genomes are withheld.

    For every lineage with at least ``minGenomes`` 'Final' genomes and at
    least ``minMarkers`` marker genes, the marker set is recomputed
    ``numReplicates`` times on a random subset of the genomes. The size of
    the symmetric difference to the full marker set is collected, and its
    mean and standard deviation are printed and appended to
    ``./data/lineage_evaluation.tsv``.

    Args:
        ubiquityThreshold: scaled by the number of genomes in use and passed
            to ``IMG.markerGenes`` (and, at 90%, to the pre-filter).
        singleCopyThreshold: scaled and used the same way as
            ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``; lineages with
            fewer genomes are also skipped here.
        minMarkers: lineages yielding fewer marker genes are skipped.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        percentGenomes: each replicate keeps
            ``int((1.0 - percentGenomes) * n + 0.5)`` of the ``n`` genomes.
        numReplicates: number of random subsets drawn per lineage.
    """
    img = IMG()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # 'with' closes the report even if a lineage fails part-way through the
    # loop; the original left the handle open on any exception.
    with open('./data/lineage_evaluation.tsv', 'w') as fout:
        fout.write('Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n')

        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            numGenomes = len(genomeIds)
            if numGenomes < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

            # calculate marker set for all genomes
            markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold * numGenomes, singleCopyThreshold * numGenomes)
            numMarkers = len(markerGenes)
            if numMarkers < minMarkers:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')
            print(' Marker genes: ' + str(numMarkers))

            fout.write(lineage + '\t' + str(numGenomes) + '\t' + str(numMarkers) + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

            # withhold select percentage of genomes and calculate new marker set
            subsetSize = int((1.0 - percentGenomes) * numGenomes + 0.5)
            changeMarkerSetSize = []
            for _ in range(numReplicates):
                subsetGenomeIds = random.sample(genomeIds, subsetSize)

                newMarkerGenes = img.markerGenes(subsetGenomeIds, countTable, ubiquityThreshold * len(subsetGenomeIds), singleCopyThreshold * len(subsetGenomeIds))

                changeMarkerSetSize.append(len(newMarkerGenes.symmetric_difference(markerGenes)))

            m = mean(changeMarkerSetSize)
            s = std(changeMarkerSetSize)

            # changes expressed as a percentage of the full marker set size
            meanPer = m * 100 / numMarkers
            meanStdPer = (m + s) * 100 / numMarkers
            mean2StdPer = (m + 2 * s) * 100 / numMarkers

            print(' Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, mean2StdPer))
            fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s, meanPer, meanStdPer, mean2StdPer) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers, completenessThreshold, contaminationThreshold):
    """Identify genomes that fail the marker-set checks of their lineage and report them.

    Each lineage from ``IMG.lineagesByCriteria`` gets a marker set built from
    its 'Final' genomes. Every genome in the lineage is scored with
    ``MarkerSet.genomeCheck`` and kept when completeness is below
    ``completenessThreshold`` or contamination is above
    ``contaminationThreshold``. Kept genomes are written to
    ``./data/degenerate_genomes.tsv``, one row per genome with five extra
    columns for each lineage in which the genome failed.

    Args:
        ubiquityThreshold: multiplied by the lineage's genome count before
            being passed to ``markerGenes``; 90% of it drives the pre-filter.
        singleCopyThreshold: handled the same way as ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: lineages with fewer marker genes are skipped.
        completenessThreshold: lower bound on acceptable completeness.
        contaminationThreshold: upper bound on acceptable contamination.
    """
    # print(...) with one argument is valid and equivalent on Python 2 and 3.
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))
    print('Min markers: ' + str(minMarkers))
    print('Completeness threshold: ' + str(completenessThreshold))
    print('Contamination threshold: ' + str(contaminationThreshold))

    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # Maps genome id to one result record per lineage in which it failed.
    degenerateGenomes = {}
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        genomeCount = len(genomeIds)

        print('')
        print('Lineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        countTable = img.countTable(genomeIds)
        countTable = img.filterTable(genomeIds, countTable,
                                     ubiquityThreshold * 0.9,
                                     singleCopyThreshold * 0.9)

        markerGenes = markerset.markerGenes(genomeIds, countTable,
                                            ubiquityThreshold * genomeCount,
                                            singleCopyThreshold * genomeCount)
        if len(markerGenes) < minMarkers:
            continue

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = markerset.colocatedGenes(geneDistTable)
        colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)

        # The report shows only the last rank of the ';'-separated lineage.
        rankName = lineage.split(';')[-1].strip()
        for genomeId in genomeIds:
            completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

            if completeness < completenessThreshold or contamination > contaminationThreshold:
                record = [rankName, genomeCount, len(colocatedSets), completeness, contamination]
                degenerateGenomes.setdefault(genomeId, []).append(record)

    # write out degenerate genomes
    metadata = img.genomeMetadata('Final')

    # The context manager closes the file on every exit path; the original
    # open()/close() pair leaked the handle when a row failed to format.
    with open('./data/degenerate_genomes.tsv', 'w') as fout:
        # NOTE(review): header says 'Gbps' but the value written is size / 1e6
        # -- confirm the intended unit before changing the label.
        fout.write(
            'Genome Id\tTaxonomy\tGenome Size (Gbps)\tScaffolds\tBiotic Relationships\tStatus\tLineage\t# genomes\tMarker set size\tCompleteness\tContamination\n'
        )
        for genomeId, data in degenerateGenomes.items():
            info = metadata[genomeId]
            row = [
                genomeId,
                '; '.join(info['taxonomy']),
                '%.2f' % (float(info['genome size']) / 1e6),
                str(info['scaffold count']),
                info['biotic relationships'],
                info['status'],
            ]

            for d in data:
                row.extend([d[0], str(d[1]), str(d[2]), '%.3f' % d[3], '%.3f' % d[4]])

            fout.write('\t'.join(row) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, percentGenomes, numReplicates):
    """Evaluate the stability of lineage marker sets under random genome withholding.

    Lineages with fewer than ``minGenomes`` 'Final' genomes or fewer than
    ``minMarkers`` marker genes are skipped. For the rest, the marker set is
    recomputed on ``numReplicates`` random genome subsets and compared with
    the full marker set via the size of their symmetric difference. The mean
    and standard deviation of that size, plus three percentages relative to
    the full marker set size, are written to
    ``./data/lineage_evaluation.tsv``.

    Args:
        ubiquityThreshold: multiplied by the number of genomes in use before
            being passed to ``IMG.markerGenes``; 90% of it drives the
            pre-filter.
        singleCopyThreshold: handled the same way as ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria`` and re-checked
            against each lineage's genome count.
        minMarkers: minimum marker set size for a lineage to be evaluated.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        percentGenomes: a replicate retains
            ``int((1.0 - percentGenomes) * n + 0.5)`` of the ``n`` genomes.
        numReplicates: number of replicates per lineage.
    """
    img = IMG()

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # The context manager guarantees the report is flushed and closed even
    # when a lineage raises mid-loop; the original closed it only on success.
    with open('./data/lineage_evaluation.tsv', 'w') as fout:
        fout.write(
            'Lineage\t# genomes\t# markers\tpercentage\tnum replicates\tmean\tstd\tmean %\tmean + std%\tmean + 2*std %\n'
        )

        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            genomeCount = len(genomeIds)
            if genomeCount < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            countTable = img.filterTable(genomeIds, countTable,
                                         ubiquityThreshold * 0.9,
                                         singleCopyThreshold * 0.9)

            # calculate marker set for all genomes
            markerGenes = img.markerGenes(genomeIds, countTable,
                                          ubiquityThreshold * genomeCount,
                                          singleCopyThreshold * genomeCount)
            markerCount = len(markerGenes)
            if markerCount < minMarkers:
                continue

            # print(...) with one argument is equivalent on Python 2 and 3.
            print('\nLineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')
            print(' Marker genes: ' + str(markerCount))

            fout.write(lineage + '\t' + str(genomeCount) + '\t' + str(markerCount)
                       + '\t%.2f' % percentGenomes + '\t' + str(numReplicates))

            # withhold select percentage of genomes and calculate new marker set
            keepCount = int((1.0 - percentGenomes) * genomeCount + 0.5)
            changeMarkerSetSize = []
            for _ in range(numReplicates):
                subsetGenomeIds = random.sample(genomeIds, keepCount)

                newMarkerGenes = img.markerGenes(
                    subsetGenomeIds, countTable,
                    ubiquityThreshold * len(subsetGenomeIds),
                    singleCopyThreshold * len(subsetGenomeIds))

                changeMarkerSetSize.append(
                    len(newMarkerGenes.symmetric_difference(markerGenes)))

            m = mean(changeMarkerSetSize)
            s = std(changeMarkerSetSize)

            # express the change relative to the full marker set size
            perMean = m * 100 / markerCount
            perMeanStd = (m + s) * 100 / markerCount
            perMean2Std = (m + 2 * s) * 100 / markerCount

            print(' Mean: %.2f, Std: %.2f, Per: %.2f' % (m, s, perMean2Std))
            fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f' % (m, s, perMean, perMeanStd, perMean2Std) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Box-plot the leave-one-out change in marker set size for each lineage.

    'prokaryotes' plus every lineage from ``IMG.lineagesByCriteria`` is
    processed, restricted to trusted genomes. For each lineage a full marker
    set is computed, then recomputed with each genome left out in turn; the
    per-genome difference in marker set size is printed in summary form and
    collected for a box plot saved under ``./images/``.

    Args:
        ubiquityThreshold: scaled by a genome count and passed to
            ``IMG.markerGenes`` (and, at 90%, to the pre-filter).
        singleCopyThreshold: scaled and used like ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: lineages whose full marker set is smaller are skipped.
    """
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))

    img = IMG()

    lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)

    # parallel lists: one entry per lineage that passes the minMarkers filter
    deltaMarkerSetSizes = []
    boxPlotLabels = []
    for lineage in lineages:
        lineageGenomes = img.genomeIdsByTaxonomy(lineage)
        trusted = img.trustedGenomes()
        genomeIds = list(lineageGenomes.intersection(trusted))
        genomeCount = len(genomeIds)

        print('')
        print('Lineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.pfamTable(genomeIds)
        pfamTable = img.filterPfamTable(genomeIds, pfamTable,
                                        ubiquityThreshold * 0.9,
                                        singleCopyThreshold * 0.9)

        # thresholds use genomeCount - 1, the same number of genomes that
        # every leave-one-out subset below contains
        numGenomes = genomeCount - 1
        fullMarkerSet = img.markerGenes(genomeIds, pfamTable,
                                        ubiquityThreshold * numGenomes,
                                        singleCopyThreshold * numGenomes)
        fullMarkerSetSize = len(fullMarkerSet)

        if fullMarkerSetSize < minMarkers:
            continue

        rankName = lineage.split(';')[-1].strip()
        boxPlotLabels.append(rankName + ' (' + str(genomeCount) + ', ' + str(fullMarkerSetSize) + ')')

        deltaMarkerSetSize = []
        for loo in range(genomeCount):
            # slicing past the end yields an empty list, so the last index
            # needs no special case
            genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

            looMarkerSet = img.markerGenes(genomeIdSubset, pfamTable,
                                           ubiquityThreshold * len(genomeIdSubset),
                                           singleCopyThreshold * len(genomeIdSubset))
            looSize = len(looMarkerSet)
            deltaMarkerSetSize.append(fullMarkerSetSize - looSize)

            if looSize > fullMarkerSetSize:
                print('[Warning] Unexpected!')

        deltaMarkerSetSizes.append(deltaMarkerSetSize)

        m = mean(deltaMarkerSetSize)
        s = std(deltaMarkerSetSize)

        looUbiquity = str(int(ubiquityThreshold * numGenomes))
        looSingleCopy = str(int(singleCopyThreshold * numGenomes))
        print(' LOO Ubiquity >= ' + looUbiquity + ', LOO Single-copy >= ' + looSingleCopy)
        print(' Delta Mean: %.2f +/- %.2f' % (m, s))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

    # plot data
    plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
    title = ('Ubiquity = %.2f' % ubiquityThreshold) + (', Single-copy = %.2f' % singleCopyThreshold)
    yLabel = r'$\Delta$' + ' Marker Set Size'

    boxPlot = BoxPlot()
    boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, yLabel, '', False, title)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Plot how much each lineage's marker set shrinks or grows when one genome is left out.

    The lineages are 'prokaryotes' followed by those returned from
    ``IMG.lineagesByCriteria``; only trusted genomes are used. For every
    lineage the full marker set is computed, then recomputed once per genome
    with that genome removed. The differences in marker set size are
    summarised on stdout and drawn as a box plot written under ``./images/``.

    Args:
        ubiquityThreshold: scaled by a genome count and passed to
            ``IMG.markerGenes`` (and, at 90%, to the pre-filter).
        singleCopyThreshold: scaled and used like ``ubiquityThreshold``.
        minGenomes: forwarded to ``IMG.lineagesByCriteria``.
        mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
        minMarkers: lineages whose full marker set is smaller are skipped.
    """
    # Single-argument print(...) and range() run unchanged on Python 2 and 3;
    # the print statement and xrange used previously are Python 2 only.
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))

    img = IMG()

    deltaMarkerSetSizes = []

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
    lineages = ['prokaryotes'] + lineages

    boxPlotLabels = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage)
        trusted = img.trustedGenomes()
        genomeIds = list(genomeIds.intersection(trusted))

        print('')
        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.pfamTable(genomeIds)
        pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

        # thresholds are scaled by len(genomeIds) - 1, which is the size of
        # each leave-one-out subset evaluated below
        numGenomes = len(genomeIds) - 1
        markerSet = img.markerGenes(genomeIds, pfamTable, ubiquityThreshold * numGenomes, singleCopyThreshold * numGenomes)
        fullMarkerSetSize = len(markerSet)

        if fullMarkerSetSize < minMarkers:
            continue

        boxPlotLabels.append(lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

        deltaMarkerSetSize = []
        for loo in range(len(genomeIds)):
            # genomeIds[loo + 1:] is empty for the last index, so one
            # expression covers every position
            genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

            markerSet = img.markerGenes(genomeIdSubset, pfamTable, ubiquityThreshold * len(genomeIdSubset), singleCopyThreshold * len(genomeIdSubset))
            deltaMarkerSetSize.append(fullMarkerSetSize - len(markerSet))

            if fullMarkerSetSize < len(markerSet):
                print('[Warning] Unexpected!')

        deltaMarkerSetSizes.append(deltaMarkerSetSize)

        m = mean(deltaMarkerSetSize)
        s = std(deltaMarkerSetSize)

        print(' LOO Ubiquity >= ' + str(int(ubiquityThreshold * numGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold * numGenomes)))
        print(' Delta Mean: %.2f +/- %.2f' % (m, s))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
    title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
    boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)