def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
    """Plot the number of co-located marker sets for a sweep of thresholds.

    Genome ids for the lineage are fetched with
    img.genomeIdsByTaxonomy(taxonomyStr, 'Final'). For every threshold
    produced by arange(maxThreshold, minThreshold, -stepSize), marker genes
    are identified, grouped into co-located sets, and the number of sets is
    recorded. The recorded sizes are drawn as a line plot saved to
    ./images/markerSetSize.<lineage>.png.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        minThreshold: Stop value of the threshold sweep.
        maxThreshold: Start value of the threshold sweep.
        stepSize: Amount subtracted from the threshold at each step.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    # print() with one parenthesised argument behaves the same on Python 2 and 3.
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # Build the sweep once so the loop and the plot's x-axis share the same values.
    thresholds = arange(maxThreshold, minThreshold, -stepSize)

    markerSetSizes = []
    countTable = img.countTable(genomeIds)
    for threshold in thresholds:
        # The same cutoff is used for both count arguments of img.markerGenes.
        cutoff = threshold * len(genomeIds)
        markerGenes = img.markerGenes(genomeIds, countTable, cutoff, cutoff)

        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(geneDistTable)
        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

        markerSetSizes.append(len(colocatedSets))

        # NOTE(review): the message says "marker set size" but reports
        # len(markerGenes), while the plotted value is len(colocatedSets) --
        # confirm which one is intended.
        print(' Threshold = %.2f, marker set size = %d' % (threshold, len(markerGenes)))

    # plot data
    plot = LinePlot()
    plotFilename = './images/markerSetSize.' + taxonomyStr.replace(';', '_') + '.png'
    title = taxonomyStr.replace(';', '; ')
    plot.plot(plotFilename, thresholds, markerSetSizes, 'Threshold', 'Marker Set Size', title)
def run(self, taxonomyStr, minThreshold, maxThreshold, stepSize):
    """Sweep the marker threshold and plot the resulting marker set sizes.

    For each value of arange(maxThreshold, minThreshold, -stepSize) the
    marker genes of the lineage are identified, the co-located sets are
    derived from them, and the number of sets is stored. A line plot of the
    stored sizes is written to ./images/markerSetSize.<lineage>.png.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        minThreshold: Stop value of the threshold sweep.
        maxThreshold: Start value of the threshold sweep.
        stepSize: Amount subtracted from the threshold at each step.
    """
    db = IMG()
    lineageGenomes = db.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(lineageGenomes)) + ' genomes.')

    sizes = []
    counts = db.countTable(lineageGenomes)
    for cutoff in arange(maxThreshold, minThreshold, -stepSize):
        # one scaled cutoff serves as both count arguments of markerGenes
        scaled = cutoff * len(lineageGenomes)
        markers = db.markerGenes(lineageGenomes, counts, scaled, scaled)

        distances = db.geneDistTable(lineageGenomes, markers, spacingBetweenContigs=1e6)
        markerSets = db.colocatedSets(db.colocatedGenes(distances), markers)
        sizes.append(len(markerSets))

        print(' Threshold = %.2f, marker set size = %d' % (cutoff, len(markers)))

    # render the sweep
    outFile = './images/markerSetSize.' + taxonomyStr.replace(';', '_') + '.png'
    heading = taxonomyStr.replace(';', '; ')
    xValues = arange(maxThreshold, minThreshold, -stepSize)
    LinePlot().plot(outFile, xValues, sizes, 'Threshold', 'Marker Set Size', heading)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
        percentCompletion, numReplicates, numGenomes, contigLen):
    """Box-plot the completion error of marker genes versus marker sets.

    Marker genes and co-located marker sets are built for the lineage. For
    each selected genome, numReplicates partial genomes are sampled with
    img.sampleGenome and two values are recorded per replicate: the fraction
    of marker genes contained minus percentCompletion, and the mean
    per-set fraction contained minus percentCompletion. Each genome adds two
    series to the box plot (marker genes first, marker sets second).

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        ubiquityThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        singleCopyThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        percentCompletion: Passed to img.sampleGenome and subtracted from
            every estimate.
        numReplicates: Number of sampled partial genomes per genome.
        numGenomes: Number of genomes to sample at random; -1 uses all.
        contigLen: Contig length passed to img.sampleGenome and
            img.containedMarkerGenes.
    """
    db = IMG()
    lineageGenomes = db.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('\nLineage ' + taxonomyStr + ' contains ' + str(len(lineageGenomes)) + ' genomes.')

    # marker genes and the co-located sets they form
    counts = db.countTable(lineageGenomes)
    markers = db.markerGenes(lineageGenomes, counts,
                             ubiquityThreshold * len(lineageGenomes),
                             singleCopyThreshold * len(lineageGenomes))
    print(' Marker genes: ' + str(len(markers)))

    distTable = db.geneDistTable(lineageGenomes, markers, spacingBetweenContigs=1e6)
    markerSets = db.colocatedSets(db.colocatedGenes(distTable), markers)
    print(' Co-located gene sets: ' + str(len(markerSets)))

    # genomes to simulate: every genome, or a random subset
    selected = lineageGenomes if numGenomes == -1 else random.sample(lineageGenomes, numGenomes)

    info = db.genomeMetadata('Final')
    labels = []
    series = []
    for gid in selected:
        geneErrors = []
        setErrors = []
        for _ in range(numReplicates):
            contigStarts = db.sampleGenome(info[gid]['genome size'], percentCompletion, contigLen)

            # error of the plain marker gene estimate
            found = db.containedMarkerGenes(markers, distTable[gid], contigStarts, contigLen)
            geneErrors.append(float(len(found)) / len(markers) - percentCompletion)

            # error of the marker set estimate: mean fraction present per set
            total = 0.0
            for markerSet in markerSets:
                hits = sum(1 for geneId in markerSet if geneId in found)
                total += float(hits) / len(markerSet)
            setErrors.append(total / len(markerSets) - percentCompletion)

        series.extend([geneErrors, setErrors])

        # the second series of each genome gets an empty label
        species = ' '.join(info[gid]['taxonomy'][ranksByLabel['Genus']:])
        labels.extend([species + ' (' + gid + ')', ''])

    # render the box plot
    outFile = ('./images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.'
               + str(percentCompletion) + '.errorbar.png')
    heading = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    BoxPlot().plot(outFile, series, labels, r'$\Delta$' + ' Percent Completion', '', False, heading)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
        singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
        contigLen):
    """Box-plot the completion error of marker sets built at each lineage level.

    taxonomyStr is expanded into one lineage per taxonomic prefix and marker
    genes / co-located marker sets are built for each of them. Genomes are
    then taken from the last (longest) lineage, numReplicates partial genomes
    are sampled per genome, and for every lineage the mean per-set fraction
    of contained markers minus percentCompletion is recorded. Each genome
    adds one box-plot series per lineage.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string.
        mostSpecificRank: Not used by this method.
        minGenomes: Not used by this method.
        ubiquityThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        singleCopyThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        percentCompletion: Passed to img.sampleGenome and subtracted from
            every estimate.
        numReplicates: Number of sampled partial genomes per genome.
        numGenomes: Number of genomes to sample at random; -1 uses all.
        contigLen: Contig length passed to img.sampleGenome and
            img.containedMarkerGenes.
    """
    img = IMG()

    # One lineage per taxonomic prefix: 'a;b;c' -> ['a', 'a;b', 'a;b;c'].
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[:r + 1]) for r in range(len(taxon))]

    # get all marker sets (index-aligned with lineages)
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds still holds the ids of the last lineage
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample needs a sequence on Python 3.11+; copying to a list
        # keeps this working if genomeIds happens to be a set.
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using the marker sets of each lineage
    metadata = img.genomeMetadata('Final')

    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in lineages]
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with the marker set of every lineage
            for mg, mdt, sets, deltas in zip(markerGenes, geneDistTable,
                                             colocatedSets, completion):
                containedMarkerGenes = img.containedMarkerGenes(
                    mg, mdt[genomeId], startPartialGenomeContigs, contigLen)

                comp = 0.0
                for cs in sets:
                    present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                    comp += float(present) / len(cs)

                deltas.append(comp / len(sets) - percentCompletion)

        # Exactly one label per series: labelling inside the replicate loop
        # would produce numReplicates labels for every series.
        for lineage, deltas in zip(lineages, completion):
            plotLabels.append(genomeId + ' - ' + lineage)
            plotData.append(deltas)

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold,
        percentCompletion, numReplicates, numGenomes, contigLen):
    """Box-plot the completion error of marker genes versus marker sets.

    Marker genes and co-located marker sets are built for the lineage. For
    each selected genome, numReplicates partial genomes are sampled with
    img.sampleGenome and two values are recorded per replicate: the fraction
    of marker genes contained minus percentCompletion, and the mean
    per-set fraction contained minus percentCompletion. Each genome adds two
    series to the box plot (marker genes first, marker sets second).

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        ubiquityThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        singleCopyThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        percentCompletion: Passed to img.sampleGenome and subtracted from
            every estimate.
        numReplicates: Number of sampled partial genomes per genome.
        numGenomes: Number of genomes to sample at random; -1 uses all.
        contigLen: Contig length passed to img.sampleGenome and
            img.containedMarkerGenes.
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    # print() with one parenthesised argument behaves the same on Python 2 and 3.
    print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # build marker genes and colocated marker sets
    countTable = img.countTable(genomeIds)
    markerGenes = img.markerGenes(genomeIds, countTable,
                                  ubiquityThreshold * len(genomeIds),
                                  singleCopyThreshold * len(genomeIds))
    print(' Marker genes: ' + str(len(markerGenes)))

    geneDistTable = img.geneDistTable(genomeIds, markerGenes)
    colocatedGenes = img.colocatedGenes(geneDistTable)
    colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
    print(' Co-located gene sets: ' + str(len(colocatedSets)))

    # random sample genomes
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample needs a sequence on Python 3.11+; copying to a list
        # keeps this working if genomeIds happens to be a set.
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')

    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        mgCompletion = []
        msCompletion = []
        # range() instead of xrange() so the loop also runs on Python 3.
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker genes
            containedMarkerGenes = img.containedMarkerGenes(
                markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
            mgCompletion.append(float(len(containedMarkerGenes)) / len(markerGenes) - percentCompletion)

            # calculate completion with marker set
            comp = 0.0
            for cs in colocatedSets:
                present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                comp += float(present) / len(cs)
            msCompletion.append(comp / len(colocatedSets) - percentCompletion)

        plotData.append(mgCompletion)
        plotData.append(msCompletion)

        # Two series per genome, so two labels: the second one is left blank.
        species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])
        plotLabels.append(species + ' (' + genomeId + ')')
        plotLabels.append('')

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates,
        minGenomes, maxGenomes, stepSize):
    """Plot marker set size as a function of the number of genomes sampled.

    For every genome count in range(minGenomes, maxGenomes, stepSize), random
    subsets of the lineage's genomes are drawn `replicates` times; marker
    genes are identified on each subset, grouped into co-located sets, and
    the number of sets is recorded. The mean and standard deviation per
    genome count are drawn as an error-bar plot and the raw sizes as a box
    plot, both saved under ./images/.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        ubiquityThreshold: Multiplied by the subset size and passed to
            MarkerGenes.identify.
        singleCopyThreshold: Multiplied by the subset size and passed to
            MarkerGenes.identify.
        replicates: Number of random subsets drawn per genome count.
        minGenomes: Smallest subset size; also the minimum number of genomes
            the lineage must contain.
        maxGenomes: Stop value (exclusive) of the subset sizes; -1 or any
            value above the number of genomes is replaced by that number.
        stepSize: Increment between successive subset sizes.

    Raises:
        SystemExit: If the lineage contains fewer than minGenomes genomes.
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []

    # -1 means "no limit"; never ask for more genomes than are available.
    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample needs a sequence on Python 3.11+; copying to a list once
    # keeps this working if genomeIds happens to be a set.
    genomePool = list(genomeIds)

    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomePool, numGenomes)

            markerGenes = markergenes.identify(
                genomeIdSubset, countTable,
                ubiquityThreshold * len(genomeIdSubset),
                singleCopyThreshold * len(genomeIdSubset))
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        # Every subset has exactly numGenomes members, so report the cutoffs
        # from numGenomes instead of a variable leaked from the inner loop.
        print('')
        print('Genomes: ' + str(numGenomes)
              + ', Ubiquity > ' + str(int(ubiquityThreshold * numGenomes))
              + ', Single-copy > ' + str(int(singleCopyThreshold * numGenomes)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data; both plots share the file name prefix, title and x-axis
    filePrefix = ('./images/markerset.' + taxonomyStr.replace(';', '_') + '.'
                  + str(ubiquityThreshold) + '-' + str(singleCopyThreshold))
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Ubiquity = %.2f' % ubiquityThreshold
             + ', Single-copy = %.2f' % singleCopyThreshold)
    genomeCounts = arange(minGenomes, maxGenomes, stepSize)

    errorBar = ErrorBar()
    errorBar.plot(filePrefix + '.errorbar.png', genomeCounts, meanMarkerSetSize,
                  stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

    boxPlot = BoxPlot()
    boxPlot.plot(filePrefix + '.boxplot.png', markerSetSizes, genomeCounts,
                 'Number of Genomes', 'Marker Set Size', True, title)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates,
        minGenomes, maxGenomes, stepSize):
    """Plot marker set size as a function of the number of genomes sampled.

    For every genome count in range(minGenomes, maxGenomes, stepSize), random
    subsets of the lineage's genomes are drawn `replicates` times; marker
    genes are identified on each subset, grouped into co-located sets, and
    the number of sets is recorded. The mean and standard deviation per
    genome count are drawn as an error-bar plot and the raw sizes as a box
    plot, both saved under ./images/.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string selecting the lineage.
        ubiquityThreshold: Multiplied by the subset size and passed to
            MarkerGenes.identify.
        singleCopyThreshold: Multiplied by the subset size and passed to
            MarkerGenes.identify.
        replicates: Number of random subsets drawn per genome count.
        minGenomes: Smallest subset size; also the minimum number of genomes
            the lineage must contain.
        maxGenomes: Stop value (exclusive) of the subset sizes; -1 or any
            value above the number of genomes is replaced by that number.
        stepSize: Increment between successive subset sizes.

    Raises:
        SystemExit: If the lineage contains fewer than minGenomes genomes.
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    # print() with one parenthesised argument behaves the same on Python 2 and 3.
    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []

    # -1 means "no limit"; never ask for more genomes than are available.
    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample needs a sequence on Python 3.11+; copying to a list once
    # keeps this working if genomeIds happens to be a set.
    genomePool = list(genomeIds)

    # range() instead of xrange() so the loops also run on Python 3.
    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomePool, numGenomes)

            markerGenes = markergenes.identify(
                genomeIdSubset, countTable,
                ubiquityThreshold * len(genomeIdSubset),
                singleCopyThreshold * len(genomeIdSubset))
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        # Every subset has exactly numGenomes members, so report the cutoffs
        # from numGenomes instead of a variable leaked from the inner loop.
        print('')
        print('Genomes: ' + str(numGenomes)
              + ', Ubiquity > ' + str(int(ubiquityThreshold * numGenomes))
              + ', Single-copy > ' + str(int(singleCopyThreshold * numGenomes)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data; both plots share the file name prefix, title and x-axis
    filePrefix = ('./images/markerset.' + taxonomyStr.replace(';', '_') + '.'
                  + str(ubiquityThreshold) + '-' + str(singleCopyThreshold))
    title = (taxonomyStr.replace(';', '; ') + '\n'
             + 'Ubiquity = %.2f' % ubiquityThreshold
             + ', Single-copy = %.2f' % singleCopyThreshold)
    genomeCounts = arange(minGenomes, maxGenomes, stepSize)

    errorBar = ErrorBar()
    errorBar.plot(filePrefix + '.errorbar.png', genomeCounts, meanMarkerSetSize,
                  stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

    boxPlot = BoxPlot()
    boxPlot.plot(filePrefix + '.boxplot.png', markerSetSizes, genomeCounts,
                 'Number of Genomes', 'Marker Set Size', True, title)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold,
        singleCopyThreshold, percentCompletion, numReplicates, numGenomes,
        contigLen):
    """Box-plot the completion error of marker sets built at each lineage level.

    taxonomyStr is expanded into one lineage per taxonomic prefix and marker
    genes / co-located marker sets are built for each of them. Genomes are
    then taken from the last (longest) lineage, numReplicates partial genomes
    are sampled per genome, and for every lineage the mean per-set fraction
    of contained markers minus percentCompletion is recorded. Each genome
    adds one box-plot series per lineage.

    Args:
        taxonomyStr: Semicolon-separated taxonomy string.
        mostSpecificRank: Not used by this method.
        minGenomes: Not used by this method.
        ubiquityThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        singleCopyThreshold: Multiplied by the genome count and passed to
            img.markerGenes.
        percentCompletion: Passed to img.sampleGenome and subtracted from
            every estimate.
        numReplicates: Number of sampled partial genomes per genome.
        numGenomes: Number of genomes to sample at random; -1 uses all.
        contigLen: Contig length passed to img.sampleGenome and
            img.containedMarkerGenes.
    """
    img = IMG()

    # One lineage per taxonomic prefix: 'a;b;c' -> ['a', 'a;b', 'a;b;c'].
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[:r + 1]) for r in range(len(taxon))]

    # get all marker sets (index-aligned with lineages)
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        # print() with one parenthesised argument behaves the same on Python 2 and 3.
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds still holds the ids of the last lineage
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample needs a sequence on Python 3.11+; copying to a list
        # keeps this working if genomeIds happens to be a set.
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using the marker sets of each lineage
    metadata = img.genomeMetadata('Final')

    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in lineages]
        # range() instead of xrange() so the loop also runs on Python 3.
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(
                metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with the marker set of every lineage
            for mg, mdt, sets, deltas in zip(markerGenes, geneDistTable,
                                             colocatedSets, completion):
                containedMarkerGenes = img.containedMarkerGenes(
                    mg, mdt[genomeId], startPartialGenomeContigs, contigLen)

                comp = 0.0
                for cs in sets:
                    present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                    comp += float(present) / len(cs)

                deltas.append(comp / len(sets) - percentCompletion)

        # Exactly one label per series: labelling inside the replicate loop
        # would produce numReplicates labels for every series.
        for lineage, deltas in zip(lineages, completion):
            plotLabels.append(genomeId + ' - ' + lineage)
            plotData.append(deltas)

    # plot data
    boxPlot = BoxPlot()
    plotFilename = ('./images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.'
                    + str(percentCompletion) + '.errorbar.png')
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)