class SimulationScaffolds(object):
    """Benchmark marker-set estimates on genomes built from sampled scaffolds.

    For each test genome a subset of its scaffolds is retained, scaffolds
    from a second, randomly chosen genome are added as contamination, and
    the completeness/contamination reported by each marker set is compared
    with the sampled ("true") values. Worker processes perform the
    simulations; a single writer process stores the results in two
    tab-separated files under /tmp.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # parameter grid; one result row is produced per combination
        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    @staticmethod
    def _toBytes(text):
        """Return text as bytes so it can be written to a binary gzip stream.

        On Python 2 a str already is bytes and is returned untouched; on
        Python 3 the str is encoded as UTF-8.
        """
        if isinstance(text, bytes):
            return text
        return text.encode('utf-8')

    def __seqLens(self, seqs):
        """Calculate lengths of seqs.

        Parameters
        ----------
        seqs : dict
            Maps sequence id -> sequence.

        Returns
        -------
        (dict, int)
            Map of sequence id -> length, and the summed length of all seqs.
        """
        seqLens = dict((seqId, len(seq)) for seqId, seq in seqs.items())
        return seqLens, sum(seqLens.values())

    def __scoreMarkerSets(self, markerSets, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds, trueComp, trueCont, deltaCompIM, deltaContIM, deltaCompMS, deltaContMS):
        """Record estimation errors of each marker set for one simulated genome.

        For every marker set, the marker genes found on the retained test
        scaffolds and on the contaminating scaffolds are collected, and the
        difference between estimated and true completeness/contamination is
        appended to the given dicts (keyed by lineage string): once with
        bIndividualMarkers=True (the *IM dicts) and once with
        bIndividualMarkers=False (the *MS dicts).
        """
        for ms in markerSets.markerSetIter():
            containedMarkerGenes = defaultdict(list)
            self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
            self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
            deltaCompIM[ms.lineageStr].append(completeness - trueComp)
            deltaContIM[ms.lineageStr].append(contamination - trueCont)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
            deltaCompMS[ms.lineageStr].append(completeness - trueComp)
            deltaContMS[ms.lineageStr].append(contamination - trueCont)

    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel.

        Genome ids are read from queueIn until a None sentinel is seen. For
        each genome, one 18-field result tuple per (contigLen, percentComp,
        percentCont) combination is put on queueOut; the field order must
        match the unpacking in __writerThread.
        """
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
            testGeneDist = geneDistTable[testGenomeId]

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            numDescendants = {}
            for ms in binMarkerSets.markerSetIter():
                hits = dict((mg, testGeneDist[mg]) for mg in ms.getMarkerGenes() if mg in testGeneDist)
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                numDescendants[ms.lineageStr] = ms.numGenomes

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            # genomes that may act as the contamination source; built once per
            # test genome and sorted so sampling works on a sequence (sets are
            # not accepted by random.sample on recent Python versions) and does
            # not depend on set iteration order
            contCandidates = sorted(genomeIdsToTest - set([testGenomeId]))

            # NOTE: contigLen is not used by the scaffold sampling below; it
            # only labels the result row
            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        for _ in range(numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(contCandidates, 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs)
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize)

                            # the scaffolds that were NOT retained form the contamination
                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            self.__scoreMarkerSets(binMarkerSets, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds,
                                                   trueComp, trueCont, deltaComp, deltaCont, deltaCompSet, deltaContSet)
                            self.__scoreMarkerSets(refinedBinMarkerSet, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds,
                                                   trueComp, trueCont, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread.

        Reads 18-field result tuples from writerQueue until one whose first
        field is None arrives. For every marker set of every result, a row
        with the mean/std of the absolute errors goes to the summary file and
        a row with the raw per-replicate values goes to the gzipped file.
        Both files are closed even if writing fails part-way.
        """
        header = 'Genome Id\tContig len\t% comp\t% cont'
        header += '\tTaxonomy\tMarker set\t# descendants'
        header += '\tUnmodified comp\tUnmodified cont'

        summaryHeader = header
        summaryHeader += '\tIM comp\tIM comp std\tIM cont\tIM cont std'
        summaryHeader += '\tMS comp\tMS comp std\tMS cont\tMS cont std'
        summaryHeader += '\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std'
        summaryHeader += '\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n'

        rawHeader = header
        rawHeader += '\tIM comp\tIM cont'
        rawHeader += '\tMS comp\tMS cont'
        rawHeader += '\tRIM comp\tRIM cont'
        rawHeader += '\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n'

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)
        totalTests = numTestGenomes * testsPerGenome

        with open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w') as summaryOut, \
                gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb') as fout:
            summaryOut.write(summaryHeader)
            # the gzip stream is binary, so text is converted explicitly
            fout.write(self._toBytes(rawHeader))

            itemsProcessed = 0
            while True:
                testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
                if testGenomeId is None:
                    break

                itemsProcessed += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, totalTests, float(itemsProcessed) * 100 / totalTests)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                # column order of the error statistics in both files
                deltaTables = (deltaComp, deltaCont, deltaCompSet, deltaContSet,
                               deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                for markerSetId in unmodifiedComp:
                    rowPrefix = testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)
                    rowPrefix += '\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])
                    rowPrefix += '\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])

                    summaryRow = rowPrefix
                    rawRow = rowPrefix
                    for table in deltaTables:
                        deltas = table[markerSetId]
                        # element-wise abs works whether or not abs is NumPy's
                        absDeltas = [abs(d) for d in deltas]
                        summaryRow += '\t%.3f\t%.3f' % (mean(absDeltas), std(absDeltas))
                        rawRow += '\t%s' % ','.join(map(str, deltas))

                    rawRow += '\t%s' % ','.join(map(str, trueComps))
                    rawRow += '\t%s' % ','.join(map(str, trueConts))

                    summaryOut.write(summaryRow + '\n')
                    fout.write(self._toBytes(rawRow + '\n'))

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads):
        """Run the scaffold-sampling simulation over all suitable draft genomes.

        Test genomes are the genomes in the reference tree that are not
        marked 'Finished' and have at least minScaffolds scaffolds.
        numThreads worker processes are started along with one writer
        process; the call returns once the writer has finished.
        """
        random.seed(0)

        print('\n Reading reference genome tree.')
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print(' Number of taxa in tree: %d' % (len(tree.leaf_nodes())))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))

        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(draftGenomeIds))

        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)

        print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

        print('')
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print(' readLineageSpecificGenesToRemove: %.2f' % (end - start))

        print(' Pre-computing genome information for calculating marker sets:')
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print(' precomputeGenomeFamilyScaffolds: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print(' globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print(' precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print(' precomputeGenomeFamilyPositions: %.2f' % (end - start))

        print('')
        print(' Evaluating %d test genomes.' % len(genomeIdsToTest))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so that every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers are done: tell the writer to stop (18 fields, as unpacked there)
        writerQueue.put((None,) * 18)
        writeProc.join()
class Simulation(object):
    """Benchmark marker-set estimates on genomes fragmented into fixed-length contigs.

    For each draft test genome, partial/contaminated genomes are sampled
    for every combination of contig length, completeness and contamination,
    and the completeness/contamination reported by each marker set is
    compared with the sampled ("true") values. Worker processes perform the
    simulations; a single writer process stores the results in two
    tab-separated files under /tmp.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # parameter grid; one result row is produced per combination
        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    @staticmethod
    def _toBytes(text):
        """Return text as bytes so it can be written to a binary gzip stream.

        On Python 2 a str already is bytes and is returned untouched; on
        Python 3 the str is encoded as UTF-8.
        """
        if isinstance(text, bytes):
            return text
        return text.encode('utf-8')

    def __scoreMarkerSets(self, markerSets, geneDist, startPartialGenomeContigs, contigLen, trueComp, trueCont, deltaCompIM, deltaContIM, deltaCompMS, deltaContMS):
        """Record estimation errors of each marker set for one simulated genome.

        For every marker set, the marker genes contained in the sampled
        contigs are determined, and the difference between estimated and true
        completeness/contamination is appended to the given dicts (keyed by
        lineage string): once with bIndividualMarkers=True (the *IM dicts)
        and once with bIndividualMarkers=False (the *MS dicts).
        """
        for ms in markerSets.markerSetIter():
            containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDist, startPartialGenomeContigs, contigLen)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
            deltaCompIM[ms.lineageStr].append(completeness - trueComp)
            deltaContIM[ms.lineageStr].append(contamination - trueCont)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
            deltaCompMS[ms.lineageStr].append(completeness - trueComp)
            deltaContMS[ms.lineageStr].append(contamination - trueCont)

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel.

        Genome ids are read from queueIn until a None sentinel is seen. For
        each genome, one 20-field result tuple per (contigLen, percentComp,
        percentCont) combination is put on queueOut; the field order must
        match the unpacking in __writerThread (trueComps/trueConts appear
        twice in the tuple, as they do in the raw output file).
        """
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            # (a domain-level alternative, buildDomainMarkerSet, was tried here
            # and left disabled in the original code)
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
            testGeneDist = geneDistTable[testGenomeId]

            print('# marker genes: ', len(binMarkerSets.getMarkerGenes()))
            print('# genes in table: ', len(testGeneDist))

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            numDescendants = {}
            for ms in binMarkerSets.markerSetIter():
                hits = dict((mg, testGeneDist[mg]) for mg in ms.getMarkerGenes() if mg in testGeneDist)
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                numDescendants[ms.lineageStr] = ms.numGenomes

            print(completeness, contamination)

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            print('genomeSize', genomeSize)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        for _ in range(0, numReplicates):
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, percentComp, percentCont, contigLen)
                            print(contigLen, trueComp, trueCont, len(startPartialGenomeContigs))

                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            self.__scoreMarkerSets(binMarkerSets, testGeneDist, startPartialGenomeContigs, contigLen,
                                                   trueComp, trueCont, deltaComp, deltaCont, deltaCompSet, deltaContSet)
                            self.__scoreMarkerSets(refinedBinMarkerSet, testGeneDist, startPartialGenomeContigs, contigLen,
                                                   trueComp, trueCont, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread.

        Reads 20-field result tuples from writerQueue until one whose first
        field is None arrives. For every marker set of every result, a row
        with the mean true values and the mean/std of the absolute errors
        goes to the summary file, and a row with the raw per-replicate values
        goes to the gzipped file. Both files are closed even if writing fails
        part-way.
        """
        header = 'Genome Id\tContig len\t% comp\t% cont'
        header += '\tTaxonomy\tMarker set\t# descendants'
        header += '\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont'

        summaryHeader = header
        summaryHeader += '\tIM comp\tIM comp std\tIM cont\tIM cont std'
        summaryHeader += '\tMS comp\tMS comp std\tMS cont\tMS cont std'
        summaryHeader += '\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std'
        summaryHeader += '\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n'

        rawHeader = header
        rawHeader += '\tIM comp\tIM cont'
        rawHeader += '\tMS comp\tMS cont'
        rawHeader += '\tRIM comp\tRIM cont'
        rawHeader += '\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n'

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)
        totalTests = numTestGenomes * testsPerGenome

        # (the non-testing runs used /tmp/simulation.draft.summary.w_refinement_50.tsv
        # and /tmp/simulation.draft.w_refinement_50.tsv.gz)
        with open('/tmp/simulation.summary.testing.tsv', 'w') as summaryOut, \
                gzip.open('/tmp/simulation.testing.tsv.gz', 'wb') as fout:
            summaryOut.write(summaryHeader)
            # the gzip stream is binary, so text is converted explicitly;
            # writing str directly raises TypeError on Python 3
            fout.write(self._toBytes(rawHeader))

            itemsProcessed = 0
            while True:
                testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
                if testGenomeId is None:
                    break

                itemsProcessed += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, totalTests, float(itemsProcessed) * 100 / totalTests)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                # column order of the error statistics in both files
                deltaTables = (deltaComp, deltaCont, deltaCompSet, deltaContSet,
                               deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                trueCompsStr = ','.join(map(str, trueComps))
                trueContsStr = ','.join(map(str, trueConts))

                for markerSetId in unmodifiedComp:
                    rowPrefix = testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)
                    rowPrefix += '\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])
                    rowPrefix += '\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])

                    # both columns are means: the header labels them
                    # 'True comp' and 'True cont' (the contamination column
                    # used to be written as a standard deviation)
                    summaryRow = rowPrefix + '\t%.3f\t%.3f' % (mean(trueComps), mean(trueConts))
                    rawRow = rowPrefix + '\t%s\t%s' % (trueCompsStr, trueContsStr)

                    for table in deltaTables:
                        deltas = table[markerSetId]
                        # element-wise abs works whether or not abs is NumPy's
                        absDeltas = [abs(d) for d in deltas]
                        summaryRow += '\t%.3f\t%.3f' % (mean(absDeltas), std(absDeltas))
                        rawRow += '\t%s' % ','.join(map(str, deltas))

                    rawRow += '\t%s\t%s' % (trueCompsStr, trueContsStr)

                    summaryOut.write(summaryRow + '\n')
                    fout.write(self._toBytes(rawRow + '\n'))

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
        """Run the contig-sampling simulation over all draft genomes in the tree.

        Test genomes are the genomes in the reference tree that are not
        marked 'Finished'. numThreads worker processes are started along with
        one writer process; the call returns once the writer has finished.
        """
        print('\n Reading reference genome tree.')
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_full.refpkg', 'genome_tree.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print(' Number of taxa in tree: %d' % (len(tree.leaf_nodes())))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes for testing
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))

        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(genomeIdsToTest))

        print('')
        print(' Pre-computing genome information for calculating marker sets:')
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print(' readLineageSpecificGenesToRemove: %.2f' % (end - start))

        # NOTE: the three pre-computation steps below are disabled, as in the
        # original code; their timing lines are still printed.
        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print(' globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print(' precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print(' precomputeGenomeFamilyPositions: %.2f' % (end - start))

        print('')
        print(' Evaluating %d test genomes.' % len(genomeIdsToTest))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so that every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers are done: tell the writer to stop (20 fields, as unpacked there)
        writerQueue.put((None,) * 20)
        writeProc.join()
class SimulationScaffolds(object):
    """Benchmark marker-set estimates on genomes built from sampled scaffolds.

    For each test genome a subset of its scaffolds is retained, scaffolds
    from a second, randomly chosen genome are added as contamination, and
    the completeness/contamination reported by each marker set is compared
    with the sampled ("true") values. Worker processes perform the
    simulations; a single writer process stores the results in two
    tab-separated files under /tmp.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # parameter grid; one result row is produced per combination
        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    @staticmethod
    def _toBytes(text):
        """Return text as bytes so it can be written to a binary gzip stream.

        On Python 2 a str already is bytes and is returned untouched; on
        Python 3 the str is encoded as UTF-8.
        """
        if isinstance(text, bytes):
            return text
        return text.encode('utf-8')

    def __seqLens(self, seqs):
        """Calculate lengths of seqs.

        Parameters
        ----------
        seqs : dict
            Maps sequence id -> sequence.

        Returns
        -------
        (dict, int)
            Map of sequence id -> length, and the summed length of all seqs.
        """
        seqLens = dict((seqId, len(seq)) for seqId, seq in seqs.items())
        return seqLens, sum(seqLens.values())

    def __scoreMarkerSets(self, markerSets, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds, trueComp, trueCont, deltaCompIM, deltaContIM, deltaCompMS, deltaContMS):
        """Record estimation errors of each marker set for one simulated genome.

        For every marker set, the marker genes found on the retained test
        scaffolds and on the contaminating scaffolds are collected, and the
        difference between estimated and true completeness/contamination is
        appended to the given dicts (keyed by lineage string): once with
        bIndividualMarkers=True (the *IM dicts) and once with
        bIndividualMarkers=False (the *MS dicts).
        """
        for ms in markerSets.markerSetIter():
            containedMarkerGenes = defaultdict(list)
            self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
            self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
            deltaCompIM[ms.lineageStr].append(completeness - trueComp)
            deltaContIM[ms.lineageStr].append(contamination - trueCont)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
            deltaCompMS[ms.lineageStr].append(completeness - trueComp)
            deltaContMS[ms.lineageStr].append(contamination - trueCont)

    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel.

        Genome ids are read from queueIn until a None sentinel is seen. For
        each genome, one 18-field result tuple per (contigLen, percentComp,
        percentCont) combination is put on queueOut; the field order must
        match the unpacking in __writerThread.
        """
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
            testGeneDist = geneDistTable[testGenomeId]

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            numDescendants = {}
            for ms in binMarkerSets.markerSetIter():
                hits = dict((mg, testGeneDist[mg]) for mg in ms.getMarkerGenes() if mg in testGeneDist)
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                numDescendants[ms.lineageStr] = ms.numGenomes

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            # genomes that may act as the contamination source; built once per
            # test genome and sorted so sampling works on a sequence (sets are
            # not accepted by random.sample on recent Python versions) and does
            # not depend on set iteration order
            contCandidates = sorted(genomeIdsToTest - set([testGenomeId]))

            # NOTE: contigLen is not used by the scaffold sampling below; it
            # only labels the result row
            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        for _ in range(numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(contCandidates, 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs)
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize)

                            # the scaffolds that were NOT retained form the contamination
                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            self.__scoreMarkerSets(binMarkerSets, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds,
                                                   trueComp, trueCont, deltaComp, deltaCont, deltaCompSet, deltaContSet)
                            self.__scoreMarkerSets(refinedBinMarkerSet, testGenomeId, retainedTestSeqs, contGenomeId, contSampledSeqIds,
                                                   trueComp, trueCont, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread.

        Reads 18-field result tuples from writerQueue until one whose first
        field is None arrives. For every marker set of every result, a row
        with the mean/std of the absolute errors goes to the summary file and
        a row with the raw per-replicate values goes to the gzipped file.
        Both files are closed even if writing fails part-way.
        """
        header = 'Genome Id\tContig len\t% comp\t% cont'
        header += '\tTaxonomy\tMarker set\t# descendants'
        header += '\tUnmodified comp\tUnmodified cont'

        summaryHeader = header
        summaryHeader += '\tIM comp\tIM comp std\tIM cont\tIM cont std'
        summaryHeader += '\tMS comp\tMS comp std\tMS cont\tMS cont std'
        summaryHeader += '\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std'
        summaryHeader += '\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n'

        rawHeader = header
        rawHeader += '\tIM comp\tIM cont'
        rawHeader += '\tMS comp\tMS cont'
        rawHeader += '\tRIM comp\tRIM cont'
        rawHeader += '\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n'

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)
        totalTests = numTestGenomes * testsPerGenome

        with open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w') as summaryOut, \
                gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb') as fout:
            summaryOut.write(summaryHeader)
            # the gzip stream is binary, so text is converted explicitly
            fout.write(self._toBytes(rawHeader))

            itemsProcessed = 0
            while True:
                testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
                if testGenomeId is None:
                    break

                itemsProcessed += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, totalTests, float(itemsProcessed) * 100 / totalTests)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                # column order of the error statistics in both files
                deltaTables = (deltaComp, deltaCont, deltaCompSet, deltaContSet,
                               deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined)

                for markerSetId in unmodifiedComp:
                    rowPrefix = testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)
                    rowPrefix += '\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])
                    rowPrefix += '\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])

                    summaryRow = rowPrefix
                    rawRow = rowPrefix
                    for table in deltaTables:
                        deltas = table[markerSetId]
                        # element-wise abs works whether or not abs is NumPy's
                        absDeltas = [abs(d) for d in deltas]
                        summaryRow += '\t%.3f\t%.3f' % (mean(absDeltas), std(absDeltas))
                        rawRow += '\t%s' % ','.join(map(str, deltas))

                    rawRow += '\t%s' % ','.join(map(str, trueComps))
                    rawRow += '\t%s' % ','.join(map(str, trueConts))

                    summaryOut.write(summaryRow + '\n')
                    fout.write(self._toBytes(rawRow + '\n'))

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads):
        """Run the scaffold-sampling simulation over all suitable draft genomes.

        Test genomes are the genomes in the reference tree that are not
        marked 'Finished' and have at least minScaffolds scaffolds.
        numThreads worker processes are started along with one writer
        process; the call returns once the writer has finished.
        """
        random.seed(0)

        print('\n Reading reference genome tree.')
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print(' Number of taxa in tree: %d' % (len(tree.leaf_nodes())))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))

        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(draftGenomeIds))

        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)

        print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

        print('')
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print(' readLineageSpecificGenesToRemove: %.2f' % (end - start))

        print(' Pre-computing genome information for calculating marker sets:')
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print(' precomputeGenomeFamilyScaffolds: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print(' globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print(' precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print(' precomputeGenomeFamilyPositions: %.2f' % (end - start))

        print('')
        print(' Evaluating %d test genomes.' % len(genomeIdsToTest))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so that every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers are done: tell the writer to stop (18 fields, as unpacked there)
        writerQueue.put((None,) * 18)
        writeProc.join()
class Simulation(object):
    """Benchmark marker-set estimates of completeness and contamination.

    For every draft genome in the reference tree, partial/contaminated
    genomes are sampled over a grid of contig lengths, completeness levels
    and contamination levels. For each grid cell the difference between the
    estimated and the sampled ("true") values is collected and written to a
    summary table and to a gzip-compressed table of per-replicate values.

    Column prefixes used in the output files:
      IM  - estimate made with bIndividualMarkers=True
      MS  - estimate made with bIndividualMarkers=False
      RIM - as IM, but using the refined bin marker set
      RMS - as MS, but using the refined bin marker set
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG(
            "/srv/whitlam/bio/db/checkm/img/img_metadata.tsv",
            "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv",
        )

        # simulation grid: every combination of these values is evaluated
        # for every test genome
        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __recordDeltas(self, markerSets, geneDist, startPartialGenomeContigs, contigLen,
                       trueComp, trueCont, deltaCompIM, deltaContIM, deltaCompMS, deltaContMS):
        """Append (estimate - truth) for each marker set in markerSets.

        Each marker set is evaluated twice on the sampled contigs: once with
        bIndividualMarkers=True (results appended to deltaCompIM/deltaContIM)
        and once with bIndividualMarkers=False (deltaCompMS/deltaContMS).
        The four delta containers map lineage string -> list and are
        modified in place.
        """
        for ms in markerSets.markerSetIter():
            containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(
                ms.getMarkerGenes(), geneDist, startPartialGenomeContigs, contigLen
            )

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
            deltaCompIM[ms.lineageStr].append(completeness - trueComp)
            deltaContIM[ms.lineageStr].append(contamination - trueCont)

            completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
            deltaCompMS[ms.lineageStr].append(completeness - trueComp)
            deltaContMS[ms.lineageStr].append(contamination - trueCont)

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel.

        Genome ids are read from queueIn until a None sentinel is seen. For
        each genome one 20-field result tuple per grid cell
        (contig length x % completeness x % contamination) is put on
        queueOut; the field order is the one __writerThread expects.
        """
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label("IMG_" + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
                tree,
                testNode.parent_node,
                ubiquityThreshold,
                singleCopyThreshold,
                bMarkerSet=True,
                genomeIdsToRemove=[testGenomeId],
            )
            # alternative kept from the original (disabled):
            #   self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node,
            #       ubiquityThreshold, singleCopyThreshold, bMarkerSet = False,
            #       genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable(
                [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0
            )
            geneDist = geneDistTable[testGenomeId]

            print("# marker genes:  %d" % len(binMarkerSets.getMarkerGenes()))
            print("# genes in table:  %d" % len(geneDist))

            # estimate completeness of unmodified genome; the number of
            # genomes behind each marker set is recorded here as well, since
            # it does not depend on the grid cell or replicate
            unmodifiedComp = {}
            unmodifiedCont = {}
            numDescendants = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {mg: geneDist[mg] for mg in ms.getMarkerGenes() if mg in geneDist}
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                numDescendants[ms.lineageStr] = ms.numGenomes
                print("%s %s" % (completeness, contamination))

            # estimate completion and contamination of genome after subsampling
            # using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + ".fna"))
            print("genomeSize %s" % (genomeSize,))

            taxonomy = ";".join(metadata[testGenomeId]["taxonomy"])

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        for _ in range(0, numReplicates):
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(
                                genomeSize, percentComp, percentCont, contigLen
                            )
                            print("%s %s %s %s" % (contigLen, trueComp, trueCont, len(startPartialGenomeContigs)))

                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            self.__recordDeltas(
                                binMarkerSets, geneDist, startPartialGenomeContigs, contigLen,
                                trueComp, trueCont,
                                deltaComp, deltaCont, deltaCompSet, deltaContSet,
                            )
                            self.__recordDeltas(
                                refinedBinMarkerSet, geneDist, startPartialGenomeContigs, contigLen,
                                trueComp, trueCont,
                                deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined,
                            )

                        # trueComps/trueConts appear twice: the writer and the
                        # detailed output format both expect 20 fields
                        queueOut.put(
                            (
                                testGenomeId,
                                contigLen,
                                percentComp,
                                percentCont,
                                taxonomy,
                                numDescendants,
                                unmodifiedComp,
                                unmodifiedCont,
                                trueComps,
                                trueConts,
                                deltaComp,
                                deltaCont,
                                deltaCompSet,
                                deltaContSet,
                                deltaCompRefined,
                                deltaContRefined,
                                deltaCompSetRefined,
                                deltaContSetRefined,
                                trueComps,
                                trueConts,
                            )
                        )

    def __absMeanStd(self, deltas):
        """Return (mean, std) of the absolute values of deltas."""
        # element-wise abs works with the builtin as well as a NumPy abs
        magnitudes = [abs(delta) for delta in deltas]
        return mean(magnitudes), std(magnitudes)

    def __rowPrefix(self, record, markerSetId):
        """Return the leading fields shared by summary and detailed rows."""
        testGenomeId, contigLen, percentComp, percentCont, taxonomy = record[:5]
        numDescendants, unmodifiedComp, unmodifiedCont = record[5:8]
        return [
            testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont),
            taxonomy,
            markerSetId,
            str(numDescendants[markerSetId]),
            "%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]),
        ]

    def __summaryRow(self, record, markerSetId):
        """Format one line of the summary table for a single marker set.

        record is a result tuple produced by __workerThread. The true
        completeness and contamination are reported as means over the
        replicates; each delta column pair is the mean and standard
        deviation of the absolute errors.
        """
        trueComps, trueConts = record[8:10]
        fields = self.__rowPrefix(record, markerSetId)
        # both columns are means: the header labels them 'True comp' and
        # 'True cont' (the second was previously written as a std)
        fields.append("%.3f\t%.3f" % (mean(trueComps), mean(trueConts)))
        for deltas in record[10:18]:
            fields.append("%.3f\t%.3f" % self.__absMeanStd(deltas[markerSetId]))
        return "\t".join(fields) + "\n"

    def __detailRow(self, record, markerSetId):
        """Format one line of the detailed table for a single marker set.

        Every per-replicate list is written as a comma-separated field.
        The true completeness/contamination lists are written both after
        the unmodified estimates and again as the final two columns, as
        declared by the file header.
        """
        trueComps, trueConts = record[8:10]
        replicateLists = [trueComps, trueConts]
        replicateLists.extend(deltas[markerSetId] for deltas in record[10:18])
        replicateLists.extend([trueComps, trueConts])

        fields = self.__rowPrefix(record, markerSetId)
        fields.extend(",".join(map(str, values)) for values in replicateLists)
        return "\t".join(fields) + "\n"

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread.

        Result tuples are read from writerQueue until one whose first field
        is None arrives. Both output files are closed even if formatting or
        writing a row raises.
        """
        summaryHeader = (
            "Genome Id\tContig len\t% comp\t% cont"
            "\tTaxonomy\tMarker set\t# descendants"
            "\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont"
            "\tIM comp\tIM comp std\tIM cont\tIM cont std"
            "\tMS comp\tMS comp std\tMS cont\tMS cont std"
            "\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std"
            "\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n"
        )
        detailHeader = (
            "Genome Id\tContig len\t% comp\t% cont"
            "\tTaxonomy\tMarker set\t# descendants"
            "\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont"
            "\tIM comp\tIM cont"
            "\tMS comp\tMS cont"
            "\tRIM comp\tRIM cont"
            "\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n"
        )

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)
        totalTests = numTestGenomes * testsPerGenome
        itemsProcessed = 0

        # alternative output paths kept from the original (disabled):
        #   /tmp/simulation.draft.summary.w_refinement_50.tsv
        #   /tmp/simulation.draft.w_refinement_50.tsv.gz
        with open("/tmp/simulation.summary.testing.tsv", "w") as summaryOut, \
                gzip.open("/tmp/simulation.testing.tsv.gz", "wb") as fout:
            summaryOut.write(summaryHeader)
            fout.write(detailHeader)

            while True:
                record = writerQueue.get(block=True, timeout=None)
                if record[0] is None:
                    break

                itemsProcessed += 1
                statusStr = " Finished processing %d of %d (%.2f%%) test cases." % (
                    itemsProcessed,
                    totalTests,
                    float(itemsProcessed) * 100 / totalTests,
                )
                sys.stdout.write("%s\r" % statusStr)
                sys.stdout.flush()

                # record[6] is unmodifiedComp; its keys are the marker set ids
                for markerSetId in record[6]:
                    summaryOut.write(self.__summaryRow(record, markerSetId))
                    fout.write(self.__detailRow(record, markerSetId))

        sys.stdout.write("\n")

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
        """Run the simulation over all draft genomes in the reference tree.

        numThreads worker processes evaluate genomes in parallel and a
        single writer process stores their results.
        """
        print("\n Reading reference genome tree.")
        treeFile = os.path.join("/srv", "db", "checkm", "genome_tree", "genome_tree_full.refpkg", "genome_tree.tre")
        tree = dendropy.Tree.get_from_path(treeFile, schema="newick", as_rooted=True, preserve_underscores=True)

        print(" Number of taxa in tree: %d" % (len(tree.leaf_nodes())))

        genomesInTree = {leaf.taxon.label.replace("IMG_", "") for leaf in tree.leaf_iter()}

        # get all draft genomes for testing
        print("")
        metadata = self.img.genomeMetadata()
        print(" Total genomes: %d" % len(metadata))

        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, "status", "Finished")
        print(" Number of draft genomes: %d" % len(genomeIdsToTest))

        print("")
        print(" Pre-computing genome information for calculating marker sets:")
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print(" readLineageSpecificGenesToRemove: %.2f" % (end - start))

        # The three pre-computation steps below are disabled in this class;
        # the timing lines are still printed so the console output keeps
        # its shape.
        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print(" globalGeneCountTable: %.2f" % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print(" precomputeGenomeSeqLens: %.2f" % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print(" precomputeGenomeFamilyPositions: %.2f" % (end - start))

        print("")
        print(" Evaluating %d test genomes." % len(genomeIdsToTest))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so that every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(
                target=self.__workerThread,
                args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue),
            )
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers are done: tell the writer to stop (20 fields, matching
        # the width of a result tuple)
        writerQueue.put((None,) * 20)
        writeProc.join()
class PlotScaffoldLenVsMarkers(object):
    """Plot scaffold length against marker gene density for draft archaeal genomes."""

    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    def __markersPerBase(self, numMarkers, numBases):
        """Return numMarkers / numBases, or 0.0 when numBases is zero."""
        if numBases == 0:
            return 0.0
        return float(numMarkers) / numBases

    def __tallyMarkers(self, metadata, genomeFamilyScaffolds, genomeSeqLens, bacMarkers, arMarkers, shortScaffoldLen=10000):
        """Count marker genes per scaffold.

        Parameters:
          metadata              - genome id -> dict with a 'taxonomy' list
                                  whose first entry is the domain
          genomeFamilyScaffolds - genome id -> {family id -> scaffold ids}
          genomeSeqLens         - genome id -> {scaffold id -> length}
          bacMarkers, arMarkers - marker gene ids used for genomes whose
                                  domain is 'Bacteria' / anything else
          shortScaffoldLen      - scaffolds strictly shorter than this are
                                  tallied as short

        Returns the tuple (scaffoldLen, percentageMarkers, totalMarkers,
        totalSequenceLen, markersOnShortScaffolds, totalShortScaffoldLen),
        where scaffoldLen maps scaffold id -> length and percentageMarkers
        maps scaffold id -> fraction of the domain marker genes found on it.
        """
        totalMarkers = 0
        totalSequenceLen = 0
        markersOnShortScaffolds = 0
        totalShortScaffoldLen = 0

        scaffoldLen = {}
        percentageMarkers = defaultdict(float)
        for genomeId, markerIds in genomeFamilyScaffolds.items():
            domain = metadata[genomeId]['taxonomy'][0]
            markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers

            for markerId in markerGenes:
                # marker ids such as 'PF00001.12' are looked up as
                # 'pfam00001' (prefix replaced, version suffix removed)
                if markerId.startswith('PF'):
                    markerId = markerId.replace('PF', 'pfam')
                    markerId = markerId[0:markerId.rfind('.')]

                if markerId not in markerIds:
                    continue

                for scaffoldId in markerIds[markerId]:
                    length = genomeSeqLens[genomeId][scaffoldId]
                    scaffoldLen[scaffoldId] = length
                    percentageMarkers[scaffoldId] += 1.0/len(markerGenes)

                    # NOTE(review): a scaffold's length is added once per
                    # marker found on it, so scaffolds carrying several
                    # markers contribute several times - confirm that this
                    # weighting is intended.
                    totalMarkers += 1
                    totalSequenceLen += length

                    if length < shortScaffoldLen:
                        markersOnShortScaffolds += 1
                        totalShortScaffoldLen += length

        return (scaffoldLen, percentageMarkers, totalMarkers, totalSequenceLen,
                markersOnShortScaffolds, totalShortScaffoldLen)

    def run(self, minScaffolds=20, shortScaffoldLen=10000):
        """Tally markers on the scaffolds of draft archaeal genomes and plot them.

        Parameters:
          minScaffolds     - only genomes with at least this many scaffolds
                             are examined
          shortScaffoldLen - length below which a scaffold is reported as
                             short
        """
        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))

        arGenome = set(genomeId for genomeId in metadata if metadata[genomeId]['taxonomy'][0] == 'Archaea')

        draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(draftGenomeIds))

        genomeIdsToTest = set(genomeId for genomeId in draftGenomeIds
                              if metadata[genomeId]['scaffold count'] >= minScaffolds)
        print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

        print('')
        print(' Calculating genome information for calculating marker sets:')
        genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)

        print(' Calculating genome sequence lengths.')
        genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)

        print(' Determining domain-specific marker sets.')
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()

        bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
        arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
        print(' There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers)))

        print(' Determining percentage of markers on each scaffold.')
        (scaffoldLen, percentageMarkers, totalMarkers, totalSequenceLen,
         markersOnShortScaffolds, totalShortScaffoldLen) = self.__tallyMarkers(
            metadata, genomeFamilyScaffolds, genomeSeqLens, bacMarkers, arMarkers, shortScaffoldLen)

        # lengths are raw base counts, so they are labelled bp; the density
        # helper avoids a ZeroDivisionError when nothing was tallied
        print('Markers on short scaffolds: %d over %d bp (%f markers per base)' % (
            markersOnShortScaffolds, totalShortScaffoldLen,
            self.__markersPerBase(markersOnShortScaffolds, totalShortScaffoldLen)))
        print('Total markers on scaffolds: %d over %d bp (%f markers per base)' % (
            totalMarkers, totalSequenceLen,
            self.__markersPerBase(totalMarkers, totalSequenceLen)))

        print(' Create plot.')
        plotLens = []
        plotPerMarkers = []
        for scaffoldId in percentageMarkers:
            plotLens.append(scaffoldLen[scaffoldId])
            # fraction of markers per base, scaled by 1e6
            plotPerMarkers.append(percentageMarkers[scaffoldId]/scaffoldLen[scaffoldId] * 1e6)

        scatterPlot = ScatterPlot()
        scatterPlot.plot(plotLens, plotPerMarkers)
        scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')
class PlotScaffoldLenVsMarkers(object):
    """Plot scaffold length against marker gene density for draft archaeal genomes."""

    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    def __markersPerBase(self, numMarkers, numBases):
        """Return numMarkers / numBases, or 0.0 when numBases is zero."""
        if numBases == 0:
            return 0.0
        return float(numMarkers) / numBases

    def __tallyMarkers(self, metadata, genomeFamilyScaffolds, genomeSeqLens, bacMarkers, arMarkers, shortScaffoldLen=10000):
        """Count marker genes per scaffold.

        Parameters:
          metadata              - genome id -> dict with a 'taxonomy' list
                                  whose first entry is the domain
          genomeFamilyScaffolds - genome id -> {family id -> scaffold ids}
          genomeSeqLens         - genome id -> {scaffold id -> length}
          bacMarkers, arMarkers - marker gene ids used for genomes whose
                                  domain is 'Bacteria' / anything else
          shortScaffoldLen      - scaffolds strictly shorter than this are
                                  tallied as short

        Returns the tuple (scaffoldLen, percentageMarkers, totalMarkers,
        totalSequenceLen, markersOnShortScaffolds, totalShortScaffoldLen),
        where scaffoldLen maps scaffold id -> length and percentageMarkers
        maps scaffold id -> fraction of the domain marker genes found on it.
        """
        totalMarkers = 0
        totalSequenceLen = 0
        markersOnShortScaffolds = 0
        totalShortScaffoldLen = 0

        scaffoldLen = {}
        percentageMarkers = defaultdict(float)
        # .items() behaves the same as .iteritems() here and is also valid
        # on Python 3
        for genomeId, markerIds in genomeFamilyScaffolds.items():
            domain = metadata[genomeId]['taxonomy'][0]
            markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers

            for markerId in markerGenes:
                # marker ids such as 'PF00001.12' are looked up as
                # 'pfam00001' (prefix replaced, version suffix removed)
                if markerId.startswith('PF'):
                    markerId = markerId.replace('PF', 'pfam')
                    markerId = markerId[0:markerId.rfind('.')]

                if markerId not in markerIds:
                    continue

                for scaffoldId in markerIds[markerId]:
                    length = genomeSeqLens[genomeId][scaffoldId]
                    scaffoldLen[scaffoldId] = length
                    percentageMarkers[scaffoldId] += 1.0/len(markerGenes)

                    # NOTE(review): a scaffold's length is added once per
                    # marker found on it, so scaffolds carrying several
                    # markers contribute several times - confirm that this
                    # weighting is intended.
                    totalMarkers += 1
                    totalSequenceLen += length

                    if length < shortScaffoldLen:
                        markersOnShortScaffolds += 1
                        totalShortScaffoldLen += length

        return (scaffoldLen, percentageMarkers, totalMarkers, totalSequenceLen,
                markersOnShortScaffolds, totalShortScaffoldLen)

    def run(self, minScaffolds=20, shortScaffoldLen=10000):
        """Tally markers on the scaffolds of draft archaeal genomes and plot them.

        Parameters:
          minScaffolds     - only genomes with at least this many scaffolds
                             are examined
          shortScaffoldLen - length below which a scaffold is reported as
                             short
        """
        # Single-argument print(...) calls produce the same output whether
        # print is a statement (Python 2) or a function (Python 3).

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))

        arGenome = set(genomeId for genomeId in metadata if metadata[genomeId]['taxonomy'][0] == 'Archaea')

        draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(draftGenomeIds))

        genomeIdsToTest = set(genomeId for genomeId in draftGenomeIds
                              if metadata[genomeId]['scaffold count'] >= minScaffolds)
        print(' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

        print('')
        print(' Calculating genome information for calculating marker sets:')
        genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)

        print(' Calculating genome sequence lengths.')
        genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)

        print(' Determining domain-specific marker sets.')
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()

        bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
        arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
        print(' There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers)))

        print(' Determining percentage of markers on each scaffold.')
        (scaffoldLen, percentageMarkers, totalMarkers, totalSequenceLen,
         markersOnShortScaffolds, totalShortScaffoldLen) = self.__tallyMarkers(
            metadata, genomeFamilyScaffolds, genomeSeqLens, bacMarkers, arMarkers, shortScaffoldLen)

        # lengths are raw base counts, so they are labelled bp; the density
        # helper avoids a ZeroDivisionError when nothing was tallied
        print('Markers on short scaffolds: %d over %d bp (%f markers per base)' % (
            markersOnShortScaffolds, totalShortScaffoldLen,
            self.__markersPerBase(markersOnShortScaffolds, totalShortScaffoldLen)))
        print('Total markers on scaffolds: %d over %d bp (%f markers per base)' % (
            totalMarkers, totalSequenceLen,
            self.__markersPerBase(totalMarkers, totalSequenceLen)))

        print(' Create plot.')
        plotLens = []
        plotPerMarkers = []
        for scaffoldId in percentageMarkers:
            plotLens.append(scaffoldLen[scaffoldId])
            # fraction of markers per base, scaled by 1e6
            plotPerMarkers.append(percentageMarkers[scaffoldId]/scaffoldLen[scaffoldId] * 1e6)

        scatterPlot = ScatterPlot()
        scatterPlot.plot(plotLens, plotPerMarkers)
        scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')