class Simulation(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG() self.simContigLen = 10000 def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut): """Select marker set for parent edge of specified internal node.""" # get genomes descendant from each child of the specified internal node leaves = [] for child in internalNode.child_nodes(): genomeIds = set() for leaf in child.leaf_nodes(): genomeId = leaf.taxon.label.replace('IMG_', '') genomeIds.add(genomeId) duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, []) for dup in duplicateGenomes: dupId = dup.replace('IMG_', '') genomeIds.add(dupId) leaves.append(genomeIds) # make sure each set of leaves contains at least a minimum number of genomes orderedLeaves = sorted(leaves, key=len) if len(orderedLeaves[0]) < 5: queueOut.put(('NA', -1, -1, -1, -1, -1)) return # calculate marker genes with all genomes in lineage with the fewest genomes removed binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = orderedLeaves[0]) # evaluate accuracy of completeness and contamination estimations on different partial genomes from lineage with fewest genomes testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100)) deltaComp = defaultdict(list) deltaCont = defaultdict(list) for testGenomeId in testGenomeIds: geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0) genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna')) repsPerGenome = 100 for _ in xrange(0, repsPerGenome): testComp = random.uniform(0.5, 1.0) testCont = random.uniform(0, 0.2) trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen) for ms in binMarkerGenes.markerSetIter(): containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True) if completeness == 0.0: print ms.getMarkerGenes() print geneDistTable[testGenomeId] print startPartialGenomeContigs print genomeSize print '*****************' + testGenomeId sys.exit() deltaComp[ms.lineageStr].append(completeness - trueComp) deltaCont[ms.lineageStr].append(contamination - trueCont) # determine lineage-specific marker set with best average performance curBest = 1000 bestUID = None dCompBest = 0 dContBest = 0 for lineageStr in deltaComp: dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr]))) if (dComp + dCont) < curBest: dCompBest = dComp dContBest = dCont dCompStdBest = std(abs(array(deltaComp[lineageStr]))) dContStdBest = std(abs(array(deltaCont[lineageStr]))) bestUID = lineageStr.split('|')[0] curBest = dComp + dCont queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest)) def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut): """Process each data item in parallel.""" while True: internalNode = queueIn.get(block=True, timeout=None) if internalNode == None: break self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut) def __writerThread(self, numInternalNodes, writerQueue): """Store or write results of worker threads in a single thread.""" fout = open('/tmp/simInferBestMarkerSet.tsv', 'w') fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n') itemsProcessed = 0 while True: internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None) if internalNode == None: break itemsProcessed += 1 statusStr = ' Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() if internalNode != 'NA': fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest)) fout.close() sys.stdout.write('\n') def run(self, ubiquityThreshold, singleCopyThreshold, numThreads): random.seed(0) print '\n Calculating global gene count table.' metadata = self.img.genomeMetadata() self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys()) print '\n Reading reference genome tree.' treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) print ' Evaluating %d internal nodes.' % len(tree.internal_nodes()) workerQueue = mp.Queue() writerQueue = mp.Queue() for internalNode in tree.internal_nodes(): if internalNode.parent_node != None: workerQueue.put(internalNode) for _ in range(numThreads): workerQueue.put(None) metadata = self.img.genomeMetadata() workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)] writeProc = mp.Process(target = self.__writerThread, args = (len(tree.internal_nodes())-1, writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put((None, None, None, None, None, None)) writeProc.join()
class SimulationScaffolds(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv') self.contigLens = [5000, 20000, 50000] self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0] self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2] def __seqLens(self, seqs): """Calculate lengths of seqs.""" genomeSize = 0 seqLens = {} for seqId, seq in seqs.iteritems(): seqLens[seqId] = len(seq) genomeSize += len(seq) return seqLens, genomeSize def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut): """Process each data item in parallel.""" while True: testGenomeId = queueIn.get(block=True, timeout=None) if testGenomeId == None: break # build marker sets for evaluating test genome testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId) binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet( tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId]) # determine distribution of all marker genes within the test genome geneDistTable = self.img.geneDistTable( [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0) # estimate completeness of unmodified genome unmodifiedComp = {} unmodifiedCont = {} for ms in binMarkerSets.markerSetIter(): hits = {} for mg in ms.getMarkerGenes(): if mg in geneDistTable[testGenomeId]: hits[mg] = geneDistTable[testGenomeId][mg] completeness, contamination = ms.genomeCheck( hits, bIndividualMarkers=True) unmodifiedComp[ms.lineageStr] = completeness unmodifiedCont[ms.lineageStr] = contamination # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets testSeqs = readFasta( os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna')) testSeqLens, genomeSize = self.__seqLens(testSeqs) for contigLen in self.contigLens: for percentComp in self.percentComps: for percentCont in self.percentConts: deltaComp = defaultdict(list) deltaCont = defaultdict(list) deltaCompSet = defaultdict(list) deltaContSet = defaultdict(list) deltaCompRefined = defaultdict(list) deltaContRefined = defaultdict(list) deltaCompSetRefined = defaultdict(list) deltaContSetRefined = defaultdict(list) trueComps = [] trueConts = [] numDescendants = {} for i in xrange(0, numReplicates): # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove # (this will sample >= the desired level of completeness) retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement( percentComp, testSeqLens, genomeSize) trueComps.append(trueComp) # select a random genome to use as a source of contamination contGenomeId = random.sample( genomeIdsToTest - set([testGenomeId]), 1)[0] contSeqs = readFasta( os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna')) contSeqLens, contGenomeSize = self.__seqLens( contSeqs) seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement( 1 - percentCont, contSeqLens, contGenomeSize) contSampledSeqIds = set( contSeqs.keys()).difference(seqsToRetain) trueCont = 100.0 - trueRetainedPer trueConts.append(trueCont) for ms in binMarkerSets.markerSetIter(): numDescendants[ms.lineageStr] = ms.numGenomes containedMarkerGenes = defaultdict(list) self.markerSetBuilder.markerGenesOnScaffolds( ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes) self.markerSetBuilder.markerGenesOnScaffolds( ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=True) deltaComp[ms.lineageStr].append(completeness - trueComp) deltaCont[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=False) deltaCompSet[ms.lineageStr].append( completeness - trueComp) deltaContSet[ms.lineageStr].append( contamination - trueCont) for ms in refinedBinMarkerSet.markerSetIter(): containedMarkerGenes = defaultdict(list) self.markerSetBuilder.markerGenesOnScaffolds( ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes) self.markerSetBuilder.markerGenesOnScaffolds( ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=True) deltaCompRefined[ms.lineageStr].append( completeness - trueComp) deltaContRefined[ms.lineageStr].append( contamination - trueCont) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=False) deltaCompSetRefined[ms.lineageStr].append( completeness - trueComp) deltaContSetRefined[ms.lineageStr].append( contamination - trueCont) taxonomy = ';'.join(metadata[testGenomeId]['taxonomy']) queueOut.put( (testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts)) def __writerThread(self, numTestGenomes, writerQueue): """Store or write results of worker threads in a single thread.""" summaryOut = open( '/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w') summaryOut.write('Genome Id\tContig len\t% comp\t% cont') summaryOut.write('\tTaxonomy\tMarker set\t# descendants') summaryOut.write('\tUnmodified comp\tUnmodified cont') summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std') summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std') summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std') summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n') fout = gzip.open( '/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb') fout.write('Genome Id\tContig len\t% comp\t% cont') fout.write('\tTaxonomy\tMarker set\t# descendants') fout.write('\tUnmodified comp\tUnmodified cont') fout.write('\tIM comp\tIM cont') fout.write('\tMS comp\tMS cont') fout.write('\tRIM comp\tRIM cont') fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n') testsPerGenome = len(self.contigLens) * len(self.percentComps) * len( self.percentConts) itemsProcessed = 0 while True: testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get( block=True, timeout=None) if testGenomeId == None: break itemsProcessed += 1 statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % ( itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() for markerSetId in unmodifiedComp: summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) summaryOut.write( '\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId])))) summaryOut.write('\n') fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) fout.write( '\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId]))) fout.write( '\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId]))) fout.write( '\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, trueComps))) fout.write('\t%s' % ','.join(map(str, trueConts))) fout.write('\n') summaryOut.close() fout.close() sys.stdout.write('\n') def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads): random.seed(0) print '\n Reading reference genome tree.' treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) print ' Number of taxa in tree: %d' % (len(tree.leaf_nodes())) genomesInTree = set() for leaf in tree.leaf_iter(): genomesInTree.add(leaf.taxon.label.replace('IMG_', '')) # get all draft genomes consisting of a user-specific minimum number of scaffolds print '' metadata = self.img.genomeMetadata() print ' Total genomes: %d' % len(metadata) draftGenomeIds = genomesInTree - self.img.filterGenomeIds( genomesInTree, metadata, 'status', 'Finished') print ' Number of draft genomes: %d' % len(draftGenomeIds) genomeIdsToTest = set() for genomeId in draftGenomeIds: if metadata[genomeId]['scaffold count'] >= minScaffolds: genomeIdsToTest.add(genomeId) print ' Number of draft genomes with >= %d scaffolds: %d' % ( minScaffolds, len(genomeIdsToTest)) print '' start = time.time() self.markerSetBuilder.readLineageSpecificGenesToRemove() end = time.time() print ' readLineageSpecificGenesToRemove: %.2f' % (end - start) print ' Pre-computing genome information for calculating marker sets:' start = time.time() self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys()) end = time.time() print ' precomputeGenomeFamilyScaffolds: %.2f' % (end - start) start = time.time() self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable( metadata.keys()) end = time.time() print ' globalGeneCountTable: %.2f' % (end - start) start = time.time() self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print ' precomputeGenomeSeqLens: %.2f' % (end - start) start = time.time() self.markerSetBuilder.precomputeGenomeFamilyPositions( metadata.keys(), 0) end = time.time() print ' precomputeGenomeFamilyPositions: %.2f' % (end - start) print '' print ' Evaluating %d test genomes.' % len(genomeIdsToTest) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in list(genomeIdsToTest): workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [ mp.Process(target=self.__workerThread, args=(tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads) ] writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)) writeProc.join()
class SimulationScaffolds(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv') self.contigLens = [5000, 20000, 50000] self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0] self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2] def __seqLens(self, seqs): """Calculate lengths of seqs.""" genomeSize = 0 seqLens = {} for seqId, seq in seqs.iteritems(): seqLens[seqId] = len(seq) genomeSize += len(seq) return seqLens, genomeSize def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut): """Process each data item in parallel.""" while True: testGenomeId = queueIn.get(block=True, timeout=None) if testGenomeId == None: break # build marker sets for evaluating test genome testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId) binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = [testGenomeId]) # determine distribution of all marker genes within the test genome geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0) # estimate completeness of unmodified genome unmodifiedComp = {} unmodifiedCont = {} for ms in binMarkerSets.markerSetIter(): hits = {} for mg in ms.getMarkerGenes(): if mg in geneDistTable[testGenomeId]: hits[mg] = geneDistTable[testGenomeId][mg] completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True) unmodifiedComp[ms.lineageStr] = completeness unmodifiedCont[ms.lineageStr] = contamination # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna')) testSeqLens, genomeSize = self.__seqLens(testSeqs) for contigLen in self.contigLens: for percentComp in self.percentComps: for percentCont in self.percentConts: deltaComp = defaultdict(list) deltaCont = defaultdict(list) deltaCompSet = defaultdict(list) deltaContSet = defaultdict(list) deltaCompRefined = defaultdict(list) deltaContRefined = defaultdict(list) deltaCompSetRefined = defaultdict(list) deltaContSetRefined = defaultdict(list) trueComps = [] trueConts = [] numDescendants = {} for i in xrange(0, numReplicates): # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove # (this will sample >= the desired level of completeness) retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize) trueComps.append(trueComp) # select a random genome to use as a source of contamination contGenomeId = random.sample(genomeIdsToTest - set([testGenomeId]), 1)[0] contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna')) contSeqLens, contGenomeSize = self.__seqLens(contSeqs) seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize) contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain) trueCont = 100.0 - trueRetainedPer trueConts.append(trueCont) for ms in binMarkerSets.markerSetIter(): numDescendants[ms.lineageStr] = ms.numGenomes containedMarkerGenes= defaultdict(list) self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes) self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True) deltaComp[ms.lineageStr].append(completeness - trueComp) deltaCont[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False) deltaCompSet[ms.lineageStr].append(completeness - trueComp) deltaContSet[ms.lineageStr].append(contamination - trueCont) for ms in refinedBinMarkerSet.markerSetIter(): containedMarkerGenes= defaultdict(list) self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes) self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True) deltaCompRefined[ms.lineageStr].append(completeness - trueComp) deltaContRefined[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False) deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp) deltaContSetRefined[ms.lineageStr].append(contamination - trueCont) taxonomy = ';'.join(metadata[testGenomeId]['taxonomy']) queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts)) def __writerThread(self, numTestGenomes, writerQueue): """Store or write results of worker threads in a single thread.""" summaryOut = open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w') summaryOut.write('Genome Id\tContig len\t% comp\t% cont') summaryOut.write('\tTaxonomy\tMarker set\t# descendants') summaryOut.write('\tUnmodified comp\tUnmodified cont') summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std') summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std') summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std') summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n') fout = gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb') fout.write('Genome Id\tContig len\t% comp\t% cont') fout.write('\tTaxonomy\tMarker set\t# descendants') fout.write('\tUnmodified comp\tUnmodified cont') fout.write('\tIM comp\tIM cont') fout.write('\tMS comp\tMS cont') fout.write('\tRIM comp\tRIM cont') fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n') testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts) itemsProcessed = 0 while True: testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None) if testGenomeId == None: break itemsProcessed += 1 statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes*testsPerGenome, float(itemsProcessed)*100/(numTestGenomes*testsPerGenome)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() for markerSetId in unmodifiedComp: summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId])))) summaryOut.write('\n') fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, trueComps))) fout.write('\t%s' % ','.join(map(str, trueConts))) fout.write('\n') summaryOut.close() fout.close() sys.stdout.write('\n') def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads): random.seed(0) print '\n Reading reference genome tree.' treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) print ' Number of taxa in tree: %d' % (len(tree.leaf_nodes())) genomesInTree = set() for leaf in tree.leaf_iter(): genomesInTree.add(leaf.taxon.label.replace('IMG_', '')) # get all draft genomes consisting of a user-specific minimum number of scaffolds print '' metadata = self.img.genomeMetadata() print ' Total genomes: %d' % len(metadata) draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished') print ' Number of draft genomes: %d' % len(draftGenomeIds) genomeIdsToTest = set() for genomeId in draftGenomeIds: if metadata[genomeId]['scaffold count'] >= minScaffolds: genomeIdsToTest.add(genomeId) print ' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)) print '' start = time.time() self.markerSetBuilder.readLineageSpecificGenesToRemove() end = time.time() print ' readLineageSpecificGenesToRemove: %.2f' % (end - start) print ' Pre-computing genome information for calculating marker sets:' start = time.time() self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys()) end = time.time() print ' precomputeGenomeFamilyScaffolds: %.2f' % (end - start) start = time.time() self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys()) end = time.time() print ' globalGeneCountTable: %.2f' % (end - start) start = time.time() self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print ' precomputeGenomeSeqLens: %.2f' % (end - start) start = time.time() self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0) end = time.time() print ' precomputeGenomeFamilyPositions: %.2f' % (end - start) print '' print ' Evaluating %d test genomes.' % len(genomeIdsToTest) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in list(genomeIdsToTest): workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)] writeProc = mp.Process(target = self.__writerThread, args = (len(genomeIdsToTest), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)) writeProc.join()
class Simulation(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv') self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000] self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0] self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2] def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut): """Process each data item in parallel.""" while True: testGenomeId = queueIn.get(block=True, timeout=None) if testGenomeId == None: break # build marker sets for evaluating test genome testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId) binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId]) #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId]) # determine distribution of all marker genes within the test genome geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0) print('# marker genes: ', len(binMarkerSets.getMarkerGenes())) print('# genes in table: ', len(geneDistTable[testGenomeId])) # estimate completeness of unmodified genome unmodifiedComp = {} unmodifiedCont = {} for ms in binMarkerSets.markerSetIter(): hits = {} for mg in ms.getMarkerGenes(): if mg in geneDistTable[testGenomeId]: hits[mg] = geneDistTable[testGenomeId][mg] completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True) unmodifiedComp[ms.lineageStr] = completeness unmodifiedCont[ms.lineageStr] = contamination print(completeness, contamination) # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna')) print('genomeSize', genomeSize) for contigLen in self.contigLens: for percentComp in self.percentComps: for percentCont in self.percentConts: deltaComp = defaultdict(list) deltaCont = defaultdict(list) deltaCompSet = defaultdict(list) deltaContSet = defaultdict(list) deltaCompRefined = defaultdict(list) deltaContRefined = defaultdict(list) deltaCompSetRefined = defaultdict(list) deltaContSetRefined = defaultdict(list) trueComps = [] trueConts = [] numDescendants = {} for _ in range(0, numReplicates): trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, percentComp, percentCont, contigLen) print(contigLen, trueComp, trueCont, len(startPartialGenomeContigs)) trueComps.append(trueComp) trueConts.append(trueCont) for ms in binMarkerSets.markerSetIter(): numDescendants[ms.lineageStr] = ms.numGenomes containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True) deltaComp[ms.lineageStr].append(completeness - trueComp) deltaCont[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False) deltaCompSet[ms.lineageStr].append(completeness - trueComp) deltaContSet[ms.lineageStr].append(contamination - trueCont) for ms in refinedBinMarkerSet.markerSetIter(): containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True) deltaCompRefined[ms.lineageStr].append(completeness - trueComp) deltaContRefined[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False) deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp) deltaContSetRefined[ms.lineageStr].append(contamination - trueCont) taxonomy = ';'.join(metadata[testGenomeId]['taxonomy']) queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts)) def __writerThread(self, numTestGenomes, writerQueue): """Store or write results of worker threads in a single thread.""" # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w') summaryOut = open('/tmp/simulation.summary.testing.tsv', 'w') summaryOut.write('Genome Id\tContig len\t% comp\t% cont') summaryOut.write('\tTaxonomy\tMarker set\t# descendants') summaryOut.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont') summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std') summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std') summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std') summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n') # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wb') fout = gzip.open('/tmp/simulation.testing.tsv.gz', 'wb') fout.write('Genome Id\tContig len\t% comp\t% cont') fout.write('\tTaxonomy\tMarker set\t# descendants') fout.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont') fout.write('\tIM comp\tIM cont') fout.write('\tMS comp\tMS cont') fout.write('\tRIM comp\tRIM cont') fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n') testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts) itemsProcessed = 0 while True: testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None) if testGenomeId == None: break itemsProcessed += 1 statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() for markerSetId in unmodifiedComp: summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) summaryOut.write('\t%.3f\t%.3f' % (mean(trueComps), std(trueConts))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId])))) summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId])))) summaryOut.write('\n') fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId])) fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) fout.write('\t%s' % ','.join(map(str, trueComps))) fout.write('\t%s' % ','.join(map(str, trueConts))) fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId]))) fout.write('\t%s' % ','.join(map(str, trueComps))) fout.write('\t%s' % ','.join(map(str, trueConts))) fout.write('\n') summaryOut.close() fout.close() sys.stdout.write('\n') def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads): print('\n Reading reference genome tree.') treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_full.refpkg', 'genome_tree.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) print(' Number of taxa in tree: %d' % (len(tree.leaf_nodes()))) genomesInTree = set() for leaf in tree.leaf_iter(): genomesInTree.add(leaf.taxon.label.replace('IMG_', '')) # get all draft genomes for testing print('') metadata = self.img.genomeMetadata() print(' Total genomes: %d' % len(metadata)) genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished') print(' Number of draft genomes: %d' % len(genomeIdsToTest)) print('') print(' Pre-computing genome information for calculating marker sets:') start = time.time() self.markerSetBuilder.readLineageSpecificGenesToRemove() end = time.time() print(' readLineageSpecificGenesToRemove: %.2f' % (end - start)) start = time.time() # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys()) end = time.time() print(' globalGeneCountTable: %.2f' % (end - start)) start = time.time() # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print(' precomputeGenomeSeqLens: %.2f' % (end - start)) start = time.time() # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0) end = time.time() print(' precomputeGenomeFamilyPositions: %.2f' % (end - start)) print('') print(' Evaluating %d test genomes.' % len(genomeIdsToTest)) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in genomeIdsToTest: workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)] writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)) writeProc.join()
class Simulation(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG( "/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv" ) self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000] self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0] self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2] def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut): """Process each data item in parallel.""" while True: testGenomeId = queueIn.get(block=True, timeout=None) if testGenomeId == None: break # build marker sets for evaluating test genome testNode = tree.find_node_with_taxon_label("IMG_" + testGenomeId) binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet( tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId], ) #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId]) # determine distribution of all marker genes within the test genome geneDistTable = self.img.geneDistTable( [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0 ) print "# marker genes: ", len(binMarkerSets.getMarkerGenes()) print "# genes in table: ", len(geneDistTable[testGenomeId]) # estimate completeness of unmodified genome unmodifiedComp = {} unmodifiedCont = {} for ms in binMarkerSets.markerSetIter(): hits = {} for mg in ms.getMarkerGenes(): if mg in geneDistTable[testGenomeId]: hits[mg] = geneDistTable[testGenomeId][mg] completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True) unmodifiedComp[ms.lineageStr] = completeness unmodifiedCont[ms.lineageStr] = contamination print completeness, contamination # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + ".fna")) print "genomeSize", genomeSize for contigLen in self.contigLens: for percentComp in self.percentComps: for percentCont in self.percentConts: deltaComp = defaultdict(list) deltaCont = defaultdict(list) deltaCompSet = defaultdict(list) deltaContSet = defaultdict(list) deltaCompRefined = defaultdict(list) deltaContRefined = defaultdict(list) deltaCompSetRefined = defaultdict(list) deltaContSetRefined = defaultdict(list) trueComps = [] trueConts = [] numDescendants = {} for _ in xrange(0, numReplicates): trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome( genomeSize, percentComp, percentCont, contigLen ) print contigLen, trueComp, trueCont, len(startPartialGenomeContigs) trueComps.append(trueComp) trueConts.append(trueCont) for ms in binMarkerSets.markerSetIter(): numDescendants[ms.lineageStr] = ms.numGenomes containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes( ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen, ) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=True ) deltaComp[ms.lineageStr].append(completeness - trueComp) deltaCont[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=False ) deltaCompSet[ms.lineageStr].append(completeness - trueComp) deltaContSet[ms.lineageStr].append(contamination - trueCont) for ms in refinedBinMarkerSet.markerSetIter(): containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes( ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen, ) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=True ) deltaCompRefined[ms.lineageStr].append(completeness - trueComp) deltaContRefined[ms.lineageStr].append(contamination - trueCont) completeness, contamination = ms.genomeCheck( containedMarkerGenes, bIndividualMarkers=False ) deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp) deltaContSetRefined[ms.lineageStr].append(contamination - trueCont) taxonomy = ";".join(metadata[testGenomeId]["taxonomy"]) queueOut.put( ( testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts, ) ) def __writerThread(self, numTestGenomes, writerQueue): """Store or write results of worker threads in a single thread.""" # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w') summaryOut = open("/tmp/simulation.summary.testing.tsv", "w") summaryOut.write("Genome Id\tContig len\t% comp\t% cont") summaryOut.write("\tTaxonomy\tMarker set\t# descendants") summaryOut.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont") summaryOut.write("\tIM comp\tIM comp std\tIM cont\tIM cont std") summaryOut.write("\tMS comp\tMS comp std\tMS cont\tMS cont std") summaryOut.write("\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std") summaryOut.write("\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n") # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wb') fout = gzip.open("/tmp/simulation.testing.tsv.gz", "wb") fout.write("Genome Id\tContig len\t% comp\t% cont") fout.write("\tTaxonomy\tMarker set\t# descendants") fout.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont") fout.write("\tIM comp\tIM cont") fout.write("\tMS comp\tMS cont") fout.write("\tRIM comp\tRIM cont") fout.write("\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n") testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts) itemsProcessed = 0 while True: testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get( block=True, timeout=None ) if testGenomeId == None: break itemsProcessed += 1 statusStr = " Finished processing %d of %d (%.2f%%) test cases." % ( itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome), ) sys.stdout.write("%s\r" % statusStr) sys.stdout.flush() for markerSetId in unmodifiedComp: summaryOut.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont)) summaryOut.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId])) summaryOut.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) summaryOut.write("\t%.3f\t%.3f" % (mean(trueComps), std(trueConts))) summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId])))) summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId])))) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))) ) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))) ) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))) ) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))) ) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))) ) summaryOut.write( "\t%.3f\t%.3f" % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))) ) summaryOut.write("\n") fout.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont)) fout.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId])) fout.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId])) fout.write("\t%s" % ",".join(map(str, trueComps))) fout.write("\t%s" % ",".join(map(str, trueConts))) fout.write("\t%s" % ",".join(map(str, deltaComp[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaCont[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaCompSet[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaContSet[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaCompRefined[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaContRefined[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaCompSetRefined[markerSetId]))) fout.write("\t%s" % ",".join(map(str, deltaContSetRefined[markerSetId]))) fout.write("\t%s" % ",".join(map(str, trueComps))) fout.write("\t%s" % ",".join(map(str, trueConts))) fout.write("\n") summaryOut.close() fout.close() sys.stdout.write("\n") def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads): print "\n Reading reference genome tree." treeFile = os.path.join("/srv", "db", "checkm", "genome_tree", "genome_tree_full.refpkg", "genome_tree.tre") tree = dendropy.Tree.get_from_path(treeFile, schema="newick", as_rooted=True, preserve_underscores=True) print " Number of taxa in tree: %d" % (len(tree.leaf_nodes())) genomesInTree = set() for leaf in tree.leaf_iter(): genomesInTree.add(leaf.taxon.label.replace("IMG_", "")) # get all draft genomes for testing print "" metadata = self.img.genomeMetadata() print " Total genomes: %d" % len(metadata) genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, "status", "Finished") print " Number of draft genomes: %d" % len(genomeIdsToTest) print "" print " Pre-computing genome information for calculating marker sets:" start = time.time() self.markerSetBuilder.readLineageSpecificGenesToRemove() end = time.time() print " readLineageSpecificGenesToRemove: %.2f" % (end - start) start = time.time() # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys()) end = time.time() print " globalGeneCountTable: %.2f" % (end - start) start = time.time() # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print " precomputeGenomeSeqLens: %.2f" % (end - start) start = time.time() # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0) end = time.time() print " precomputeGenomeFamilyPositions: %.2f" % (end - start) print "" print " Evaluating %d test genomes." % len(genomeIdsToTest) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in genomeIdsToTest: workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [ mp.Process( target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue), ) for _ in range(numThreads) ] writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put( ( None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, ) ) writeProc.join()