def run(self, outputFile):
    """Tabulate gene copy number across valid IMG prokaryotic genomes.

    Writes the str() representation of the full gene count table to
    outputFile and prints summary statistics for marker gene pfam00318.

    Fixes: original used Python 2 print statements and dict.iteritems(),
    neither of which is valid Python 3; file handle is now closed via a
    context manager.
    """
    img = IMG()

    print('Identifying all IMG prokaryotic genomes with valid data.')
    metadata = img.genomeMetadata()
    genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
    genomeMissingData = img.genomesWithMissingData(genomeIds)
    genomeIds -= genomeMissingData
    print(' Identified %d valid genomes.' % (len(genomeIds)))

    print('Calculating gene copy number for each genome.')
    countTable = img.geneCountTable(genomeIds)

    # per-genome copy counts for pfam00318; Py3 dicts have no iteritems(),
    # and only the values were used, so take values() directly
    counts = list(countTable['pfam00318'].values())

    print(len(genomeIds))
    print(len(counts))
    print(mean(counts))

    # context manager guarantees the output handle is closed
    with open(outputFile, 'w') as fout:
        fout.write(str(countTable))

    print('Gene count dictionary to: ' + outputFile)
def run(self, outputFile):
    """Tabulate gene copy number across valid IMG prokaryotic genomes.

    Writes the str() representation of the full gene count table to
    outputFile and prints summary statistics for marker gene pfam00318.

    Fixes: dict.iteritems() does not exist in Python 3 (this block already
    used print() functions, so it targets Py3) — replaced with values();
    file handle is now closed via a context manager.
    """
    img = IMG()

    print('Identifying all IMG prokaryotic genomes with valid data.')
    metadata = img.genomeMetadata()
    genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
    genomeMissingData = img.genomesWithMissingData(genomeIds)
    genomeIds -= genomeMissingData
    print(' Identified %d valid genomes.' % (len(genomeIds)))

    print('Calculating gene copy number for each genome.')
    countTable = img.geneCountTable(genomeIds)

    # only the values of the per-genome count dict were used by the
    # original loop, so collect them directly
    counts = list(countTable['pfam00318'].values())

    print(len(genomeIds))
    print(len(counts))
    print(mean(counts))

    with open(outputFile, 'w') as fout:
        fout.write(str(countTable))

    print('Gene count dictionary to: ' + outputFile)
class MarkerSetSelection(object):
    """Evaluate strategies for selecting a lineage-specific marker set.

    Compares, for each test genome, the marker set chosen by ascending the
    reference tree (stability-tested, leave-one-out style) against the best
    possible marker set according to precomputed simulation results.
    """

    def __init__(self):
        # precomputed simulation summary used to score candidate marker sets
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        # taxonomic rank (index into the taxonomy list) removed during LOO
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self, genomeIds, ubiquityThreshold=0.97, singleCopyThreshold=0.97, stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        # quick escape for lineage that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using a LOO-testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold * len(looGenomeIds), singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove
            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))
        # stable if the average pairwise change is a small fraction of the
        # average marker set size
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        """Return patristic (along-branch) distance between two leaf nodes."""
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            # sum edge lengths from each taxon up to the MRCA
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
        """True if the genome is closer to the marker set node than the given
        percentile of leaf-to-node distances."""
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node

        if curNode == None:
            # reach root so use universal marker set
            uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        # NOTE(review): if simResults[genomeId] is empty the *Best locals are
        # unbound at return — presumably every genome has at least one entry;
        # confirm against the simulation file format.
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                # sentinel: no more work
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')

        itemsToProcess = 0

        # accumulators for summary statistics printed at the end
        dComps = []
        dConts = []
        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []

        selectedComp = []
        selectedCont = []

        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 / (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            # deltas between the selected marker set and the best possible one
            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)

            # max(..., 0.01) guards against division by a zero best-cont error
            dCompsPer.append(dComp * 100.0 / dCompBest)
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)

            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')

        fout.close()

        print('')
        print(' General results:')
        print(' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print(' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print(' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)))
        print(' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)))
        print('')
        print(' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print(' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print(' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)))
        print(' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
        """Sum of edge lengths from leaf up to (but excluding) ancestor."""
        dist = 0
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
        """Return (# ancestral nodes to, distance to, mean leaf distance of)
        the node bearing the best marker set."""
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            nodesToBin += 1

            if uniqueId == bestUID:
                # record distance from every other leaf under this node
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length
            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
        """Print summary statistics describing where the best marker sets
        sit in the tree relative to each test genome."""
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print(' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)))
        print(' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)))
        print(' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)))
        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print(' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf))))
        print(' Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        """Evaluate marker-set selection on all test genomes using
        numThreads worker processes."""
        # read reference tree
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print(' Reading simulation results.')
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')

                # NOTE(review): keys are composite simulation ids here but are
                # treated as genome ids by __workerThread — verify the summary
                # file's first column layout.
                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]

                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print ' Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print(' Evaluating %d test genomes.' % len(simResults))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so every process terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # sentinel tuple tells the writer to finish
        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()
class Simulation(object):
    """Infer the best lineage-specific marker set for each internal branch.

    For each internal node of the reference genome tree, marker sets built
    from all ancestral nodes are scored on simulated partial genomes drawn
    from the descendant lineage with the fewest genomes; the set with the
    lowest mean completeness + contamination error is reported.

    Fixes: original used Python 2-only syntax (print statements, xrange) in
    a file that elsewhere requires Python 3; converted to print() and range()
    with no other behavioral change.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

        # length of simulated contigs (in base pairs)
        self.simContigLen = 10000

    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node."""
        # get genomes descendant from each child of the specified internal node
        leaves = []
        for child in internalNode.child_nodes():
            genomeIds = set()
            for leaf in child.leaf_nodes():
                genomeId = leaf.taxon.label.replace('IMG_', '')
                genomeIds.add(genomeId)

                duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    dupId = dup.replace('IMG_', '')
                    genomeIds.add(dupId)

            leaves.append(genomeIds)

        # make sure each set of leaves contains at least a minimum number of genomes
        orderedLeaves = sorted(leaves, key=len)
        if len(orderedLeaves[0]) < 5:
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return

        # calculate marker genes with all genomes in lineage with the fewest genomes removed
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet=False, genomeIdsToRemove=orderedLeaves[0])

        # evaluate accuracy of completeness and contamination estimations on
        # different partial genomes from lineage with fewest genomes
        testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100))
        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)
        for testGenomeId in testGenomeIds:
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))

            repsPerGenome = 100
            for _ in range(0, repsPerGenome):
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)

                for ms in binMarkerGenes.markerSetIter():
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)

                    if completeness == 0.0:
                        # diagnostic dump: a sampled genome should always
                        # contain at least one marker
                        print(ms.getMarkerGenes())
                        print(geneDistTable[testGenomeId])
                        print(startPartialGenomeContigs)
                        print(genomeSize)
                        print('*****************' + testGenomeId)
                        sys.exit()

                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)

        # determine lineage-specific marker set with best average performance
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        for lineageStr in deltaComp:
            dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr])))
            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(abs(array(deltaComp[lineageStr])))
                dContStdBest = std(abs(array(deltaCont[lineageStr])))
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode == None:
                # sentinel: no more work
                break

            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)

    def __writerThread(self, numInternalNodes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('/tmp/simInferBestMarkerSet.tsv', 'w')
        fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

        itemsProcessed = 0
        while True:
            internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
            if internalNode == None:
                break

            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed) * 100 / (numInternalNodes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            # 'NA' marks branches skipped for having too few genomes
            if internalNode != 'NA':
                fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

        fout.close()
        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        """Score every internal branch of the reference tree in parallel."""
        random.seed(0)

        print('\n Calculating global gene count table.')
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())

        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print(' Evaluating %d internal nodes.' % len(tree.internal_nodes()))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # the root has no parent edge, so it is excluded from evaluation
        for internalNode in tree.internal_nodes():
            if internalNode.parent_node != None:
                workerQueue.put(internalNode)

        # one sentinel per worker so every process terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        metadata = self.img.genomeMetadata()
        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(tree.internal_nodes()) - 1, writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # sentinel tuple tells the writer to finish
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
class MarkerSetStability(object):
    """Measure how stable a lineage's marker set is under genome subsampling.

    For each named taxonomic group, marker sets are recomputed from random
    subsamples (50%..100% of the genomes) and compared against the full set;
    the mean/std percent change is written per subsample fraction.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                # sentinel: no more work
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                # subsample at 50%, 55%, ..., 100% of the lineage's genomes
                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        # percent of markers that changed relative to full set
                        perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Calculating stability of marker sets:')

        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        # lineages retained below for ad-hoc debugging runs
        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']

        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        # one sentinel per worker so every process terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # sentinel tuple tells the writer to finish
        writerQueue.put((None, None, None, None))
        writeProc.join()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted genomes (high completeness, low contamination).

    Builds a marker set per domain from finished genomes, scores every
    genome against it, and writes all/trusted/filtered genome tables plus a
    filtered metadata file and per-lineage statistics to outputDir.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)  # domain -> ids with 'Finished' status
    allGenomes = defaultdict(set)       # domain -> all genome ids

    metadataLine = {}  # genome id -> raw metadata line (echoed if trusted)

    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)
        allGenomes[domain].add(genomeId)

        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    # 'lineage' here is the domain key of allGenomes
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

        print(' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()))
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                # only trusted genomes are echoed to the output metadata file
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
class MarkerSetStabilityTest(object):
    """Test marker-set stability for named taxonomic groups.

    Unlike MarkerSetStability, this uses a fixed 90% subsample repeated 100
    times per lineage and reports a single mean/std percent change.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                # sentinel: no more work
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    # percent of markers that changed relative to full set
                    perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

            if perChange != []:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                # lineage skipped (too few genomes): report -1 sentinels
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Testing stability of marker sets:')
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one sentinel per worker so every process terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # sentinel tuple tells the writer to finish
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
class MarkerSetStability(object):
    """Measure marker-set stability across a sweep of genome subsample fractions.

    For each lineage the marker set is rebuilt from 50%..100% subsamples (step 5%,
    10 replicates each) and the percent change vs. the full set is recorded.
    Workers run __processLineage; a single writer process runs __storeResults.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            # None is the poison pill that terminates this worker
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                # sorted() gives random.sample a sequence (required on Python 3.11+)
                # and makes sampling deterministic under the fixed seed
                sortedGenomeIds = sorted(genomeIds)

                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(sortedGenomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(
                            subsetGenomeIds, geneCountTable,
                            ubiquityThreshold * numGenomesToSelect,
                            singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        # percent change of subsampled marker set vs. the full one
                        perChange.append(
                            float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0
                            / len(markerGenes))

                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            # tuple of Nones is the poison pill from run()
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(
                block=True, timeout=None)
            if lineage is None:
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            # \r keeps the progress report on a single console line
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (
                    lineage, numGenomes, numMarkerGenes, selectPer,
                    changeMarkerSetSize[selectPer][0],
                    changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Calculating stability of marker sets:')

        # fixed seed so subsampling is reproducible across runs
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one poison pill per worker
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers done: signal the writer to finish
        writerQueue.put((None, None, None, None))
        writeProc.join()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted genomes (high completeness, low contamination) per domain.

    Builds a marker set from finished genomes of each domain, scores every genome
    against it, and partitions genomes into trusted vs. filtered. Writes per-genome
    TSVs, a filtered metadata file, and per-lineage genome statistics.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    # identical header for the all/trusted/filtered genome tables
    genomeHeader = ('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count'
                    '\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness'
                    '\tContamination\tMissing markers\tDuplicate markers\n')

    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write(genomeHeader)

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write(genomeHeader)

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write(genomeHeader)

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)
    allGenomes = defaultdict(set)

    # keep the raw metadata line for each genome so trusted genomes can be
    # echoed verbatim into the output metadata file
    metadataLine = {}

    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)

        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(
            finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(' Marker set consists of %s marker genes organized into %d sets.' % (
            markerSet.numMarkers(), markerSet.numSets()))

        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness,
                                            contamination, missingMarkers,
                                            duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(
                    metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                # only trusted genomes are propagated to the output metadata file
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(
                    metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' % (
            len(filteredGenomes),
            len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' % (
            len(trustedGenomeIds),
            len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t'
                   + str(allStats.get(lineage, 0)) + '\t'
                   + str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
# NOTE(review): this class duplicates an earlier MarkerSetStabilityTest
# definition in this file; being later, this definition is the one that
# takes effect. Consider removing one of the two.
class MarkerSetStabilityTest(object):
    """Test stability of marker sets by rebuilding them from 90% subsamples of genomes.

    Workers run __processLineage; a single writer process runs __storeResults.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            # None is the poison pill that terminates this worker
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                # sorted() gives random.sample a sequence (required on Python 3.11+)
                # and makes sampling deterministic under the fixed seed
                sortedGenomeIds = sorted(genomeIds)

                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(sortedGenomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(
                        subsetGenomeIds, geneCountTable,
                        ubiquityThreshold * numGenomesToSelect,
                        singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    # percent change of subsampled marker set vs. the full one
                    perChange.append(
                        float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0
                        / len(markerGenes))

            # (-1, -1) marks lineages below the minGenomes cutoff
            if perChange != []:
                queueOut.put((lineage, len(genomeIds), len(markerGenes),
                              numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes),
                              numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            # tuple of Nones is the poison pill from run()
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(
                block=True, timeout=None)
            if lineage is None:
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (
                numProcessedLineages, totalLineages,
                float(numProcessedLineages) * 100 / totalLineages)
            # \r keeps the progress report on a single console line
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (
                lineage, numGenomes, numMarkerGenes, numSampledGenomes,
                meanPerChange, stdPerChange))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Testing stability of marker sets:')

        # fixed seed so subsampling is reproducible across runs
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one poison pill per worker
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers done: signal the writer to finish
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
class MarkerSetSelection(object):
    """Evaluate strategies for selecting a lineage-specific marker-set node in the genome tree.

    For each test genome, ascends the reference tree from the genome's leaf
    looking for the first ancestral node whose marker set is stable (LOO test),
    then compares that selection against the best marker set known from
    simulation results.
    """

    def __init__(self):
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        # taxonomic rank used for leave-one-out exclusion (5 = genus)
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self, genomeIds, ubiquityThreshold=0.97, singleCopyThreshold=0.97, stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        # quick escape for lineage that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using a LOO-testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(
                looGenomeIds, geneCountTable,
                ubiquityThreshold * len(looGenomeIds),
                singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove
            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        """Return the patristic (along-branch) distance between two leaves."""
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node is None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            # sum branch lengths from each leaf up to the MRCA
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
        """True if the genome is closer to the node than the given percentile of leaves."""
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode is not None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                # genomes collapsed out of the tree as duplicates still count
                # towards this node's genome set
                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node

        if curNode is None:
            # reach root so use universal marker set
            uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        # best = node minimizing combined completeness + contamination error
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            # None is the poison pill that terminates this worker
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected,
                          dCompSelected, dContSelected, bestUID,
                          numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')

        itemsToProcess = 0

        dComps = []
        dConts = []
        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []
        selectedComp = []
        selectedCont = []

        while True:
            # tuple of Nones is the poison pill from run()
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(
                block=True, timeout=None)
            if testGenomeId is None:
                break

            itemsToProcess += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' % (
                itemsToProcess, numTestGenomes,
                float(itemsToProcess) * 100 / (numTestGenomes))
            # \r keeps the progress report on a single console line
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (
                testGenomeId, uidSelected, numDescendantsSelected, dCompSelected,
                dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest,
                dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)

            # guard both denominators against zero (original guarded only dCont)
            dCompsPer.append(dComp * 100.0 / max(dCompBest, 0.01))
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)
            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print(' General results:')
        print(' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print(' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print(' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)))
        print(' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)))
        print('')
        print(' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print(' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print(' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)))
        print(' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
        """Sum branch lengths from a leaf up to (but excluding) an ancestral node."""
        dist = 0
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
        """Return (#nodes to bin, distance to bin, mean leaf distance) for the best node."""
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode is not None:
            uniqueId = curNode.label.split('|')[0]
            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length
            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
        """Print summary statistics describing the best marker-set nodes."""
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print(' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)))
        print(' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)))
        print(' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)))

        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print(' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (
            mean(abs(distanceToBin - avgDistanceToLeaf)),
            std(abs(distanceToBin - avgDistanceToLeaf))))
        print(' Percent difference to average leaf distance: %.2f +/- %.2f' % (
            mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        """Evaluate marker-set node selection for every test genome in the simulation file."""
        # read reference tree
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                'genome_tree', 'genome_tree_prok.refpkg',
                                'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick',
                                           as_rooted=True,
                                           preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print(' Reading simulation results.')
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()  # skip header

            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()

                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        # __propertiesOfBestMarkerSets(tree, simResults) is available here for
        # additional diagnostics on the best marker sets

        print(' Evaluating %d test genomes.' % len(simResults))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        # one poison pill per worker
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(target=self.__workerThread,
                       args=(tree, simResults, metadata, taxonToGenomeIds,
                             workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers done: signal the writer to finish
        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()