def __workerThread(self, ubiquityThreshold, singleCopyThreshold, minGenomes, colocatedDistThreshold, colocatedGenomeThreshold, metadata, queueIn, queueOut):
    """Process each lineage from the input queue in parallel.

    Pulls lineage names from queueIn (a None entry is the shutdown
    sentinel), builds a colocated marker set for each lineage with at
    least minGenomes genomes, and pushes
    (lineage, colocatedSets, number of genomes) tuples onto queueOut.

    Note: colocatedGenomeThreshold is accepted for interface
    compatibility but is not used by this worker.
    """
    img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
    markerSetBuilder = MarkerSetBuilder()

    while True:
        lineage = queueIn.get(block=True, timeout=None)
        if lineage is None:  # use identity check for the None sentinel
            break

        # 'Universal' is a pseudo-lineage mapped to all prokaryotes;
        # otherwise the lineage name is used directly
        taxon = 'prokaryotes' if lineage == 'Universal' else lineage
        genomeIds = img.genomeIdsByTaxonomy(taxon, metadata)

        if len(genomeIds) >= minGenomes:
            markerSet = markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold, colocatedDistThreshold)
            colocatedSets = markerSet.markerSet
        else:
            # too few genomes to derive a reliable marker set
            colocatedSets = None

        # allow results to be processed or written to file
        queueOut.put((lineage, colocatedSets, len(genomeIds)))
def __init__(self):
    """Set up helper objects and the parameter grid explored by the simulation."""
    # helpers for marker-set construction and IMG metadata access
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG(
        '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
        '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    # simulation grid: contig lengths, completeness levels, contamination levels
    self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
    self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
    self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]
def __workerThread(self, ubiquityThreshold, singleCopyThreshold, minGenomes, colocatedDistThreshold, colocatedGenomeThreshold, metadata, queueIn, queueOut):
    """Process each data item in parallel.

    Consumes lineage names from queueIn until a None sentinel is seen;
    for each lineage with at least minGenomes genomes a marker set is
    built and its colocated sets are placed on queueOut as
    (lineage, colocatedSets, number of genomes).
    NOTE(review): colocatedGenomeThreshold is never read in this body.
    """
    # per-worker helper objects (each process builds its own)
    img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
    markerSetBuilder = MarkerSetBuilder()

    while True:
        lineage = queueIn.get(block=True, timeout=None)
        if lineage == None:
            # None signals that all work items have been consumed
            break

        if lineage == 'Universal':
            # 'Universal' is treated as the full prokaryote set
            genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        else:
            genomeIds = img.genomeIdsByTaxonomy(lineage, metadata)

        if len(genomeIds) >= minGenomes:
            markerSet = markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold, colocatedDistThreshold)
            colocatedSets = markerSet.markerSet
        else:
            # insufficient genomes: report no marker set for this lineage
            colocatedSets = None

        # allow results to be processed or written to file
        queueOut.put((lineage, colocatedSets, len(genomeIds)))
def __init__(self, outputDir): self.img = IMG() self.markerSetBuilder = MarkerSetBuilder() if os.path.exists(outputDir): print '[Error] Output directory already exists: ' + outputDir sys.exit(0) else: os.makedirs(outputDir) self.__checkForHMMER() self.__checkForFastTree() self.hmmDir = os.path.join(outputDir, 'phylo_hmms') self.alignmentDir = os.path.join(outputDir, 'gene_alignments') self.geneTreeDir = os.path.join(outputDir, 'gene_trees') self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific') self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final') self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv') self.concatenatedAlignFile = os.path.join( outputDir, 'genome_tree.concatenated.faa') self.derepConcatenatedAlignFile = os.path.join( outputDir, 'genome_tree.concatenated.derep.fasta') self.treeOut = os.path.join(outputDir, 'genome_tree.tre') self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre') self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre') self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre') self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre') self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre') self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre') self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre') self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv') self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv') self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm') self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt') self.phyloUbiquity = 0.90 self.phyloSingleCopy = 0.90 self.paralogAcceptPer = 0.01 # self.consistencyAcceptPer = 0.95 # for trees at the class-level self.consistencyAcceptPer = 0.906 # for trees at the phylum-level self.consistencyMinTaxa = 20 # create output directories os.makedirs(self.hmmDir) 
os.makedirs(self.alignmentDir) os.makedirs(self.geneTreeDir) os.makedirs(self.conspecificGeneTreeDir) os.makedirs(self.finalGeneTreeDir)
def __init__(self):
    """Set up helper objects and the scaffold-simulation parameter grid."""
    # helpers for marker-set construction and IMG metadata access
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG(
        '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
        '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    # simulation grid: contig lengths, completeness levels, contamination levels
    self.contigLens = [5000, 20000, 50000]
    self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
    self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]
def __init__(self, outputDir):
    """Set up output paths and thresholds for genome tree construction.

    outputDir must not already exist; it is created along with the
    subdirectories used for HMMs, alignments, and gene trees.
    """
    self.img = IMG()
    self.markerSetBuilder = MarkerSetBuilder()

    if os.path.exists(outputDir):
        print '[Error] Output directory already exists: ' + outputDir
        # NOTE(review): exits with status 0 on an error condition —
        # probably should be a non-zero status; confirm with callers.
        sys.exit(0)
    else:
        os.makedirs(outputDir)

    # verify required external programs are available
    self.__checkForHMMER()
    self.__checkForFastTree()

    # directory layout for intermediate results
    self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
    self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
    self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
    self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
    self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

    # output files
    self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
    self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
    self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
    self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
    self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
    self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
    self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
    self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
    self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
    self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
    self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
    self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
    self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
    self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
    self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

    # marker selection thresholds
    self.phyloUbiquity = 0.90
    self.phyloSingleCopy = 0.90
    self.paralogAcceptPer = 0.01
    #self.consistencyAcceptPer = 0.95    # for trees at the class-level
    self.consistencyAcceptPer = 0.906    # for trees at the phylum-level
    self.consistencyMinTaxa = 20

    # create output directories
    os.makedirs(self.hmmDir)
    os.makedirs(self.alignmentDir)
    os.makedirs(self.geneTreeDir)
    os.makedirs(self.conspecificGeneTreeDir)
    os.makedirs(self.finalGeneTreeDir)
class DecorateTree(object):
    """Decorate internal nodes of a genome tree with statistics and marker sets.

    Each internal node is relabelled 'UID<n>|<taxonomy>|<bootstrap>' and a
    per-node metadata table (genome counts, GC/genome size/gene count
    statistics, and the node's marker set) is written to file.
    """

    def __init__(self):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
        """Return (mean, std) of a metadata category over genomeIds, skipping 'NA' values."""
        values = []
        for genomeId in genomeIds:
            # strip tree-label prefix to recover the raw IMG genome id
            genomeId = genomeId.replace('IMG_', '')
            v = metadata[genomeId][category]
            if v != 'NA':
                values.append(v)
        return mean(values), std(values)

    def __calculateMarkerSet(self, genomeLabels, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97):
        """Calculate marker set for a set of genomes."""
        # get genome IDs from genome labels
        genomeIds = set()
        for genomeLabel in genomeLabels:
            genomeIds.add(genomeLabel.replace('IMG_', ''))
        markerSet = self.markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
        """Map Pfam ids (e.g. PF00001) to full accessions (e.g. PF00001.12) from the HMM file."""
        pfamIdToPfamAcc = {}
        for line in open(img.pfamHMMs):
            if 'ACC' in line:
                acc = line.split()[1].strip()
                pfamId = acc.split('.')[0]
                pfamIdToPfamAcc[pfamId] = acc
        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut, numThreads):
        """Decorate inputTreeFile's internal nodes and write per-node metadata.

        taxaTreeFile: tree with taxonomic labels used to look up node taxonomy.
        derepFile: maps a retained taxon to the duplicate taxa it represents.
        metadataOut: path of the per-node statistics table to write.
        """
        # read genome metadata
        print ' Reading metadata.'
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print ' Read list of taxa with duplicate sequences.'
        duplicateTaxa = {}
        for line in open(derepFile):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                # first token is the retained taxon; the rest are its duplicates
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # build gene count table
        # NOTE(review): genomeIds is only used for the count printed below;
        # no gene count table is actually built here — confirm intent.
        print ' Building gene count table.'
        genomeIds = self.img.genomeMetadata().keys()
        print ' # trusted genomes = ' + str(len(genomeIds))

        # calculate statistics for each internal node using multiple threads
        print ' Calculating statistics for each internal node.'
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads):
        """Fan internal nodes out to worker processes and gather their statistics."""
        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # enqueue every internal node with a sequential unique id
        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        # one (None, None) sentinel per worker so each terminates
        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [mp.Process(target = self.__processInternalNode, args = (taxaTree, duplicateTaxa, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportStatistics, args = (metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue))

        writeProc.start()
        for p in calcProc:
            p.start()
        for p in calcProc:
            p.join()

        # sentinel tuple tells the writer to stop
        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId == None:
                break

            # find corresponding internal node in taxa tree
            # (expand each leaf label with its dereplicated duplicates)
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        labels.append(genomeId)

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels = labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique Id while retraining bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap, node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Store statistics for internal node."""
        fout = open(metadataOut, 'w')
        fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
        fout.write('\tGC mean\tGC std')
        fout.write('\tGenome size mean\tGenome size std')
        fout.write('\tGene count mean\tGene count std')
        fout.write('\tMarker set')
        fout.write('\n')

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())
        while True:
            uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(block=True, timeout=None)
            if uniqueId == None:
                break

            # progress indicator (overwrites the same console line)
            numProcessedNodes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal nodes.' % (numProcessedNodes, numInternalNodes, float(numProcessedNodes)*100/numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' + taxaStr + '\t' + bootstrap)

            # GC is stored as a fraction; report it as a percentage
            m, s = self.__meanStd(metadata, labels, 'GC %')
            fout.write('\t' + str(m*100) + '\t' + str(s*100))

            m, s = self.__meanStd(metadata, labels, 'genome size')
            fout.write('\t' + str(m) + '\t' + str(s))

            m, s = self.__meanStd(metadata, labels, 'gene count')
            fout.write('\t' + str(m) + '\t' + str(s))

            # change model names to accession numbers, and make
            # sure there is an HMM model for each PFAM
            mungedMarkerSets = []
            for geneSet in markerSet:
                s = set()
                for geneId in geneSet:
                    if 'pfam' in geneId:
                        pfamId = geneId.replace('pfam', 'PF')
                        if pfamId in pfamIdToPfamAcc:
                            s.add(pfamIdToPfamAcc[pfamId])
                    else:
                        s.add(geneId)
                mungedMarkerSets.append(s)

            fout.write('\t' + str(mungedMarkerSets))
            fout.write('\n')

            # relabel the corresponding node in the in-memory tree
            node = inputTree.find_node(filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
            node.label = nodeLabel

        sys.stdout.write('\n')
        fout.close()

        # persist the relabelled tree back to the input tree file
        inputTree.write_to_path(inputTreeFile, schema='newick', suppress_rooting=True, unquoted_underscores=True)
class SimulationScaffolds(object):
    """Simulate partial genomes by subsampling scaffolds and evaluate marker sets.

    Draft genomes are degraded to target completeness/contamination levels by
    randomly removing scaffolds (and adding scaffolds from a random second
    genome), then completeness/contamination estimates from several marker set
    variants are compared against the true values.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # simulation grid: contig lengths, completeness and contamination levels
        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __seqLens(self, seqs):
        """Calculate lengths of seqs."""
        genomeSize = 0
        seqLens = {}
        for seqId, seq in seqs.iteritems():
            seqLens[seqId] = len(seq)
            genomeSize += len(seq)

        return seqLens, genomeSize

    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel.

        For each test genome id pulled from queueIn (None is the shutdown
        sentinel), runs the full simulation grid and pushes an 18-tuple of
        results onto queueOut.
        """
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            # build marker sets for evaluating test genome
            # (the test genome itself is excluded from marker set construction)
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling
            # using both the domain and lineage-specific marker sets
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        # deltas between estimated and true comp/cont, keyed by
                        # lineage, for individual-marker (IM) and marker-set (MS)
                        # scoring, with and without refinement
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for i in xrange(0, numReplicates):
                            # generate test genome with a specific level of completeness,
                            # by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(genomeIdsToTest - set([testGenomeId]), 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs)
                            # retain (1 - percentCont) of the contaminating genome;
                            # the complement becomes the contaminating scaffolds
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize)
                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                # individual-marker scoring
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)

                                # collocated marker-set scoring
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])

                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        # summary table: mean/std of |delta| per test case
        summaryOut = open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')

        # full table: every replicate's raw delta values, gzip-compressed
        fout = gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            # progress indicator (overwrites the same console line)
            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')

                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')

        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads):
        """Run the scaffold-subsampling simulation over all qualifying draft genomes."""
        # fixed seed so scaffold/contamination sampling is reproducible
        random.seed(0)

        print '\n Reading reference genome tree.'
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print ' Number of taxa in tree: %d' % (len(tree.leaf_nodes()))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print ''
        metadata = self.img.genomeMetadata()
        print ' Total genomes: %d' % len(metadata)

        # draft genomes = genomes in the tree that are not status 'Finished'
        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print ' Number of draft genomes: %d' % len(draftGenomeIds)

        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)
        print ' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest))

        # pre-compute expensive tables once, before forking workers
        print ''
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print ' readLineageSpecificGenesToRemove: %.2f' % (end - start)

        print ' Pre-computing genome information for calculating marker sets:'
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print ' precomputeGenomeFamilyScaffolds: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print ' globalGeneCountTable: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print ' precomputeGenomeSeqLens: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print ' precomputeGenomeFamilyPositions: %.2f' % (end - start)

        print ''
        print ' Evaluating %d test genomes.' % len(genomeIdsToTest)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in list(genomeIdsToTest):
            workerQueue.put(testGenomeId)

        # one None sentinel per worker so each terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # 18-element sentinel matching the result tuple shape stops the writer
        writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None))
        writeProc.join()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted genomes (high completeness, low contamination) per lineage.

    Reads IMG metadata from inputMetadataFile, builds a marker set per
    lineage (domain) from its finished genomes, scores every genome against
    it, and writes all/trusted/filtered genome tables, a filtered metadata
    file, and lineage-level statistics under outputDir.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)
    allGenomes = defaultdict(set)

    metadataLine = {}

    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # copy the header straight through to the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)
        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.iteritems():
        print '[' + lineage + ']'
        print ' Number of genomes: %d' % len(allLineageGenomeIds)

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print '\nDetermining initial marker gene sets for genome filtering.'
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

        print ' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets())
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print '\nIdentifying highly complete, low contamination genomes.'
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                # only trusted genomes are carried into the output metadata file
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print ' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds))
        print ' ' + str(filteredStatus)
        print ' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds))
        print ' ' + str(retainedStatus)

        # determine status of retained genomes
        print '\nTrusted genomes by phylum:'
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.iteritems():
            print ' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count)
        print ''

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in xrange(0, 6):  # Domain to Genus
        for genomeId, data in metadata.iteritems():
            taxaStr = ';'.join(data['taxonomy'][0:r+1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n')
    fout.close()
class Simulation(object):
    """Simulate draft genomes by subsampling fixed-length contigs and measure how
    well lineage-specific marker sets estimate completeness and contamination.

    Results are written by a single writer process to hard-coded files under /tmp.
    """

    def __init__(self):
        # helpers for building marker sets and querying IMG metadata (paths are hard-coded to this system)
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # simulation grid: contig lengths, target completeness and contamination fractions
        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                # sentinel: no more genomes to process
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])
            #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
            print('# marker genes: ', len(binMarkerSets.getMarkerGenes()))
            print('# genes in table: ', len(geneDistTable[testGenomeId]))

            # estimate completeness of unmodified genome (baseline before any subsampling)
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                print(completeness, contamination)

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            print('genomeSize', genomeSize)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        # per-marker-set lists of (estimate - truth) deviations, keyed by lineage string;
                        # IM = individual markers, Set = collocated sets, Refined = refined marker sets
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)
                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)
                        trueComps = []
                        trueConts = []
                        numDescendants = {}

                        for _ in range(0, numReplicates):
                            # sample contigs to reach (approximately) the requested completeness/contamination
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, percentComp, percentCont, contigLen)
                            print(contigLen, trueComp, trueCont, len(startPartialGenomeContigs))
                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen)
                                # estimate treating each marker individually ...
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)
                                # ... and using collocated marker sets
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            # repeat with the refined marker sets
                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen)
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        # NOTE(review): trueComps/trueConts appear twice in this tuple (positions 9-10 and 19-20);
                        # the writer unpacks both — looks like an accidental duplication, confirm before changing.
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        # summary file: per-test-case aggregate statistics (mean/std of |estimate - truth|)
        # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w')
        summaryOut = open('/tmp/simulation.summary.testing.tsv', 'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')

        # full file: raw per-replicate deviations, gzip-compressed
        # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wb')
        fout = gzip.open('/tmp/simulation.testing.tsv.gz', 'wb')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            # NOTE(review): trueComps/trueConts are bound twice in this unpack (mirrors the duplicated
            # entries in the worker's tuple); legal Python, the later binding wins.
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)

            if testGenomeId == None:
                # sentinel from run(): all workers finished
                break

            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                # NOTE(review): header says 'True comp\tTrue cont' but this writes mean(trueComps) and
                # std(trueConts) — std of contamination under a 'True cont' column looks like a
                # copy-paste bug; mean(trueConts) was probably intended. Confirm before relying on this column.
                summaryOut.write('\t%.3f\t%.3f' % (mean(trueComps), std(trueConts)))
                # abs() here is applied to lists, so mean/std/abs are presumably numpy's elementwise
                # versions imported at file level — TODO confirm imports.
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')

                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId])))
                # trueComps/trueConts are written a second time here (also columns 9-10 above)
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')

        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
        """Run the simulation over all draft genomes in the reference tree using numThreads workers."""
        print('\n Reading reference genome tree.')
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_full.refpkg', 'genome_tree.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        print(' Number of taxa in tree: %d' % (len(tree.leaf_nodes())))

        # leaf labels are 'IMG_<genomeId>'; strip the prefix to recover genome ids
        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes for testing (i.e. everything that is not 'Finished')
        print('')
        metadata = self.img.genomeMetadata()
        print(' Total genomes: %d' % len(metadata))
        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print(' Number of draft genomes: %d' % len(genomeIdsToTest))

        print('')
        print(' Pre-computing genome information for calculating marker sets:')
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print(' readLineageSpecificGenesToRemove: %.2f' % (end - start))

        # the remaining precomputation steps are currently disabled; the timing
        # prints therefore report near-zero durations
        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print(' globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print(' precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print(' precomputeGenomeFamilyPositions: %.2f' % (end - start))

        print('')
        print(' Evaluating %d test genomes.' % len(genomeIdsToTest))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        # one None sentinel per worker so each worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()
        for p in workerProc:
            p.join()

        # 20-element sentinel matching the worker's output tuple shape
        writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None))
        writeProc.join()
class Simulation(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG() def run(self): print '\n Reading reference genome tree.' treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) # get all Finished, Trusted genomes metadata = self.img.genomeMetadata() bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria', metadata) print '# Bacteria: %d' % len(bacteriaIds) start = time.time() #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys()) end = time.time() print 'globalGeneCountTable: %.2f' % (end - start) start = time.time() #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print 'precomputeGenomeSeqLens: %.2f' % (end - start) start = time.time() #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000) end = time.time() print 'precomputeGenomeFamilyPositions: %.2f' % (end - start) #start = time.time() #test = self.img.geneDistTable(metadata.keys(), self.markerSetBuilder.globalGeneCountTable.keys(), spacingBetweenContigs=1e6) #end = time.time() #print 'geneDistTable: %.2f' % (end - start) #t = raw_input('waiting...') start = time.time() #testGenomeId = archaeaIds.pop() #testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId) #binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, 0.97, 0.97, bMarkerSet = True) markerSet = self.markerSetBuilder.buildMarkerSet(bacteriaIds, 0.97, 0.97) end = time.time() print 'buildMarkerSet: %.2f' % (end - start) print len(markerSet.markerSet) test = eval("[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 
'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), 
set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]") print len(test) for ms in markerSet.markerSet: bMatch = False for tms in test: if tms == ms: print ms print tms print '---------' bMatch = True break if not bMatch: print 'BOO!' if str(markerSet.markerSet) == "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), 
set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]": print 'Good to go!' else: print 'oh, shit!!!!'
class SimulationScaffolds(object):
    """Simulate draft genomes by randomly removing whole scaffolds (and adding scaffolds
    from a random second genome as contamination), then measure how well marker sets
    estimate completeness and contamination.

    Results are written by a single writer process to hard-coded files under /tmp.
    """

    def __init__(self):
        # helpers for building marker sets and querying IMG metadata (paths are hard-coded to this system)
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        # simulation grid; note the sampling below is scaffold-based, so contigLen
        # only labels the output rows — TODO confirm this is intentional
        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __seqLens(self, seqs):
        """Calculate lengths of seqs."""
        # returns ({seqId: length}, total length over all seqs)
        genomeSize = 0
        seqLens = {}
        for seqId, seq in seqs.iteritems():
            seqLens[seqId] = len(seq)
            genomeSize += len(seq)

        return seqLens, genomeSize

    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                # sentinel: no more genomes to process
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)

            # estimate completeness of unmodified genome (baseline before any subsampling)
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        # per-marker-set lists of (estimate - truth) deviations, keyed by lineage string;
                        # IM = individual markers, Set = collocated sets, Refined = refined marker sets
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)
                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)
                        trueComps = []
                        trueConts = []
                        numDescendants = {}

                        for i in xrange(0, numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(genomeIdsToTest - set([testGenomeId]), 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs)

                            # keep (1 - percentCont) of the contaminating genome, then use the
                            # complement as the contaminating scaffolds
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize)

                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes

                                # collect marker-gene hits on retained test scaffolds plus contaminating scaffolds
                                containedMarkerGenes= defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                # estimate treating each marker individually ...
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)

                                # ... and using collocated marker sets
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            # repeat with the refined marker sets
                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes= defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        # summary file: per-test-case aggregate statistics (mean/std of |estimate - truth|)
        summaryOut = open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')

        # full file: raw per-replicate deviations, gzip-compressed
        fout = gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wb')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)

            if testGenomeId == None:
                # sentinel from run(): all workers finished
                break

            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes*testsPerGenome, float(itemsProcessed)*100/(numTestGenomes*testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                # abs() here is applied to lists, so mean/std/abs are presumably numpy's elementwise
                # versions imported at file level — TODO confirm imports.
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')

                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')

        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads):
        """Run the scaffold-sampling simulation over draft genomes with >= minScaffolds scaffolds."""
        # fixed seed so contamination-genome selection is reproducible across runs
        random.seed(0)

        print '\n Reading reference genome tree.'
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        print ' Number of taxa in tree: %d' % (len(tree.leaf_nodes()))

        # leaf labels are 'IMG_<genomeId>'; strip the prefix to recover genome ids
        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print ''
        metadata = self.img.genomeMetadata()
        print ' Total genomes: %d' % len(metadata)

        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print ' Number of draft genomes: %d' % len(draftGenomeIds)

        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)
        print ' Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest))

        print ''
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print ' readLineageSpecificGenesToRemove: %.2f' % (end - start)

        print ' Pre-computing genome information for calculating marker sets:'
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print ' precomputeGenomeFamilyScaffolds: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print ' globalGeneCountTable: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print ' precomputeGenomeSeqLens: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print ' precomputeGenomeFamilyPositions: %.2f' % (end - start)

        print ''
        print ' Evaluating %d test genomes.' % len(genomeIdsToTest)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in list(genomeIdsToTest):
            workerQueue.put(testGenomeId)

        # one None sentinel per worker so each worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(genomeIdsToTest), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()
        for p in workerProc:
            p.join()

        # 18-element sentinel matching the worker's output tuple shape
        writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None))
        writeProc.join()
def __init__(self):
    """Set up the marker-set builder, IMG accessor, and tuning-experiment parameters."""
    # project helpers used throughout this class
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()

    # summary file produced by the tuning simulation
    self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
    # taxonomic rank used for leave-one-out evaluation (5 = genus)
    self.looRank = 5
class GenomeTreeWorkflow(object):
    """Workflow for inferring a genome tree from IMG genomes.

    Identifies phylogenetically informative marker genes, aligns them, infers
    individual gene trees, filters them for paralogs and taxonomic consistency,
    and finally infers a concatenated genome tree with FastTree.
    """

    def __init__(self, outputDir):
        """Set up helper objects, verify external tools, and define output paths.

        outputDir must not already exist; it is created along with all
        sub-directories used by the workflow.
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        # refuse to clobber a previous run
        # NOTE(review): exits with status 0 on this error path — confirm a
        # non-zero exit code is not expected by calling scripts
        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        # fail fast if required external tools are missing
        self.__checkForHMMER()
        self.__checkForFastTree()

        # output directories
        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        # output files
        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # thresholds for selecting phylogenetically informative marker genes
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95  # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""
        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] hmmfetch is not on the system path"
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""
        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] FastTree is not on the system path"
            sys.exit()

    def __genesInGenomes(self, genomeIds):
        """Map each genome to {marker id -> set of gene ids} parsed from its
        IMG PFAM and TIGRFAM annotation files.

        NOTE(review): marker ids are taken from column 8 (PFAM) and column 6
        (TIGRFAM) of the tab-separated annotation files — confirm against the
        IMG file format.
        """
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Extract the HMM for each marker gene into its own file in outputModelDir.

        PFAM models are fetched by name from the monolithic Pfam-A.hmm file
        (PFAM accessions and model names differ, so a name lookup table is
        built first); TIGRFAM models are fetched from per-model files.
        """
        # build mapping of PFAM accession (as 'pfamNNNN', version stripped) -> model name
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' + markerIdToName[markerId] + ' > ' + os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm'))
            else:
                os.system('hmmfetch /srv/whitlam/bio/db/tigrfam/13.0/' + markerId + '.HMM ' + markerId + ' > ' + os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for _, markerId in enumerate(markerGenes):
            workerQueue.put(markerId)

        # one sentinel per worker signals shutdown
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__runHmmAlign, args=(genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__reportThreads, args=(len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread.

        For each marker id pulled from queueIn: gather the marker's gene
        sequences from all genomes into a FASTA file, align them against the
        marker's HMM, and mask the alignment to match columns.
        """
        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId == None:
                break

            # on-disk model files use the 'PF' prefix for PFAM models
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Store confidence intervals (i.e., to shared memory)."""
        # consumes one token per completed marker gene and prints progress;
        # terminates on the None sentinel
        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId == None:
                break

            numProcessedGenes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes) * 100 / numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment
        # NOTE(review): 'mask' is only bound if a '#=GC RF' annotation line is
        # present; a file without one raises NameError below — confirm HMMER
        # always emits it for these alignments
        seqs = {}
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        # output masked sequences in FASTA format; keep only match ('x') columns
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.iteritems():
            fout.write('>' + seqId + '\n')
            maskedSeq = ''.join([seq[i] for i in xrange(0, len(seq)) if mask[i] == 'x'])
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""
        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        """Identify ingroup/outgroup genomes and marker genes, and prepare
        per-marker alignments for gene-tree inference.

        Returns the set of genome ids (ingroup plus a random outgroup sample).
        """
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print ''
        print 'Identifying genomes and marker genes of interest:'
        metadata = self.img.genomeMetadata()

        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)
        # alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)

        print ' Identified ingroup genomes: %d' % len(ingroupGenomeIds)
        print ' Identified outgroup genomes: %d' % len(outgroupGenomeIds)

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print ''
        print ' Selecting %d taxa from the outgroup.' % (numOutgroupTaxa)
        genomeIds = ingroupGenomeIds.union(random.sample(outgroupGenomeIds, numOutgroupTaxa))
        self.imgIdsToAceIds(genomeIds)

        print ' Identified markers: %d' % len(ingroupMarkers)

        # get mapping of marker ids to gene ids for each genome
        print ' Determine genes for genomes of interest.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print ' Fetching HMM for each marker genes.'
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print ' Aligning marker genes:'
        #***self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Build IMG id -> ACE id mapping and report how many of the given
        IMG ids have no ACE id."""
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print ' Number of genomes without an ACE id: ' + str(missing)

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Build ACE id -> IMG id mapping from the ACE/IMG correspondence file."""
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Execute the full genome-tree workflow end-to-end."""
        # identify genes suitable for phylogenetic inference
        print '--- Identifying genes suitable for phylogenetic inference ---'
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print ''
        print '--- Inferring gene trees ---'
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print ''
        print '--- Testing for paralogs in gene trees ---'
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print ''
        print '--- Testing taxonomic consistency of gene trees ---'
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print ''
        print '--- Gathering phylogenetically informative HMMs ---'
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print ''
        print '--- Inferring full genome tree ---'
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues=True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())
        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])
        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
class IdentifyGeneLossAndDuplication(object):
    """Determine lineage-specific gene loss and duplication for each internal
    node of the reference genome tree, relative to the marker genes usable at
    that node."""

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    def run(self, ubiquityThreshold, minGenomes):
        """Write missing/duplicate gene lists for every internal node with at
        least minGenomes genomes to a TSV file.

        ubiquityThreshold is passed through to missingGenes()/duplicateGenes();
        minGenomes is the minimum '# genomes' a node needs to be evaluated.
        """
        # Pre-compute gene count table
        print 'Computing gene count table.'
        start = time.time()
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print ' globalGeneCountTable: %.2f' % (end - start)

        # read selected node for defining marker set
        # NOTE(review): selectedMarkerNode is populated but never used in this
        # method — confirm whether it is dead code
        print 'Reading node defining marker set for each internal node.'
        selectedMarkerNode = {}
        for line in open('/srv/whitlam/bio/db/checkm/selected_marker_sets.tsv'):
            lineSplit = line.split('\t')
            selectedMarkerNode[lineSplit[0].strip()] = lineSplit[1].strip()

        # read duplicate taxa
        print 'Reading list of identical taxa in genome tree.'
        duplicateTaxa = {}
        for line in open('/srv/whitlam/bio/db/checkm/genome_tree/genome_tree.derep.txt'):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # read in node metadata
        print 'Reading node metadata.'
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()

        # read genome tree
        print 'Reading in genome tree.'
        treeFile = '/srv/whitlam/bio/db/checkm/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre'
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # determine lineage-specific gene loss and duplication (relative to potential marker genes used by a node)
        print 'Determining lineage-specific gene loss and duplication'
        fout = open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv', 'w')
        processed = 0
        numInternalNodes = len(tree.internal_nodes())
        for node in tree.internal_nodes():
            processed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal nodes.' % (processed, numInternalNodes, float(processed)*100/numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            # node labels have the form '<uid>|...'; keep only the unique id
            nodeId = node.label.split('|')[0]

            missingGenes = []
            duplicateGenes = []

            nodeStats = uniqueIdToLineageStatistics[nodeId]
            if nodeStats['# genomes'] >= minGenomes:
                # get marker genes defined for current node along with all parental nodes
                markerGenes = set()
                parentNode = node
                while parentNode != None:
                    parentNodeId = parentNode.label.split('|')[0]
                    stats = uniqueIdToLineageStatistics[parentNodeId]
                    # NOTE(review): eval() of the on-disk 'marker set' field —
                    # trusts the metadata file completely; a literal parser
                    # (ast.literal_eval) would be safer if the format allows
                    markerSet = MarkerSet(parentNodeId, stats['taxonomy'], stats['# genomes'], eval(stats['marker set']))
                    markerGenes = markerGenes.union(markerSet.getMarkerGenes())
                    parentNode = parentNode.parent_node

                # silly hack since PFAM ids are inconsistent between the PFAM data and IMG data
                revisedMarkerGeneIds = set()
                for mg in markerGenes:
                    if mg.startswith('PF'):
                        revisedMarkerGeneIds.add(mg[0:mg.rfind('.')].replace('PF', 'pfam'))
                    else:
                        revisedMarkerGeneIds.add(mg)

                # get all genomes below the internal node (including genomes removed as duplicates)
                genomeIds = []
                for leaf in node.leaf_nodes():
                    genomeIds.append(leaf.taxon.label.replace('IMG_', ''))

                    if leaf.taxon.label in duplicateTaxa:
                        for genomeId in duplicateTaxa[leaf.taxon.label]:
                            genomeIds.append(genomeId.replace('IMG_', ''))

                    # NOTE(review): each leaf id is appended a second time
                    # here — looks unintentional; confirm whether downstream
                    # missingGenes()/duplicateGenes() are insensitive to
                    # duplicate ids
                    genomeIds.append(leaf.taxon.label.replace('IMG_', ''))

                missingGenes = self.markerSetBuilder.missingGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
                duplicateGenes = self.markerSetBuilder.duplicateGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)

            fout.write('%s\t%s\t%s\n' % (nodeId, str(missingGenes), str(duplicateGenes)))

        sys.stdout.write('\n')
        fout.close()
class MarkerSetSelection(object):
    """Evaluate how well the lineage-specific marker set selected for a genome
    (via LOO-style ascent of the genome tree) compares to the best possible
    marker set according to simulation results."""

    def __init__(self):
        # summary file produced by the genus-level simulation tuning runs
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        # taxonomic rank index used for leave-one-out testing
        self.looRank = 5
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self, genomeIds, ubiquityThreshold=0.97, singleCopyThreshold=0.97, stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing.

        Returns True when the mean pairwise change in marker sets, relative to
        the mean marker-set size, is within stabilityThreshold.
        """
        # quick escape for lineage that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using a LOO-testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold * len(looGenomeIds), singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove

            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))

        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        """Return the patristic (along-branch) distance between two leaves."""
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            # sum edges from each taxon up to the MRCA
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
        """Return True when the genome's distance to the marker-set node is
        below the given percentile of leaf distances to that node."""
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node

        if curNode == None:
            # reach root so use universal marker set
            uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        # best = smallest combined completeness + contamination error
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')

        itemsToProcess = 0

        # accumulators for summary statistics printed at the end
        dComps = []
        dConts = []
        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []

        selectedComp = []
        selectedCont = []

        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsToProcess += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 / (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)

            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)
            # percentage errors; contamination denominator floored at 0.01 to
            # avoid division by zero
            dCompsPer.append(dComp * 100.0 / dCompBest)
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)

            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print(' General results:')
        print(' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print(' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print(' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)))
        print(' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)))
        print('')
        print(' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print(' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print(' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)))
        print(' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
        """Sum branch lengths from a leaf up to the given ancestor node."""
        dist = 0
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
        """Return (# nodes to bin, distance to bin, mean leaf distance) for the
        node with the given best marker-set UID on the genome's root path."""
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length
            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
        """Print summary statistics describing where the best marker-set nodes
        sit relative to each test genome."""
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print(' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)))
        print(' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)))
        print(' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)))

        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print(' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf))))
        print(' Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        """Evaluate marker-set selection for all simulated test genomes using
        a pool of worker processes."""
        # read reference tree
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()

        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        # NOTE(review): columns 21 and 23 are assumed to hold completeness and
        # contamination errors — confirm against the simulation summary format
        print(' Reading simulation results.')
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()

                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print ' Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print(' Evaluating %d test genomes.' % len(simResults))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        # one sentinel per worker signals shutdown
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()
class Simulation(object):
    """Simulate partial genomes to determine, for each internal branch of the
    reference genome tree, the lineage-specific marker set with the best
    average completeness/contamination accuracy."""

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

        # contig length used when sampling partial genomes
        self.simContigLen = 10000

    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node."""
        # get genomes descendant from each child of the specified internal node
        leaves = []
        for child in internalNode.child_nodes():
            genomeIds = set()
            for leaf in child.leaf_nodes():
                genomeId = leaf.taxon.label.replace('IMG_', '')
                genomeIds.add(genomeId)

                # genomes removed from the tree as duplicate sequences still count
                duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    dupId = dup.replace('IMG_', '')
                    genomeIds.add(dupId)

            leaves.append(genomeIds)

        # make sure each set of leaves contains at least a minimum number of genomes
        orderedLeaves = sorted(leaves, key=len)
        if len(orderedLeaves[0]) < 5:
            # sentinel tuple understood by __writerThread
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return

        # calculate marker genes with all genomes in lineage with the fewest genomes removed
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = orderedLeaves[0])

        # evaluate accuracy of completeness and contamination estimations on different partial genomes from lineage with fewest genomes
        testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100))
        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)
        for testGenomeId in testGenomeIds:
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))

            repsPerGenome = 100
            for _ in xrange(0, repsPerGenome):
                # draw a random target completeness/contamination and sample a
                # partial genome accordingly
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)

                for ms in binMarkerGenes.markerSetIter():
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)

                    # debugging aid: a zero completeness estimate is treated as
                    # an internal error and aborts the run with diagnostics
                    if completeness == 0.0:
                        print ms.getMarkerGenes()
                        print geneDistTable[testGenomeId]
                        print startPartialGenomeContigs
                        print genomeSize
                        print '*****************' + testGenomeId
                        sys.exit()

                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)

        # determine lineage-specific marker set with best average performance
        # NOTE(review): dCompStdBest/dContStdBest are only bound inside the
        # if-branch; if deltaComp were empty the queueOut.put below would raise
        # NameError — confirm deltaComp is always non-empty here
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        for lineageStr in deltaComp:
            dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr])))

            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(abs(array(deltaComp[lineageStr])))
                dContStdBest = std(abs(array(deltaCont[lineageStr])))
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode == None:
                break

            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)

    def __writerThread(self, numInternalNodes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('/tmp/simInferBestMarkerSet.tsv', 'w')
        fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

        itemsProcessed = 0
        while True:
            internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
            if internalNode == None:
                break

            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            # 'NA' marks branches skipped for having too few genomes
            if internalNode != 'NA':
                fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        """Evaluate every internal branch of the reference tree in parallel.

        Results are written to /tmp/simInferBestMarkerSet.tsv by the writer
        process. The random seed is fixed for reproducibility.
        """
        random.seed(0)

        print '\n Calculating global gene count table.'
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())

        print '\n Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print ' Evaluating %d internal nodes.' % len(tree.internal_nodes())

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # the root has no parent edge, so it is excluded
        for internalNode in tree.internal_nodes():
            if internalNode.parent_node != None:
                workerQueue.put(internalNode)

        # one sentinel per worker signals shutdown
        for _ in range(numThreads):
            workerQueue.put(None)

        metadata = self.img.genomeMetadata()

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(tree.internal_nodes())-1, writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
class GenomeTreeWorkflow(object):
    """End-to-end workflow for inferring a genome tree from marker genes.

    Identifies phylogenetically informative marker genes, aligns them,
    infers per-gene trees, filters them for paralogs and taxonomic
    consistency, and finally infers a concatenated genome tree.
    """

    def __init__(self, outputDir):
        # NOTE(review): IMG() is constructed here without explicit file paths,
        # unlike other classes in this file — presumably IMG has defaults; verify.
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        # refuse to clobber an existing output directory
        if os.path.exists(outputDir):
            print '[Error] Output directory already exists: ' + outputDir
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        # fail early if required external binaries are missing
        self.__checkForHMMER()
        self.__checkForFastTree()

        # working directories
        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        # output files
        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # workflow thresholds
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        #self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906    # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""
        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] hmmfetch is not on the system path"
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""
        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] FastTree is not on the system path"
            sys.exit()

    def __genesInGenomes(self, genomeIds):
        """Map each genome to {marker id -> set of gene ids} from IMG Pfam/TIGRFAM hit files."""
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            # column 8 of the Pfam hit file holds the marker (family) id,
            # column 0 the gene id
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            # column 6 of the TIGRFAM hit file holds the marker id
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Extract the HMM for each marker gene into its own file via hmmfetch."""
        # build pfamXXXXX -> HMM model name lookup by scanning the Pfam-A flat file
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                # strip the version suffix (e.g. '.12')
                markerId = markerId[0:markerId.rfind('.')]
                # NOTE(review): assumes a NAME line always precedes its ACC line;
                # otherwise 'name' would be stale/unbound — confirm file format
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' + markerIdToName[markerId] + ' > ' + os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm'))
            else:
                os.system('hmmfetch /srv/whitlam/bio/db/tigrfam/13.0/' + markerId + '.HMM ' + markerId + ' > ' + os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for _, markerId in enumerate(markerGenes):
            workerQueue.put(markerId)

        # one poison pill per worker
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target = self.__runHmmAlign, args = (genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportThreads, args = (len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId == None:
                break

            # on-disk model files use the 'PF' prefix for Pfam families
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            # gather all sequences for this marker across the genomes
            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue

                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            # align to the HMM and mask to match-state columns
            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Store confidence intervals (i.e., to shared memory)."""
        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId == None:
                break

            numProcessedGenes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes)*100/numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment
        seqs = {}
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                # the '#=GC RF' annotation line defines which columns are
                # match states ('x') and is used below as the mask
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        # output masked sequences in FASTA format
        # NOTE(review): 'mask' is unbound if the input lacks a GC RF line —
        # assumed always present in HMMER Stockholm output
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.iteritems():
            fout.write('>' + seqId + '\n')
            maskedSeq = ''.join([seq[i] for i in xrange(0, len(seq)) if mask[i] == 'x'])
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""
        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        """Select ingroup/outgroup genomes, fetch marker HMMs, and align marker genes.

        Returns the full set of genome ids (ingroup plus sampled outgroup).
        """
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print ''
        print 'Identifying genomes and marker genes of interest:'
        metadata = self.img.genomeMetadata()

        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)
        #alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)

        print ' Identified ingroup genomes: %d' % len(ingroupGenomeIds)
        print ' Identified outgroup genomes: %d' % len(outgroupGenomeIds)

        # randomly sub-sample the outgroup down to the requested size
        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print ''
        print ' Selecting %d taxa from the outgroup.' % (numOutgroupTaxa)
        genomeIds = ingroupGenomeIds.union(random.sample(outgroupGenomeIds, numOutgroupTaxa))

        # called for its report of genomes lacking an ACE id (return ignored)
        self.imgIdsToAceIds(genomeIds)

        print ' Identified markers: %d' % len(ingroupMarkers)

        # get mapping of marker ids to gene ids for each genome
        print ' Determine genes for genomes of interest.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print ' Fetching HMM for each marker genes.'
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print ' Aligning marker genes:'
        self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Return a dict mapping IMG genome ids to ACE ids; reports how many are missing."""
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print ' Number of genomes without an ACE id: ' + str(missing)

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Return a dict mapping ACE ids to IMG genome ids (inverse of imgIdsToAceIds)."""
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Execute the full workflow: gene selection, gene trees, filtering, genome tree."""
        # identify genes suitable for phylogenetic inference
        print '--- Identifying genes suitable for phylogenetic inference ---'
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print ''
        print '--- Inferring gene trees ---'
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print ''
        print '--- Testing for paralogs in gene trees ---'
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print ''
        print '--- Testing taxonomic consistency of gene trees ---'
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print ''
        print '--- Gathering phylogenetically informative HMMs ---'
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print ''
        print '--- Inferring full genome tree ---'
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues = True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())
        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])
        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG()
class Simulation(object):
    """Regression check for MarkerSetBuilder.buildMarkerSet().

    Rebuilds the bacterial marker set and compares it against a previously
    validated result.  The original implementation stored the expected result
    twice as giant string literals — once passed to eval() and once compared
    via str(markerSet.markerSet) == "<repr>", which is fragile (set iteration
    order and repr formatting are not stable, and the Python 2 set repr can
    never match under Python 3).  The expected data is now stored once as
    literal sets and compared structurally.
    """

    # Expected colocated marker-gene sets for all 'Bacteria' genomes at
    # ubiquity/single-copy thresholds of 0.97 (order matters for the final
    # list comparison, matching the original string comparison semantics).
    EXPECTED_MARKER_SETS = [
        {'TIGR01080', 'pfam08071', 'pfam00900'},
        {'pfam08069', 'pfam00312'},
        {'pfam00276', 'pfam00573', 'pfam00297'},
        {'pfam00333', 'pfam00327', 'pfam03719'},
        {'pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565',
         'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561',
         'TIGR02389'},
        {'pfam00831', 'pfam01868', 'pfam00189', 'pfam00237'},
        {'pfam01280', 'pfam00861', 'pfam01655'},
        {'pfam01194', 'pfam00380', 'pfam00572'},
        {'TIGR00425', 'pfam08068'},
        {'pfam01157', 'pfam03874'},
        {'pfam00416', 'TIGR01018'},
        {'pfam00679', 'pfam03764'},
        {'TIGR00344', 'TIGR03683'},
        {'pfam01193', 'pfam01000'},
        {'pfam00750', 'pfam05746'},
        {'pfam00935', 'pfam01667'},
        {'pfam00867', 'pfam00752'},
        {'pfam01172', 'pfam09377'},
        {'pfam03950', 'pfam00749'},
        {'pfam00181', 'pfam03947'},
        {'pfam00687', 'pfam00466'},
        {'TIGR03679', 'TIGR00289'},
        {'pfam01198', 'pfam01912'},
        {'pfam00673', 'pfam00281'},
        {'TIGR00134'}, {'pfam00410'}, {'pfam00411'}, {'pfam01090'}, {'pfam01092'},
        {'pfam04919'}, {'TIGR00336'}, {'pfam01864'}, {'TIGR00442'}, {'pfam01866'},
        {'pfam01780'}, {'TIGR01046'}, {'pfam00318'}, {'pfam00252'}, {'pfam09173'},
        {'pfam00238'}, {'pfam01798'}, {'pfam01246'}, {'pfam07541'}, {'pfam00736'},
        {'TIGR00522'}, {'pfam01269'}, {'TIGR00329'}, {'pfam01015'}, {'TIGR00392'},
        {'pfam00203'}, {'TIGR00398'}, {'pfam01725'}, {'pfam02005'}, {'TIGR00422'},
        {'pfam03439'}, {'pfam01351'}, {'pfam01922'}, {'pfam11987'}, {'pfam04127'},
        {'TIGR00064'}, {'TIGR00389'}, {'pfam13656'}, {'pfam00298'}, {'TIGR00432'},
        {'TIGR03677'}, {'pfam00958'}, {'pfam05221'}, {'pfam00347'}, {'TIGR03685'},
        {'pfam03876'}, {'pfam01192'}, {'pfam01984'}, {'pfam00827'}, {'pfam01982'},
        {'pfam01981'}, {'TIGR00408'}, {'TIGR00270'}, {'TIGR03665'}, {'pfam02978'},
        {'pfam03484'}, {'pfam01201'}, {'TIGR02076'}, {'pfam00832'}, {'pfam00833'},
        {'TIGR00419'}, {'pfam00177'}, {'pfam06418'}, {'TIGR00057'}, {'TIGR00549'},
        {'pfam13685'}, {'pfam05670'}, {'pfam01849'}, {'TIGR02338'}, {'TIGR00468'},
        {'pfam09249'}, {'pfam01287'}, {'pfam00164'}, {'pfam01282'}, {'TIGR03724'},
        {'pfam01200'}, {'TIGR02153'}, {'TIGR00670'}, {'pfam00398'}, {'TIGR01213'},
        {'pfam06026'}, {'pfam04019'}, {'pfam04010'}, {'pfam00366'},
    ]

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def run(self):
        """Rebuild the bacterial marker set and report whether it matches the expected result."""
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        # tree is currently only needed by the disabled experiments below
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all Finished, Trusted genomes
        metadata = self.img.genomeMetadata()
        bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria', metadata)
        print('# Bacteria: %d' % len(bacteriaIds))

        # optional pre-computation steps are disabled; the timing scaffolding
        # is retained so they can be re-enabled for profiling
        start = time.time()
        #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print('globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print('precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000)
        end = time.time()
        print('precomputeGenomeFamilyPositions: %.2f' % (end - start))

        start = time.time()
        markerSet = self.markerSetBuilder.buildMarkerSet(bacteriaIds, 0.97, 0.97)
        end = time.time()
        print('buildMarkerSet: %.2f' % (end - start))

        print(len(markerSet.markerSet))

        expected = self.EXPECTED_MARKER_SETS
        print(len(expected))

        # report each computed set; flag any set with no expected counterpart
        for ms in markerSet.markerSet:
            if ms in expected:
                print(ms)
                print('---------')
            else:
                print('BOO!')

        # structural comparison: same sets in the same order, independent of
        # per-set element ordering and repr formatting
        if markerSet.markerSet == expected:
            print('Good to go!')
        else:
            print('oh, shit!!!!')
def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG() self.simContigLen = 10000
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Filter IMG genomes down to a trusted set of high-quality genomes.

    For each domain, builds a marker set from its Finished genomes, scores
    every genome against that set, and writes genomes passing the
    completeness/contamination thresholds (plus per-lineage statistics)
    to tab-separated files in outputDir and to outputMetadataFile.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    # per-genome report files: all genomes, trusted only, filtered only
    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    # metadata file restricted to trusted genomes
    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)
    allGenomes = defaultdict(set)
    metadataLine = {}
    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # copy the header through to the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        # only Finished genomes seed the marker sets used for filtering
        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)

        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum (taxonomy rank 1)
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(
            finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(
            ' Marker set consists of %s marker genes organized into %d sets.'
            % (markerSet.numMarkers(), markerSet.numSets()))
        fout = open(
            os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'),
            'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness,
                                            contamination, missingMarkers,
                                            duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                # trusted: record, report, and pass its metadata through
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)

                retainedStatus[metadata[genomeId]
                               ['status']] = retainedStatus.get(
                                   metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]
                               ['status']] = filteredStatus.get(
                                   metadata[genomeId]['status'], 0) + 1
                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' %
              (len(filteredGenomes),
               len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' %
              (len(trustedGenomeIds),
               len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' %
                  (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' +
                   str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
class Simulation(object):
    """Simulate draft genomes (by sub-sampling contigs) and measure how well
    lineage-specific vs. domain marker sets estimate completeness/contamination.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG(
            "/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv"
        )

        # simulation grid: contig lengths (bp), completeness and contamination fractions
        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                # poison pill: no more genomes to evaluate
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label("IMG_" + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
                tree,
                testNode.parent_node,
                ubiquityThreshold,
                singleCopyThreshold,
                bMarkerSet=True,
                genomeIdsToRemove=[testGenomeId],
            )
            #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable(
                [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0
            )
            print "# marker genes: ", len(binMarkerSets.getMarkerGenes())
            print "# genes in table: ", len(geneDistTable[testGenomeId])

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination
                print completeness, contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + ".fna"))
            print "genomeSize", genomeSize

            # exhaustive sweep over the simulation grid
            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        # per-replicate signed errors, keyed by lineage;
                        # IM = individual markers, MS = marker sets,
                        # *Refined = refined marker sets
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for _ in xrange(0, numReplicates):
                            # randomly sample contigs to create a draft genome with
                            # known completeness and contamination
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(
                                genomeSize, percentComp, percentCont, contigLen
                            )
                            print contigLen, trueComp, trueCont, len(startPartialGenomeContigs)
                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(
                                    ms.getMarkerGenes(),
                                    geneDistTable[testGenomeId],
                                    startPartialGenomeContigs,
                                    contigLen,
                                )
                                # score treating markers individually...
                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=True
                                )
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)

                                # ...and as colocated marker sets
                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=False
                                )
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            # repeat with the refined marker sets
                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(
                                    ms.getMarkerGenes(),
                                    geneDistTable[testGenomeId],
                                    startPartialGenomeContigs,
                                    contigLen,
                                )
                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=True
                                )
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=False
                                )
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ";".join(metadata[testGenomeId]["taxonomy"])
                        # NOTE(review): trueComps/trueConts appear twice in this
                        # tuple (positions 9-10 and 19-20) — mirrored by the
                        # duplicate names in the writer's unpack; likely one pair
                        # was meant to be something else
                        queueOut.put(
                            (
                                testGenomeId,
                                contigLen,
                                percentComp,
                                percentCont,
                                taxonomy,
                                numDescendants,
                                unmodifiedComp,
                                unmodifiedCont,
                                trueComps,
                                trueConts,
                                deltaComp,
                                deltaCont,
                                deltaCompSet,
                                deltaContSet,
                                deltaCompRefined,
                                deltaContRefined,
                                deltaCompSetRefined,
                                deltaContSetRefined,
                                trueComps,
                                trueConts,
                            )
                        )

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        # summary file: means/standard deviations per test case
        # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w')
        summaryOut = open("/tmp/simulation.summary.testing.tsv", "w")
        summaryOut.write("Genome Id\tContig len\t% comp\t% cont")
        summaryOut.write("\tTaxonomy\tMarker set\t# descendants")
        summaryOut.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont")
        summaryOut.write("\tIM comp\tIM comp std\tIM cont\tIM cont std")
        summaryOut.write("\tMS comp\tMS comp std\tMS cont\tMS cont std")
        summaryOut.write("\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std")
        summaryOut.write("\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n")

        # full per-replicate results, gzip compressed
        # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wb')
        fout = gzip.open("/tmp/simulation.testing.tsv.gz", "wb")
        fout.write("Genome Id\tContig len\t% comp\t% cont")
        fout.write("\tTaxonomy\tMarker set\t# descendants")
        fout.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont")
        fout.write("\tIM comp\tIM cont")
        fout.write("\tMS comp\tMS cont")
        fout.write("\tRIM comp\tRIM cont")
        fout.write("\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n")

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            # NOTE(review): trueComps/trueConts are listed twice in this unpack
            # (legal in Python — the later binding wins); see matching comment
            # in __workerThread
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(
                block=True, timeout=None
            )

            if testGenomeId == None:
                break

            itemsProcessed += 1
            statusStr = " Finished processing %d of %d (%.2f%%) test cases." % (
                itemsProcessed,
                numTestGenomes * testsPerGenome,
                float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome),
            )
            sys.stdout.write("%s\r" % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont))
                summaryOut.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId]))
                summaryOut.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                # NOTE(review): mean of trueComps but std of trueConts is written
                # into the 'True comp'/'True cont' columns — looks like a bug;
                # confirm the intended statistics
                summaryOut.write("\t%.3f\t%.3f" % (mean(trueComps), std(trueConts)))
                summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId])))
                )
                summaryOut.write("\n")

                # full per-replicate values as comma-separated lists
                fout.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont))
                fout.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId]))
                fout.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write("\t%s" % ",".join(map(str, trueComps)))
                fout.write("\t%s" % ",".join(map(str, trueConts)))
                fout.write("\t%s" % ",".join(map(str, deltaComp[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCont[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompSet[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContSet[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContSetRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, trueComps)))
                fout.write("\t%s" % ",".join(map(str, trueConts)))
                fout.write("\n")

        summaryOut.close()
        fout.close()

        sys.stdout.write("\n")

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
        """Evaluate marker-set accuracy on simulated draft genomes.

        NOTE(review): this method is truncated in the visible portion of the
        file; it continues beyond this chunk.
        """
        print "\n Reading reference genome tree."
        treeFile = os.path.join("/srv", "db", "checkm", "genome_tree", "genome_tree_full.refpkg", "genome_tree.tre")
        tree = dendropy.Tree.get_from_path(treeFile, schema="newick", as_rooted=True, preserve_underscores=True)
        print " Number of taxa in tree: %d" % (len(tree.leaf_nodes()))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace("IMG_", ""))

        # get all draft genomes for testing
        print ""
        metadata = self.img.genomeMetadata()
        print " Total genomes: %d" % len(metadata)

        # draft genomes = genomes in the tree that are not 'Finished'
        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, "status", "Finished")
        print " Number of draft genomes: %d" % len(genomeIdsToTest)

        print ""
        print " Pre-computing genome information for calculating marker sets:"
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print " readLineageSpecificGenesToRemove: %.2f" % (end - start)

        # the remaining pre-computation steps are currently disabled; the
        # timing scaffolding is retained for profiling
        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print " globalGeneCountTable: %.2f" % (end - start)

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print " precomputeGenomeSeqLens: %.2f" % (end - start)

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print " precomputeGenomeFamilyPositions: %.2f" % (end - start)

        print ""
        print " Evaluating %d test genomes." 
% len(genomeIdsToTest) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in genomeIdsToTest: workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [ mp.Process( target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue), ) for _ in range(numThreads) ] writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put( ( None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, ) ) writeProc.join()
class MarkerSetSelection(object): def __init__(self): self.simFile = './experiments/simulation.tuning.genus.summary.tsv' self.looRank = 5 self.markerSetBuilder = MarkerSetBuilder() self.img = IMG() def __stabilityTest(self, genomeIds, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97, stabilityThreshold = 0.05): """Test stability of marker set for a group of genomes using LOO-testing.""" # quick escape for lineage that are clearly stable if len(genomeIds) > 200: return True # calculate marker sets using a LOO-testing looMarkerGenes = [] for genomeId in genomeIds: looGenomeIds = genomeIds.difference([genomeId]) # calculate marker genes geneCountTable = self.img.geneCountTable(looGenomeIds) markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold*len(looGenomeIds), singleCopyThreshold*len(looGenomeIds)) tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes) markerGenes = markerGenes - tigrToRemove looMarkerGenes.append(markerGenes) # calculate change in marker set for all pairs markerSetSize = [] diffMarkerSet = [] for i in xrange(0, len(looMarkerGenes)): markerSetSize.append(len(looMarkerGenes[i])) for j in xrange(i+1, len(looMarkerGenes)): symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j]) diffMarkerSet.append(len(symmDiff)) print len(genomeIds), mean(diffMarkerSet), mean(markerSetSize) return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold def __patristicDist(self, tree, taxa1, taxa2): mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label]) if mrca.parent_node == None: # MRCA is the root of the tree return taxa1.distance_from_root() + taxa2.distance_from_root() else: dist = taxa1.edge_length parentNode = taxa1.parent_node while parentNode != mrca: dist += parentNode.edge_length parentNode = parentNode.parent_node dist += taxa2.edge_length parentNode = taxa2.parent_node while parentNode != mrca: dist += parentNode.edge_length parentNode = 
parentNode.parent_node return dist def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest): distToBin = self.__distanceToAncestor(genomeNode, markerSetNode) distToLeaves = [] for leaf in leaves: distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode)) return distToBin < percentile(distToLeaves, percentileTest) def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds): """Determine lineage-specific marker set to use for assessing the giving genome.""" # read genomes removed from tree as a result of duplicate sequences duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs() # determine location of genome in tree node = tree.find_node_with_taxon_label('IMG_' + genomeId) # ascend tree to root looking for suitable marker set curNode = node.parent_node while curNode != None: uniqueId = curNode.label.split('|')[0] genomeIds = set() for leaf in curNode.leaf_nodes(): genomeIds.add(leaf.taxon.label.replace('IMG_', '')) duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, []) for dup in duplicateGenomes: genomeIds.add(dup.replace('IMG_', '')) # remove genome (LOO-style analysis) print 'Full:', len(genomeIds) genomeIds.difference_update([genomeId]) print 'LOO:', len(genomeIds) # remove all genomes from the same taxonomic group as the genome of interest taxon = metadata[genomeId]['taxonomy'][self.looRank] genomeIds.difference_update(taxonToGenomeIds[taxon]) print 'Rank reduced:', len(genomeIds) print uniqueId if len(genomeIds) > 10 and self.__stabilityTest(genomeIds): uidSelected = uniqueId break curNode = curNode.parent_node if curNode == None: # reach root so use universal marker set uidSelected = uniqueId return uidSelected def __bestMarkerSet(self, genomeId, simResults): """Get stats for best marker set.""" curBest = 1000 bestUID = None for uid, results in simResults[genomeId].iteritems(): numDescendants, dComp, dCont = results if (dComp + dCont) < curBest: numDescendantsBest = numDescendants dCompBest = dComp 
dContBest = dCont bestUID = uid curBest = dComp + dCont return bestUID, numDescendantsBest, dCompBest, dContBest def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut): """Process each data item in parallel.""" while True: testGenomeId = queueIn.get(block=True, timeout=None) if testGenomeId == None: break uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds) numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected] # find best marker set bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults) queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest)) def __writerThread(self, numTestGenomes, writerQueue): """Store or write results of worker threads in a single thread.""" fout = open('./experiments/markerSetSelection.tsv', 'w') fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n') itemsToProcess = 0 dComps = [] dConts = [] dCompsPer = [] dContsPer = [] bestComp = [] bestCont = [] selectedComp = [] selectedCont = [] while True: testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None) if testGenomeId == None: break itemsToProcess += 1 statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' 
% (itemsToProcess, numTestGenomes, float(itemsToProcess)*100/(numTestGenomes)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() dComp = abs(dCompSelected - dCompBest) dCont = abs(dContSelected - dContBest) dDescendants = abs(numDescendantsSelected - numDescendantsBest) fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont)) dComps.append(dComp) dConts.append(dCont) dCompsPer.append(dComp*100.0 / dCompBest) dContsPer.append(dCont*100.0 / max(dContBest, 0.01)) bestComp.append(dCompBest) bestCont.append(dContBest) selectedComp.append(dCompSelected) selectedCont.append(dContSelected) sys.stdout.write('\n') fout.close() print '' print ' General results:' print ' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)) print ' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)) print ' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)) print ' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)) print '' print ' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)) print ' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)) print ' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)) print ' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)) def __distanceToAncestor(self, leaf, ancestor): dist = 0 curNode = leaf while curNode != ancestor: dist += curNode.edge_length curNode = curNode.parent_node return dist def __bestNodeProperties(self, genomeId, tree, bestUID): # determine location of genome in tree node = tree.find_node_with_taxon_label('IMG_' + genomeId) # find node of best marker set curNode = node.parent_node nodesToBin = 0 distanceToBin = node.edge_length distanceToLeaves = [] while curNode != None: uniqueId = curNode.label.split('|')[0] nodesToBin += 1 if uniqueId == bestUID: 
for leaf in curNode.leaf_nodes(): if leaf != node: dist = self.__distanceToAncestor(leaf, curNode) distanceToLeaves.append(dist) break distanceToBin += curNode.edge_length curNode = curNode.parent_node return nodesToBin, distanceToBin, mean(distanceToLeaves) def __propertiesOfBestMarkerSets(self, tree, simResults): numDescendants = [] nodesToBin = [] distanceToBin = [] avgDistanceToLeaf = [] percDiffs = [] for genomeId in simResults: bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults) nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID) numDescendants.append(numDescendantsBest) nodesToBin.append(nodesToBinBest) distanceToBin.append(distanceToBinBest) avgDistanceToLeaf.append(avgDistanceToLeafBest) percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest percDiffs.append(percDiff) print ' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)) print ' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)) print ' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)) distanceToBin = array(distanceToBin) avgDistanceToLeaf = array(avgDistanceToLeaf) print ' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf))) print ' Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs)) print '' def run(self, numThreads): # read reference tree print '\n Reading reference genome tree.' 
treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) # get all genomes with a given taxon label metadata = self.img.genomeMetadata() taxonToGenomeIds = defaultdict(set) for genomeId in metadata: for t in metadata[genomeId]['taxonomy']: taxonToGenomeIds[t].add(genomeId) # read simulation results print ' Reading simulation results.' simResults = defaultdict(dict) with open(self.simFile) as f: f.readline() for line in f: lineSplit = line.split('\t') simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3] uid = lineSplit[5].split('|')[0].strip() numDescendants = int(lineSplit[6]) comp = float(lineSplit[21]) cont = float(lineSplit[23]) simResults[simId][uid] = [numDescendants, comp, cont] #print '' #print ' Properties of best marker sets:' #self.__propertiesOfBestMarkerSets(tree, simResults) print ' Evaluating %d test genomes.' % len(simResults) workerQueue = mp.Queue() writerQueue = mp.Queue() for testGenomeId in simResults: workerQueue.put(testGenomeId) for _ in range(numThreads): workerQueue.put(None) workerProc = [mp.Process(target = self.__workerThread, args = (tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)] writeProc = mp.Process(target = self.__writerThread, args = (len(simResults), writerQueue)) writeProc.start() for p in workerProc: p.start() for p in workerProc: p.join() writerQueue.put((None, None, None, None, None, None, None, None, None)) writeProc.join()