def run(self, outputFile): img = IMG() print 'Identifying all IMG prokaryotic genomes with valid data.' metadata = img.genomeMetadata() genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata) genomeMissingData = img.genomesWithMissingData(genomeIds) genomeIds -= genomeMissingData print ' Identified %d valid genomes.' % (len(genomeIds)) print 'Calculating gene copy number for each genome.' countTable = img.geneCountTable(genomeIds) counts = [] for _, count in countTable['pfam00318'].iteritems(): counts.append(count) print len(genomeIds) print len(counts) print mean(counts) fout = open(outputFile, 'w') fout.write(str(countTable)) fout.close() print 'Gene count dictionary to: ' + outputFile
def run(self): img = IMG() fout = open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1) # get list of all marker genes markerset = MarkerSet() pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes() print 'PFAM marker genes: ' + str(len(tigrMarkers)) print 'TIGR marker genes: ' + str(len(pfamMarkers)) print '' # run HMMs on each of the finished genomes genomeIds = img.genomeIds('Finished') for genomeId in genomeIds: print genomeId + ':' fout.write(genomeId + ':\n') self.runPFAM(genomeId) self.runTIGRFAM(genomeId) fout.write(' ORF results:\n') self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout) #self.translateSixFrames(genomeId) #self.runPFAM_SixFrames(genomeId) #self.runTIGRFAM_SixFrames(genomeId) #fout.write(' Six-frame translation results:\n') #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout) fout.close()
def run(self):
    """Run PFAM and TIGRFAM HMMs over all finished IMG genomes and compare
    the hits against the calculated marker gene sets."""
    img = IMG()

    # buffering=1 keeps the progress log line-buffered during long HMM runs
    fout = open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1)

    # get list of all marker genes
    markerset = MarkerSet()
    pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()
    # fixed: the two counts were swapped between the PFAM and TIGR labels
    print('PFAM marker genes: ' + str(len(pfamMarkers)))
    print('TIGR marker genes: ' + str(len(tigrMarkers)))
    print('')

    # run HMMs on each of the finished genomes
    genomeIds = img.genomeIds('Finished')
    for genomeId in genomeIds:
        print(genomeId + ':')
        fout.write(genomeId + ':\n')

        self.runPFAM(genomeId)
        self.runTIGRFAM(genomeId)

        fout.write(' ORF results:\n')
        self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

        #self.translateSixFrames(genomeId)
        #self.runPFAM_SixFrames(genomeId)
        #self.runTIGRFAM_SixFrames(genomeId)
        #fout.write(' Six-frame translation results:\n')
        #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)

    fout.close()
def __init__(self, outputDir): self.img = IMG() self.markerSetBuilder = MarkerSetBuilder() if os.path.exists(outputDir): print '[Error] Output directory already exists: ' + outputDir sys.exit(0) else: os.makedirs(outputDir) self.__checkForHMMER() self.__checkForFastTree() self.hmmDir = os.path.join(outputDir, 'phylo_hmms') self.alignmentDir = os.path.join(outputDir, 'gene_alignments') self.geneTreeDir = os.path.join(outputDir, 'gene_trees') self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific') self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final') self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv') self.concatenatedAlignFile = os.path.join( outputDir, 'genome_tree.concatenated.faa') self.derepConcatenatedAlignFile = os.path.join( outputDir, 'genome_tree.concatenated.derep.fasta') self.treeOut = os.path.join(outputDir, 'genome_tree.tre') self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre') self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre') self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre') self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre') self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre') self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre') self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre') self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv') self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv') self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm') self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt') self.phyloUbiquity = 0.90 self.phyloSingleCopy = 0.90 self.paralogAcceptPer = 0.01 # self.consistencyAcceptPer = 0.95 # for trees at the class-level self.consistencyAcceptPer = 0.906 # for trees at the phylum-level self.consistencyMinTaxa = 20 # create output directories os.makedirs(self.hmmDir) 
os.makedirs(self.alignmentDir) os.makedirs(self.geneTreeDir) os.makedirs(self.conspecificGeneTreeDir) os.makedirs(self.finalGeneTreeDir)
class ClassTree(object):
    """Relabel the final genome tree for viewing at the class level."""

    def __init__(self):
        self.img = IMG()

    def run(self):
        # load the final reference genome tree
        tree = dendropy.Tree.get_from_path('../data/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre', schema='newick', as_rooted=True, preserve_underscores=True)

        metadata = self.img.genomeMetadata()

        # relabel taxa: prefix each leaf with the class of its genome
        for leaf in tree.leaf_nodes():
            imgId = leaf.taxon.label.replace('IMG_', '')
            leaf.taxon.label = metadata[imgId]['taxonomy'][2] + '_' + imgId

        # relabel internal nodes: keep the UID and branch length, and append
        # the bootstrap support (rescaled to a rounded percentage) when present
        for internal in tree.internal_nodes():
            uid, taxaStr, bootstrap = internal.label.split('|')
            relabeled = uid + ':' + str(internal.edge_length)
            if bootstrap:
                relabeled += '[' + str(int(float(bootstrap)*100 + 0.5)) + ']'
            internal.label = relabeled

        # write out reduced tree, with and without internal node labels
        tree.write_to_path('./experiments/classTree.tre', schema='newick', suppress_rooting=True, suppress_edge_lengths=True, unquoted_underscores=True)
        tree.write_to_path('./experiments/classTree_no_internal.tre', schema='newick', suppress_rooting=True, suppress_edge_lengths=True, unquoted_underscores=True, suppress_internal_node_labels=True)
def run(self, outputFile):
    """Dump the IMG gene count table to outputFile and report copy-number
    statistics for the pfam00318 gene family.

    outputFile: path the stringified count table is written to.
    """
    img = IMG()

    print('Identifying all IMG prokaryotic genomes with valid data.')
    metadata = img.genomeMetadata()
    genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
    genomeMissingData = img.genomesWithMissingData(genomeIds)
    genomeIds -= genomeMissingData
    print(' Identified %d valid genomes.' % (len(genomeIds)))

    print('Calculating gene copy number for each genome.')
    countTable = img.geneCountTable(genomeIds)

    # fixed: dict.iteritems() does not exist in Python 3 (this block already
    # uses print() calls); items() behaves the same here under both versions
    counts = [count for _, count in countTable['pfam00318'].items()]

    print(len(genomeIds))
    print(len(counts))
    print(mean(counts))

    # 'with' guarantees the handle is closed even if the write fails
    with open(outputFile, 'w') as fout:
        fout.write(str(countTable))

    print('Gene count dictionary to: ' + outputFile)
def __init__(self, outputDir): self.img = IMG() self.markerSetBuilder = MarkerSetBuilder() if os.path.exists(outputDir): print '[Error] Output directory already exists: ' + outputDir sys.exit(0) else: os.makedirs(outputDir) self.__checkForHMMER() self.__checkForFastTree() self.hmmDir = os.path.join(outputDir, 'phylo_hmms') self.alignmentDir = os.path.join(outputDir, 'gene_alignments') self.geneTreeDir = os.path.join(outputDir, 'gene_trees') self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific') self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final') self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv') self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa') self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta') self.treeOut = os.path.join(outputDir, 'genome_tree.tre') self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre') self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre') self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre') self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre') self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre') self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre') self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre') self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv') self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv') self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm') self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt') self.phyloUbiquity = 0.90 self.phyloSingleCopy = 0.90 self.paralogAcceptPer = 0.01 #self.consistencyAcceptPer = 0.95 # for trees at the class-level self.consistencyAcceptPer = 0.906 # for trees at the phylum-level self.consistencyMinTaxa = 20 # create output directories os.makedirs(self.hmmDir) 
os.makedirs(self.alignmentDir) os.makedirs(self.geneTreeDir) os.makedirs(self.conspecificGeneTreeDir) os.makedirs(self.finalGeneTreeDir)
class MarkerSetStabilityTest(object):
    """Measure how stable a lineage's marker gene set is under subsampling:
    the marker set from all genomes in a lineage is compared against marker
    sets recomputed from random 90% subsets of those genomes."""

    def __init__(self):
        # IMG database wrapper and marker-set calculator used by the workers
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                # None is the sentinel telling this worker to shut down
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                # 100 random 90% subsamples; record % difference of each
                # subset marker set relative to the full marker set
                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

            if perChange != []:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                # lineage too small to evaluate; -1 marks "not computed"
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                # sentinel from run(): all workers have finished
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Testing stability of marker sets:')

        # fixed seed so the random subsampling is reproducible between runs
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one None sentinel per worker so every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # tell the writer process all results are in
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
def __init__(self):
    # handle to the IMG database wrapper used by the other methods
    self.img = IMG()
def __init__(self):
    """Cache the IMG database wrapper and marker-set builder helpers."""
    self.img = IMG()
    self.markerSetBuilder = MarkerSetBuilder()
class Simulation(object):
    """Simulate partial genomes to identify, for each internal branch of the
    reference genome tree, the lineage-specific marker set that best predicts
    completeness and contamination."""

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

        # length (bp) of the simulated contigs drawn from each test genome
        self.simContigLen = 10000

    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node."""

        # get genomes descendant from each child of the specified internal node
        leaves = []
        for child in internalNode.child_nodes():
            genomeIds = set()
            for leaf in child.leaf_nodes():
                genomeId = leaf.taxon.label.replace('IMG_', '')
                genomeIds.add(genomeId)

                # also include genomes dereplicated away as duplicates of this leaf
                duplicateGenomes = self.markerSetBuilder.duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    dupId = dup.replace('IMG_', '')
                    genomeIds.add(dupId)

            leaves.append(genomeIds)

        # make sure each set of leaves contains at least a minimum number of genomes
        orderedLeaves = sorted(leaves, key=len)
        if len(orderedLeaves[0]) < 5:
            # too few genomes to hold out; report a placeholder result
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return

        # calculate marker genes with all genomes in lineage with the fewest genomes removed
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = orderedLeaves[0])

        # evaluate accuracy of completeness and contamination estimations on
        # different partial genomes from lineage with fewest genomes
        testGenomeIds = random.sample(orderedLeaves[0], min(len(orderedLeaves[0]), 100))
        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)
        for testGenomeId in testGenomeIds:
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))

            # 100 random (completeness, contamination) draws per test genome
            repsPerGenome = 100
            for _ in xrange(0, repsPerGenome):
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)

                for ms in binMarkerGenes.markerSetIter():
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)

                    if completeness == 0.0:
                        # debugging aid: a sampled partial genome should never
                        # score zero completeness; dump state and abort
                        print ms.getMarkerGenes()
                        print geneDistTable[testGenomeId]
                        print startPartialGenomeContigs
                        print genomeSize
                        print '*****************' + testGenomeId
                        sys.exit()

                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)

        # determine lineage-specific marker set with best average performance
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        # NOTE(review): if deltaComp is empty, dCompStdBest/dContStdBest are
        # never bound and the queueOut.put below raises NameError; presumably
        # at least one marker set always produces estimates -- confirm
        for lineageStr in deltaComp:
            dComp, dCont = mean(abs(array(deltaComp[lineageStr]))), mean(abs(array(deltaCont[lineageStr])))

            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(abs(array(deltaComp[lineageStr])))
                dContStdBest = std(abs(array(deltaCont[lineageStr])))
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode == None:
                # None is the shutdown sentinel for this worker
                break

            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)

    def __writerThread(self, numInternalNodes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('/tmp/simInferBestMarkerSet.tsv', 'w')
        fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

        itemsProcessed = 0
        while True:
            internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
            if internalNode == None:
                break

            itemsProcessed += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            # 'NA' marks branches skipped for having too few genomes
            if internalNode != 'NA':
                fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

        fout.close()
        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        """Evaluate the best marker set for every internal branch of the
        reference genome tree using numThreads worker processes."""

        # fixed seed so simulations are reproducible between runs
        random.seed(0)

        print '\n  Calculating global gene count table.'
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())

        print '\n  Reading reference genome tree.'
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print ' Evaluating %d internal nodes.' % len(tree.internal_nodes())

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # the root has no parent edge, so it is excluded
        for internalNode in tree.internal_nodes():
            if internalNode.parent_node != None:
                workerQueue.put(internalNode)

        # one None sentinel per worker so every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        metadata = self.img.genomeMetadata()

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(tree.internal_nodes())-1, writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # tell the writer process all results are in
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
def run(self, geneTreeDir, treeExtension, consistencyThreshold, minTaxaForAverage, outputFile, outputDir):
    """Score every gene tree in geneTreeDir for taxonomic consistency at the
    domain, phylum, and class ranks; copy trees whose average consistency is
    at least consistencyThreshold into outputDir and write a combined report
    to outputFile."""

    # make sure output directory is empty
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    files = os.listdir(outputDir)
    for f in files:
        if os.path.isfile(os.path.join(outputDir, f)):
            os.remove(os.path.join(outputDir, f))

    # get TIGRFam info (short/long descriptions per accession)
    descDict = {}
    files = os.listdir('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO')
    for f in files:
        shortDesc = longDesc = ''
        # NOTE(review): 'acc' retains its value from the previous file if an
        # INFO file has no 'AC' line -- presumably every file has one; confirm
        for line in open('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO/' + f):
            lineSplit = line.split(' ')
            if lineSplit[0] == 'AC':
                acc = lineSplit[1].strip()
            elif lineSplit[0] == 'DE':
                shortDesc = lineSplit[1].strip()
            elif lineSplit[0] == 'CC':
                longDesc = lineSplit[1].strip()

        descDict[acc] = [shortDesc, longDesc]

    # get PFam info
    for line in open('/srv/db/pfam/27/Pfam-A.clans.tsv'):
        lineSplit = line.split('\t')
        acc = lineSplit[0]
        shortDesc = lineSplit[3]
        longDesc = lineSplit[4].strip()
        descDict[acc] = [shortDesc, longDesc]

    # get IMG taxonomy
    img = IMG()
    metadata = img.genomeMetadata()
    genomeIdToTaxonomy = {}
    for genomeId, m in metadata.iteritems():
        genomeIdToTaxonomy[genomeId] = m['taxonomy']

    # perform analysis for each tree
    treeFiles = os.listdir(geneTreeDir)
    allResults = {}
    allTaxa = [set([]), set([]), set([])]   # taxa seen at ranks 0..2
    taxaCounts = {}
    avgConsistency = {}
    for treeFile in treeFiles:
        if not treeFile.endswith(treeExtension):
            continue

        print treeFile
        tree = dendropy.Tree.get_from_path(os.path.join(geneTreeDir, treeFile), schema='newick', as_rooted=True, preserve_underscores=True)

        # per-rank map: taxon name -> best consistency found so far
        domainConsistency = {}
        phylaConsistency = {}
        classConsistency = {}
        consistencyDict = [domainConsistency, phylaConsistency, classConsistency]

        # get abundance of taxa at different taxonomic ranks
        totals = [{}, {}, {}]
        leaves = tree.leaf_nodes()
        print ' Number of leaves: ' + str(len(leaves))
        totalValidLeaves = 0
        for leaf in leaves:
            genomeId = self.__genomeId(leaf.taxon.label)
            if genomeId not in metadata:
                print '[Error] Genome is missing metadata: ' + genomeId
                sys.exit()

            totalValidLeaves += 1
            taxonomy = genomeIdToTaxonomy[genomeId]
            for r in xrange(0, 3):
                totals[r][taxonomy[r]] = totals[r].get(taxonomy[r], 0) + 1
                consistencyDict[r][taxonomy[r]] = 0
                allTaxa[r].add(taxonomy[r])

        taxaCounts[treeFile] = [totalValidLeaves, totals[0].get('Bacteria', 0), totals[0].get('Archaea', 0)]

        # find highest consistency nodes (congruent descendant taxa / (total taxa + incongruent descendant taxa))
        internalNodes = tree.internal_nodes()
        for node in internalNodes:
            leaves = node.leaf_nodes()

            for r in xrange(0, 3):
                leafCounts = {}
                for leaf in leaves:
                    genomeId = self.__genomeId(leaf.taxon.label)
                    taxonomy = genomeIdToTaxonomy[genomeId]
                    leafCounts[taxonomy[r]] = leafCounts.get(taxonomy[r], 0) + 1

                # calculate consistency for node
                for taxa in consistencyDict[r]:
                    totalTaxaCount = totals[r][taxa]
                    # singleton or unclassified taxa cannot be scored
                    if totalTaxaCount <= 1 or taxa == 'unclassified':
                        consistencyDict[r][taxa] = 'N/A'
                        continue

                    taxaCount = leafCounts.get(taxa, 0)
                    incongruentTaxa = len(leaves) - taxaCount
                    c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                    if c > consistencyDict[r][taxa]:
                        consistencyDict[r][taxa] = c

                    # consider clan in other direction since the trees are unrooted
                    taxaCount = totalTaxaCount - leafCounts.get(taxa, 0)
                    incongruentTaxa = totalValidLeaves - len(leaves) - taxaCount
                    c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                    if c > consistencyDict[r][taxa]:
                        consistencyDict[r][taxa] = c

        # write per-tree results
        consistencyDir = os.path.join(outputDir, 'consistency')
        if not os.path.exists(consistencyDir):
            os.makedirs(consistencyDir)
        fout = open(os.path.join(consistencyDir, treeFile + '.results.tsv'), 'w')
        fout.write('Tree')
        for r in xrange(0, 3):
            for taxa in sorted(consistencyDict[r].keys()):
                fout.write('\t' + taxa)
        fout.write('\n')

        fout.write(treeFile)
        for r in xrange(0, 3):
            for taxa in sorted(consistencyDict[r].keys()):
                if consistencyDict[r][taxa] != 'N/A':
                    fout.write('\t%.2f' % (consistencyDict[r][taxa]*100))
                else:
                    fout.write('\tN/A')
        fout.close()

        # calculate average consistency at each taxonomic rank
        average = []
        for r in xrange(0, 3):
            sumConsistency = []
            for taxa in consistencyDict[r]:
                if totals[r][taxa] > minTaxaForAverage and consistencyDict[r][taxa] != 'N/A':
                    sumConsistency.append(consistencyDict[r][taxa])

            if len(sumConsistency) > 0:
                average.append(sum(sumConsistency) / len(sumConsistency))
            else:
                average.append(0)

        avgConsistency[treeFile] = average
        allResults[treeFile] = consistencyDict

        print ' Average consistency: ' + str(average) + ', mean = %.2f' % (sum(average)/len(average))
        print ''

    # print out combined results
    # NOTE(review): the header includes an 'Alignment Length' column, but no
    # corresponding value is written per row -- columns after 'Long Desc.'
    # are shifted; confirm against downstream consumers
    fout = open(outputFile, 'w')
    fout.write('Tree\tShort Desc.\tLong Desc.\tAlignment Length\t# Taxa\t# Bacteria\t# Archaea\tAvg. Consistency\tAvg. Domain Consistency\tAvg. Phylum Consistency\tAvg. Class Consistency')
    for r in xrange(0, 3):
        for t in sorted(allTaxa[r]):
            fout.write('\t' + t)
    fout.write('\n')

    filteredGeneTrees = 0
    retainedGeneTrees = 0
    for treeFile in sorted(allResults.keys()):
        consistencyDict = allResults[treeFile]

        treeId = treeFile[0:treeFile.find('.')].replace('pfam', 'PF')
        fout.write(treeId + '\t' + descDict[treeId][0] + '\t' + descDict[treeId][1])

        # Taxa count
        fout.write('\t' + str(taxaCounts[treeFile][0]) + '\t' + str(taxaCounts[treeFile][1]) + '\t' + str(taxaCounts[treeFile][2]))

        # average of the three per-rank averages
        avgCon = 0
        for r in xrange(0, 3):
            avgCon += avgConsistency[treeFile][r]
        avgCon /= 3

        fout.write('\t' + str(avgCon))

        if avgCon >= consistencyThreshold:
            retainedGeneTrees += 1
            os.system('cp ' + os.path.join(geneTreeDir, treeFile) + ' ' + os.path.join(outputDir, treeFile))
        else:
            filteredGeneTrees += 1
            # NOTE(review): '% s' (with a space flag) renders the same as
            # '%s' for strings, but looks like a typo -- confirm intent
            print 'Filtered % s with an average consistency of %.4f.' % (treeFile, avgCon)

        for r in xrange(0, 3):
            fout.write('\t' + str(avgConsistency[treeFile][r]))

        for r in xrange(0, 3):
            for t in sorted(allTaxa[r]):
                if t in consistencyDict[r]:
                    if consistencyDict[r][t] != 'N/A':
                        fout.write('\t%.2f' % (consistencyDict[r][t]*100))
                    else:
                        fout.write('\tN/A')
                else:
                    fout.write('\tN/A')
        fout.write('\n')

    fout.close()

    print 'Retained gene trees: ' + str(retainedGeneTrees)
    print 'Filtered gene trees: ' + str(filteredGeneTrees)
def __init__(self):
    """Cache the IMG database wrapper and the marker-set calculator."""
    self.markerset = MarkerSet()
    self.img = IMG()
class DecorateTree(object):
    """Decorate each internal node of a genome tree with a unique ID, its
    inferred taxonomy, genome statistics, and a lineage-specific marker set."""

    def __init__(self):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
        """Return mean and standard deviation of a metadata field over a set
        of genomes, skipping genomes where the value is 'NA'."""
        values = []
        for genomeId in genomeIds:
            genomeId = genomeId.replace('IMG_', '')
            v = metadata[genomeId][category]
            if v != 'NA':
                values.append(v)

        return mean(values), std(values)

    def __calculateMarkerSet(self, genomeLabels, ubiquityThreshold = 0.97, singleCopyThreshold = 0.97):
        """Calculate marker set for a set of genomes."""

        # get genome IDs from genome labels
        genomeIds = set()
        for genomeLabel in genomeLabels:
            genomeIds.add(genomeLabel.replace('IMG_', ''))

        markerSet = self.markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
        """Map PFAM model identifiers to full accession numbers by scanning
        the ACC lines of the PFAM HMM file."""
        pfamIdToPfamAcc = {}
        for line in open(img.pfamHMMs):
            if 'ACC' in line:
                acc = line.split()[1].strip()
                pfamId = acc.split('.')[0]

                pfamIdToPfamAcc[pfamId] = acc

        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut, numThreads):
        """Decorate inputTreeFile in place and write per-node statistics to
        metadataOut, using numThreads worker processes."""

        # read genome metadata
        print ' Reading metadata.'
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print ' Read list of taxa with duplicate sequences.'
        duplicateTaxa = {}
        for line in open(derepFile):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                # first token is the representative; the rest are duplicates
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # build gene count table
        print ' Building gene count table.'
        genomeIds = self.img.genomeMetadata().keys()
        print ' # trusted genomes = ' + str(len(genomeIds))

        # calculate statistics for each internal node using multiple threads
        print ' Calculating statistics for each internal node.'
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads):
        """Fan internal nodes out to worker processes and funnel the results
        through a single writer process."""

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        # one sentinel pair per worker so every worker terminates
        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [mp.Process(target = self.__processInternalNode, args = (taxaTree, duplicateTaxa, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportStatistics, args = (metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # tell the writer process all results are in
        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId == None:
                break

            # find corresponding internal node in taxa tree; include genomes
            # dereplicated away as duplicates of each leaf
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        labels.append(genomeId)

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels = labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique Id while retraining bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap, node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Store statistics for internal node."""
        fout = open(metadataOut, 'w')
        fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
        fout.write('\tGC mean\tGC std')
        fout.write('\tGenome size mean\tGenome size std')
        fout.write('\tGene count mean\tGene count std')
        fout.write('\tMarker set')
        fout.write('\n')

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())
        while True:
            uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(block=True, timeout=None)
            if uniqueId == None:
                break

            numProcessedNodes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) internal nodes.' % (numProcessedNodes, numInternalNodes, float(numProcessedNodes)*100/numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' + taxaStr + '\t' + bootstrap)

            m, s = self.__meanStd(metadata, labels, 'GC %')
            fout.write('\t' + str(m*100) + '\t' + str(s*100))

            m, s = self.__meanStd(metadata, labels, 'genome size')
            fout.write('\t' + str(m) + '\t' + str(s))

            m, s = self.__meanStd(metadata, labels, 'gene count')
            fout.write('\t' + str(m) + '\t' + str(s))

            # change model names to accession numbers, and make sure there is
            # an HMM model for each PFAM
            # NOTE(review): 's' is rebound here from the std above -- harmless
            # since the std has already been written, but easy to misread
            mungedMarkerSets = []
            for geneSet in markerSet:
                s = set()
                for geneId in geneSet:
                    if 'pfam' in geneId:
                        pfamId = geneId.replace('pfam', 'PF')
                        if pfamId in pfamIdToPfamAcc:
                            s.add(pfamIdToPfamAcc[pfamId])
                    else:
                        s.add(geneId)
                mungedMarkerSets.append(s)

            fout.write('\t' + str(mungedMarkerSets))

            fout.write('\n')

            # relabel the corresponding node in the tree being decorated
            node = inputTree.find_node(filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
            node.label = nodeLabel

        sys.stdout.write('\n')
        fout.close()

        # overwrite the input tree with the decorated version
        inputTree.write_to_path(inputTreeFile, schema='newick', suppress_rooting=True, unquoted_underscores=True)
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted genomes (highly complete, low contamination) per
    domain using marker sets built from finished genomes; write trusted,
    filtered, and combined genome tables plus lineage statistics.

    Genomes passing both the trustedCompleteness and trustedContamination
    thresholds are written to outputMetadataFile and genomes_trusted.tsv;
    all others go to genomes_filtered.tsv.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file; bucket genome IDs by domain and remember the
    # raw metadata line so trusted genomes can be echoed to metadataOut
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)
    allGenomes = defaultdict(set)
    metadataLine = {}

    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # copy the header straight through to the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)

        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.iteritems():
        print '[' + lineage + ']'
        print ' Number of genomes: %d' % len(allLineageGenomeIds)

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print '\nDetermining initial marker gene sets for genome filtering.'
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

        print ' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets())
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print '\nIdentifying highly complete, low contamination genomes.'
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                # only trusted genomes are echoed to the output metadata file
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print ' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds))
        print ' ' + str(filteredStatus)
        print ' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds))
        print ' ' + str(retainedStatus)

        # determine status of retained genomes
        print '\nTrusted genomes by phylum:'
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.iteritems():
            print ' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count)
        print ''

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in xrange(0, 6): # Domain to Genus
        for genomeId, data in metadata.iteritems():
            taxaStr = ';'.join(data['taxonomy'][0:r+1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n')
    fout.close()
def __init__(self):
    """Load and cache IMG genome metadata for later lookups."""
    self.metadata = IMG().genomeMetadata()
class GenomeTreeWorkflow(object):
    """Pipeline for inferring a genome tree from lineage-specific marker genes.

    Builds marker gene alignments with HMMER, infers and filters gene trees
    (paralog and taxonomic-consistency tests), and finally infers a
    concatenated genome tree, relabelling IMG identifiers with ACE identifiers.
    """

    def __init__(self, outputDir):
        """Set up output paths and verify external dependencies.

        Exits if outputDir already exists or if HMMER/FastTree are missing.
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print('[Error] Output directory already exists: ' + outputDir)
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        # output files and directories produced by the workflow
        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')
        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # thresholds controlling marker selection and gene tree filtering
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        #self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906    # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""
        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except Exception:
            print("Unexpected error!", sys.exc_info()[0])
            raise
        if exit_status != 0:
            print("[Error] hmmfetch is not on the system path")
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""
        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except Exception:
            print("Unexpected error!", sys.exc_info()[0])
            raise
        if exit_status != 0:
            print("[Error] FastTree is not on the system path")
            sys.exit()

    def __genesInGenomes(self, genomeIds):
        """Map each genome to a dict of marker id -> set of gene ids.

        Reads the per-genome PFAM and TIGRFAM hit tables (marker id columns
        8 and 6, respectively; gene id in column 0).
        """
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Extract the HMM for each marker gene into outputModelDir via hmmfetch."""
        # PFAM models must be fetched by NAME, so first build an accession->name map
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]  # strip version suffix
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' + markerIdToName[markerId] + ' > ' + os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm'))
            else:
                os.system('hmmfetch /srv/whitlam/bio/db/tigrfam/13.0/' + markerId + '.HMM ' + markerId + ' > ' + os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for markerId in markerGenes:
            workerQueue.put(markerId)

        # one sentinel per worker so every process terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target = self.__runHmmAlign, args = (genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__reportThreads, args = (len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            # PFAM model files are named with the 'PF' prefix
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            # collect marker gene sequences from all genomes into one FASTA file
            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')
                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Report progress of marker gene alignment to stdout."""
        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId is None:
                break

            numProcessedGenes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes)*100/numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment; the '#=GC RF' line defines the match-state mask
        seqs = {}
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        # output masked sequences in FASTA format; keep only match-state ('x') columns
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.items():
            fout.write('>' + seqId + '\n')
            maskedSeq = ''.join([seq[i] for i in range(0, len(seq)) if mask[i] == 'x'])
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""
        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        """Align marker genes for the ingroup plus a sampled outgroup.

        Returns the set of genome ids (ingroup + sampled outgroup) used.
        """
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print('')
        print('Identifying genomes and marker genes of interest:')
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)
        #alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        print(' Identified ingroup genomes: %d' % len(ingroupGenomeIds))
        print(' Identified outgroup genomes: %d' % len(outgroupGenomeIds))

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print('')
        print(' Selecting %d taxa from the outgroup.' % (numOutgroupTaxa))
        # random.sample requires a sequence (not a set) on Python 3.11+
        genomeIds = ingroupGenomeIds.union(random.sample(list(outgroupGenomeIds), numOutgroupTaxa))
        self.imgIdsToAceIds(genomeIds)

        print(' Identified markers: %d' % len(ingroupMarkers))

        # get mapping of marker ids to gene ids for each genome
        print(' Determine genes for genomes of interest.')
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print(' Fetching HMM for each marker genes.')
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print(' Aligning marker genes:')
        self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Read the IMG -> ACE id mapping and report ids without a mapping."""
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print(' Number of genomes without an ACE id: ' + str(missing))

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Read the ACE -> IMG id mapping from the reference mapping file."""
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Execute the full workflow: alignments, gene trees, filtering, genome tree."""
        # identify genes suitable for phylogenetic inference
        print('--- Identifying genes suitable for phylogenetic inference ---')
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print('')
        print('--- Inferring gene trees ---')
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print('')
        print('--- Testing for paralogs in gene trees ---')
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print('')
        print('--- Testing taxonomic consistency of gene trees ---')
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print('')
        print('--- Gathering phylogenetically informative HMMs ---')
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print('')
        print('--- Inferring full genome tree ---')
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues = True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())
        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])
        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
class Simulation(object):
    """Ad hoc benchmark/regression harness for marker set construction.

    Times MarkerSetBuilder.buildMarkerSet() over all IMG bacterial genomes and
    compares the computed marker set against a hard-coded expected result.
    """

    def __init__(self):
        # helpers for building marker sets and accessing IMG genome data
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def run(self):
        print('\n Reading reference genome tree.')
        # reference tree shipped alongside the script
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all Finished, Trusted genomes
        metadata = self.img.genomeMetadata()

        bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria', metadata)
        print('# Bacteria: %d' % len(bacteriaIds))

        # timing scaffolding; the precompute calls themselves are currently disabled
        start = time.time()
        #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print('globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print('precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000)
        end = time.time()
        print('precomputeGenomeFamilyPositions: %.2f' % (end - start))

        #start = time.time()
        #test = self.img.geneDistTable(metadata.keys(), self.markerSetBuilder.globalGeneCountTable.keys(), spacingBetweenContigs=1e6)
        #end = time.time()
        #print 'geneDistTable: %.2f' % (end - start)
        #t = raw_input('waiting...')

        # time construction of the bacterial marker set
        start = time.time()
        #testGenomeId = archaeaIds.pop()
        #testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
        #binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, 0.97, 0.97, bMarkerSet = True)
        markerSet = self.markerSetBuilder.buildMarkerSet(bacteriaIds, 0.97, 0.97)
        end = time.time()
        print('buildMarkerSet: %.2f' % (end - start))

        print(len(markerSet.markerSet))

        # expected marker set; eval() is acceptable here because the string is a
        # hard-coded literal, not untrusted input
        test = eval("[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]")
        print(len(test))

        # report each computed marker set that matches one of the expected sets;
        # flag computed sets that have no expected counterpart
        for ms in markerSet.markerSet:
            bMatch = False
            for tms in test:
                if tms == ms:
                    print(ms)
                    print(tms)
                    print('---------')
                    bMatch = True
                    break

            if not bMatch:
                print('BOO!')

        # regression check: the string representation of the computed marker set
        # must match the expected literal exactly
        if str(markerSet.markerSet) == "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]":
            print('Good to go!')
        else:
            print('oh, shit!!!!')
def run(self, metadataFile, percentThreshold):
    """Identify TIGRFAM HMMs that are redundant with a PFAM HMM.

    For every 'Finished' genome, tabulate genes hit by both a PFAM and a
    TIGRFAM model. A TIGRFAM is reported as redundant with a PFAM when the
    fraction of genomes in which it hits the same ORF as that PFAM is at
    least percentThreshold. Pairs are written to ../data/pfam/tigrfam2pfam.tsv.

    Args:
        metadataFile: IMG metadata file describing the genomes to process.
        percentThreshold: fraction (0-1) of genomes required to deem a
            TIGRFAM redundant.
    """
    img = IMG()
    metadata = img.genomeMetadataFromFile(metadataFile)

    matches = defaultdict(set)    # 'pfam-tigr' key -> genomes where both hit the same gene
    pfamCount = defaultdict(set)  # pfam id -> genomes with at least one hit
    tigrCount = defaultdict(set)  # tigr id -> genomes with at least one hit
    for genomeCounter, genomeId in enumerate(metadata):
        statusStr = ' Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, len(metadata), float(genomeCounter+1)*100/len(metadata))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        # only finished genomes have reliable, complete annotations
        if metadata[genomeId]['status'] != 'Finished':
            continue

        pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension
        if not os.path.exists(pfamFile):
            continue

        # get PFAM hits (gene id in column 0, pfam id in column 8; skip header)
        geneIdToPfams = defaultdict(set)
        bHeader = True
        for line in open(pfamFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            geneIdToPfams[lineSplit[0]].add(lineSplit[8])
            pfamCount[lineSplit[8]].add(genomeId)

        # get TIGRFAM hits (gene id in column 0, tigr id in column 6; skip header)
        geneIdToTigr = defaultdict(set)
        bHeader = True
        for line in open(img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            geneIdToTigr[lineSplit[0]].add(lineSplit[6])
            tigrCount[lineSplit[6]].add(genomeId)

        # keep track of TIGRFAMs matching the same gene as a PFAM
        geneIds = set(geneIdToPfams.keys()).union(set(geneIdToTigr.keys()))
        for geneId in geneIds:
            pfams = geneIdToPfams.get(geneId)
            tigrs = geneIdToTigr.get(geneId)
            if pfams is None or tigrs is None:
                continue

            for pfamId in pfams:
                for tigrId in tigrs:
                    matches[pfamId + '-' + tigrId].add(genomeId)

    sys.stdout.write('\n')

    # find TIGRFAMs that generally hit the same gene as a PFAM
    fout = open('../data/pfam/tigrfam2pfam.tsv', 'w')
    for key, genomeSet in matches.items():
        pfam, tigr = key.split('-')

        # deem a TIGRFAM HMM redundant if it is almost always hits that
        # same ORF as a PFAM HMM
        if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
            fout.write(pfam + '\t' + tigr + '\n')
    fout.close()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Filter IMG genomes to a trusted (highly complete, low contamination) subset.

    Builds a marker set per domain from Finished genomes, scores every genome
    against it, and writes per-genome results (genomes_all/trusted/filtered.tsv),
    a filtered metadata file, and per-lineage statistics (lineage_stats.tsv).

    Args:
        inputMetadataFile: IMG metadata file for all candidate genomes.
        outputMetadataFile: metadata file restricted to trusted genomes.
        outputDir: directory for all result files.
        ubiquityThreshold, singleCopyThreshold: marker gene selection thresholds.
        trustedCompleteness, trustedContamination: completeness/contamination
            cutoffs a genome must meet to be deemed trusted.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file; group genomes by domain (column 1) and
    # remember Finished genomes (column 2) for marker set construction
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)
    allGenomes = defaultdict(set)
    metadataLine = {}
    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)

        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()))
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}
        filteredStatus = {}
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                # tally retained genomes by their assembly status
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)
                # tally filtered genomes by their assembly status
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}
    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
class MarkerSetSelection(object):
    """Evaluate how well lineage-specific marker sets are selected for test genomes.

    Compares the marker-set node chosen by ascending a reference genome tree
    (LOO-style) against the best-possible marker set according to simulation
    results, and reports completeness/contamination deltas.
    """

    def __init__(self):
        # summary file produced by the marker-set tuning simulations
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        # taxonomic rank used for leave-one-out exclusion (5 = genus; index into taxonomy list)
        self.looRank = 5
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self, genomeIds, ubiquityThreshold=0.97, singleCopyThreshold=0.97, stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        # quick escape for lineage that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker sets using a LOO-testing
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(
                looGenomeIds, geneCountTable,
                ubiquityThreshold * len(looGenomeIds),
                singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove

            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))

        # stable if the average pairwise change is a small fraction of the average set size
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        """Return the patristic (along-branch) distance between two leaf taxa."""
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node == None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            # sum edge lengths from each taxon up to the MRCA
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
        """True if the genome's distance to the marker-set node is below the
        given percentile of leaf-to-node distances."""
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the giving genome."""
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                # also count genomes collapsed out of the tree as duplicates of this leaf
                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node

        if curNode == None:
            # reach root so use universal marker set
            uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set."""
        # best = minimum combined completeness + contamination error
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                # sentinel: no more work
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')

        itemsToProcess = 0

        dComps = []
        dConts = []
        dCompsPer = []
        dContsPer = []

        bestComp = []
        bestCont = []

        selectedComp = []
        selectedCont = []

        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId == None:
                # sentinel from the main process: all workers finished
                break

            itemsToProcess += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 / (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)

            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)
            # guard against division by ~0 contamination via max(..., 0.01)
            dCompsPer.append(dComp * 100.0 / dCompBest)
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)

            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print(' General results:')
        print(' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print(' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print(' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)))
        print(' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)))
        print('')
        print(' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print(' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print(' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)))
        print(' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
        """Sum of edge lengths from a leaf up to a given ancestral node."""
        dist = 0
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
        """Return (# nodes to bin, distance to bin, mean leaf distance) for the best marker-set node."""
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode != None:
            uniqueId = curNode.label.split('|')[0]
            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length
            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
        """Report summary statistics describing where the best marker-set nodes sit in the tree."""
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print(' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)))
        print(' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)))
        print(' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)))
        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print(' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf))))
        print(' Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        """Evaluate marker-set selection for all simulated test genomes using numThreads workers."""
        # read reference tree
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print(' Reading simulation results.')
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()

            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()

                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print(' Evaluating %d test genomes.' % len(simResults))

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)

        # one sentinel per worker so every worker terminates
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # sentinel tuple terminates the writer process
        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()
class GenomeTreeWorkflow(object):
    """End-to-end workflow for inferring a genome tree from marker genes.

    Identifies phylogenetically informative marker genes, aligns them,
    infers and filters gene trees, and builds the final concatenated
    genome tree.  All results are written under a fresh output directory.

    Converted to Python 3: ``print`` statements, ``iteritems`` and
    ``xrange`` replaced to match the rest of this file.
    """

    def __init__(self, outputDir):
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        # refuse to clobber a previous run
        if os.path.exists(outputDir):
            print('[Error] Output directory already exists: ' + outputDir)
            sys.exit(0)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # thresholds controlling marker selection and gene-tree filtering
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""
        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except:
            print("Unexpected error!", sys.exc_info()[0])
            raise

        if exit_status != 0:
            print("[Error] hmmfetch is not on the system path")
            sys.exit()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""
        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print("Unexpected error!", sys.exc_info()[0])
            raise

        if exit_status != 0:
            print("[Error] FastTree is not on the system path")
            sys.exit()

    def __genesInGenomes(self, genomeIds):
        """Map each genome to {marker id -> set of gene ids} from its PFAM/TIGRFAM hit tables."""
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)
            # PFAM hits: column 8 is the PFAM id, column 0 the gene id
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.pfamExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[8]].add(lineSplit[0])

            # TIGRFAM hits: column 6 is the TIGRFAM id
            for line in open(os.path.join(IMG.genomeDir, genomeId, genomeId + IMG.tigrExtension)):
                lineSplit = line.split('\t')
                markerIdToGeneIds[lineSplit[6]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Extract the HMM for each marker gene into its own file via hmmfetch."""
        # map our 'pfamNNNNN' ids to the model NAME field required by hmmfetch
        markerIdToName = {}
        for line in open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'):
            if 'NAME' in line:
                name = line.split()[1].rstrip()
            elif 'ACC' in line:
                acc = line.split()[1].rstrip()
                markerId = acc.replace('PF', 'pfam')
                markerId = markerId[0:markerId.rfind('.')]
                markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' + markerIdToName[markerId] + ' > ' + os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm'))
            else:
                os.system('hmmfetch /srv/whitlam/bio/db/tigrfam/13.0/' + markerId + '.HMM ' + markerId + ' > ' + os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Perform multithreaded alignment of marker genes using HMM align."""
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for _, markerId in enumerate(markerGenes):
            workerQueue.put(markerId)

        # one sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__runHmmAlign, args=(genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__reportThreads, args=(len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Run each marker gene in a separate thread."""
        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            # gather all sequences hitting this marker across genomes
            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            fout = open(markerSeqFile, 'w')
            for genomeId in genomeIds:
                seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                for geneId in genesInGenomes[genomeId].get(markerId, []):
                    if geneId not in seqs:
                        # this shouldn't be necessary, but the IMG metadata isn't always
                        # perfectly in sync with the sequence data
                        continue
                    fout.write('>' + genomeId + '|' + geneId + '\n')
                    fout.write(seqs[geneId] + '\n')
            fout.close()

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, os.path.join(outputGeneDir, modelName + '.aln.faa'), trim=False, outputFormat='Pfam')
            self.__maskAlignment(os.path.join(outputGeneDir, modelName + '.aln.faa'), os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Report progress of worker threads in a single thread."""
        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId is None:
                break

            numProcessedGenes += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes) * 100 / numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format."""
        # read STOCKHOLM alignment
        seqs = {}
        # NOTE(review): 'mask' is only assigned when a '#=GC RF' line is present;
        # HMMER alignments are expected to always contain one — confirm.
        for line in open(inputFile):
            line = line.rstrip()
            if line == '' or line[0] == '#' or line == '//':
                if 'GC RF' in line:
                    mask = line.split('GC RF')[1].strip()
                continue
            else:
                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        # output masked sequences in FASTA format
        fout = open(outputFile, 'w')
        for seqId, seq in seqs.items():  # py3: was iteritems()
            fout.write('>' + seqId + '\n')
            # keep only match-state columns (marked 'x' in the reference mask)
            maskedSeq = ''.join([seq[i] for i in range(0, len(seq)) if mask[i] == 'x'])  # py3: was xrange
            fout.write(maskedSeq + '\n')
        fout.close()

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""
        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        aceIds = set()
        for line in open('./taxonomicTrees/firmicutes.nds'):
            aceIds.add(line.strip())

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        """Identify ingroup/outgroup genomes and marker genes; returns the selected genome ids."""
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        files = os.listdir(outputGeneDir)
        for f in files:
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print('')
        print('Identifying genomes and marker genes of interest:')
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)
        # alphaGenomeIds, _ = self.__taxonomicMarkers('Bacteria;Proteobacteria;Alphaproteobacteria', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)

        print(' Identified ingroup genomes: %d' % len(ingroupGenomeIds))
        print(' Identified outgroup genomes: %d' % len(outgroupGenomeIds))

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print('')
        print(' Selecting %d taxa from the outgroup.' % (numOutgroupTaxa))
        # list() needed: random.sample no longer accepts sets (Python 3.11+)
        genomeIds = ingroupGenomeIds.union(random.sample(list(outgroupGenomeIds), numOutgroupTaxa))

        self.imgIdsToAceIds(genomeIds)

        print(' Identified markers: %d' % len(ingroupMarkers))

        # get mapping of marker ids to gene ids for each genome
        print(' Determine genes for genomes of interest.')
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print(' Fetching HMM for each marker genes.')
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print(' Aligning marker genes:')
        #***self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Return {IMG id -> ACE id}; reports how many of imgIds have no ACE id."""
        imgIdToAceId = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = 0
        for imgId in imgIds:
            if imgId not in imgIdToAceId:
                missing += 1

        print(' Number of genomes without an ACE id: ' + str(missing))

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Return {ACE id -> IMG id} read from the mapping file."""
        aceIdsToImgIds = {}
        for line in open('ggg_tax_img.feb_2014.txt'):
            lineSplit = line.split('\t')
            aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Execute the full genome-tree workflow."""
        # identify genes suitable for phylogenetic inference
        print('--- Identifying genes suitable for phylogenetic inference ---')
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print('')
        print('--- Inferring gene trees ---')
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print('')
        print('--- Testing for paralogs in gene trees ---')
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print('')
        print('--- Testing taxonomic consistency of gene trees ---')
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print('')
        print('--- Gathering phylogenetically informative HMMs ---')
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print('')
        print('--- Inferring full genome tree ---')
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues=True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = ''.join(f.readlines())

        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])

        fout = open(self.treeOutAce, 'w')
        fout.write(tree)
        fout.close()
def __init__(self):
    """Initialize helpers and defaults for marker-set selection experiments."""
    # project helpers used throughout the analysis
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()
    # simulation summary consumed by run(); LOO exclusion performed at rank 5 (genus)
    self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
    self.looRank = 5
class MarkerSetStability(object):
    """Measure how stable a lineage's marker set is under genome subsampling.

    For each named taxonomic group, the marker set computed from all trusted
    genomes is compared against marker sets computed from random subsets
    (50%..100% in 5% steps, 10 replicates each); the mean/std percent change
    is written to a TSV report.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage == None:
                # sentinel: no more lineages to process
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        # percent of markers that differ between full and subsampled sets
                        perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
            if lineage == None:
                # sentinel from the main process: all workers finished
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Calculating stability of marker sets:')

        # fixed seed so subsampling is reproducible between runs
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']

        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        # one sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
# NOTE(review): this is a second definition of MarkerSetStability and shadows the
# earlier one; the two copies should be reconciled into a single class.
class MarkerSetStability(object):
    """Measure how stable a lineage's marker set is under genome subsampling.

    Converted to Python 3: ``xrange`` and the ``print`` statement replaced,
    matching the rest of this file.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                # sentinel: no more lineages to process
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                for selectPer in range(50, 101, 5):  # py3: was xrange
                    numGenomesToSelect = int(float(selectPer) / 100 * len(genomeIds))
                    perChange = []
                    for _ in range(0, 10):  # py3: was xrange
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        # percent of markers that differ between full and subsampled sets
                        perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
            if lineage is None:
                # sentinel from the main process: all workers finished
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for selectPer in sorted(changeMarkerSetSize.keys()):
                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Calculating stability of marker sets:')  # py3: was a print statement

        # fixed seed so subsampling is reproducible between runs
        random.seed(1)

        # process each sequence in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        #lineages = ['Bacteria']
        #lineages += ['Bacteria;Proteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia']
        #lineages += ['Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;coli']

        #lineages = ['Archaea']
        #lineages += ['Archaea;Euryarchaeota']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales']
        #lineages += ['Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosarcinaceae']

        for lineage in lineages:
            workerQueue.put(lineage)

        # one sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None))
        writeProc.join()
def run(self, metadataFile, percentThreshold):
    """Identify TIGRFAM HMMs that are redundant with a PFAM HMM.

    For every finished genome, PFAM and TIGRFAM hit files are read and a
    TIGRFAM is deemed redundant when the fraction of genomes in which it hits
    the same gene (ORF) as a given PFAM is >= percentThreshold. Redundant
    pairs are written to ../data/pfam/tigrfam2pfam.tsv as 'pfam<TAB>tigr' rows.

    Args:
        metadataFile: IMG metadata file parsed by IMG.genomeMetadataFromFile().
        percentThreshold: fraction in [0, 1] above which a TIGRFAM is
            considered redundant with a co-occurring PFAM.
    """
    img = IMG()
    metadata = img.genomeMetadataFromFile(metadataFile)

    # genomes in which a given 'pfam-tigr' pair hits the same gene
    matches = defaultdict(set)

    # genomes in which each PFAM / TIGRFAM occurs
    # NOTE(review): pfamCount is accumulated but never read; retained for parity
    pfamCount = defaultdict(set)
    tigrCount = defaultdict(set)

    for genomeCounter, genomeId in enumerate(metadata):
        statusStr = ' Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, len(metadata), float(genomeCounter+1)*100/len(metadata))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        if metadata[genomeId]['status'] != 'Finished':
            continue

        pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension
        if not os.path.exists(pfamFile):
            continue

        # get PFAM hits; column 8 holds the PFAM accession
        # (assumed from the original indexing -- TODO confirm IMG file format)
        geneIdToPfams = defaultdict(set)
        with open(pfamFile) as f:  # 'with' fixes the original's leaked handles
            f.readline()  # skip header
            for line in f:
                lineSplit = line.split('\t')
                geneIdToPfams[lineSplit[0]].add(lineSplit[8])
                pfamCount[lineSplit[8]].add(genomeId)

        # get TIGRFAM hits; column 6 holds the TIGRFAM accession
        geneIdToTigr = defaultdict(set)
        with open(img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension) as f:
            f.readline()  # skip header
            for line in f:
                lineSplit = line.split('\t')
                geneIdToTigr[lineSplit[0]].add(lineSplit[6])
                tigrCount[lineSplit[6]].add(genomeId)

        # keep track of TIGRFAMs matching the same gene as a PFAM; only genes
        # with both kinds of hits can contribute, so iterate the intersection
        for geneId in set(geneIdToPfams).intersection(geneIdToTigr):
            for pfamId in geneIdToPfams[geneId]:
                for tigrId in geneIdToTigr[geneId]:
                    matches[pfamId + '-' + tigrId].add(genomeId)

    sys.stdout.write('\n')

    # find TIGRFAMs that generally hit the same gene as a PFAM
    with open('../data/pfam/tigrfam2pfam.tsv', 'w') as fout:
        for key, genomeSet in matches.items():
            pfam, tigr = key.split('-')

            # deem a TIGRFAM HMM redundant if it almost always hits the
            # same ORF as a PFAM HMM
            if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                fout.write(pfam + '\t' + tigr + '\n')
class MarkerSetStabilityTest(object):
    """Assess stability of marker sets by recomputing them on random 90% genome subsets."""

    def __init__(self):
        # project helpers for querying IMG genome data and deriving marker sets
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group."""
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:  # sentinel: no more work
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # reference marker set computed from all genomes in the lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable, ubiquityThreshold * len(genomeIds), singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

                for _ in range(100):  # range (not py2 xrange) for py3 compatibility
                    # marker set for a random 90% subset of genomes
                    subsetGenomeIds = random.sample(genomeIds, numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(subsetGenomeIds, geneCountTable, ubiquityThreshold * numGenomesToSelect, singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    # percent of markers that differ from the reference set
                    perChange.append(float(len(markerGenes.symmetric_difference(subsetMarkerGenes))) * 100.0 / len(markerGenes))

            if perChange:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                # lineage too small to evaluate; -1 flags 'not computed'
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file."""
        fout = open(outputFile, 'w')
        fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

        numProcessedLineages = 0
        while True:
            lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
            if lineage is None:  # sentinel from run(): all workers finished
                break

            numProcessedLineages += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

        sys.stdout.write('\n')
        fout.close()

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups."""
        print(' Testing stability of marker sets:')

        random.seed(1)  # deterministic subsampling across runs

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)
        for _ in range(numThreads):
            workerQueue.put(None)  # one sentinel per worker

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()
        for p in calcProc:
            p.join()

        # 6-tuple sentinel matches __storeResults' unpack
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
def __init__(self):
    """Initialize project helpers and simulation parameters."""
    # helpers for building marker sets and querying IMG genome data
    # (construction order preserved; both constructors are project code
    # whose side effects are not visible here)
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()
    # length in bp of simulated contigs -- presumably consumed by the
    # enclosing class's simulation routines (class header is outside this
    # excerpt); TODO confirm the unit against callers
    self.simContigLen = 10000
# NOTE(review): code below is preserved byte-for-byte (lines collapsed by
# extraction; the two multi-thousand-character string literals span the
# original line boundaries, so no inline comments can be added safely).
# Simulation.run() times buildMarkerSet() over all Bacteria and compares the
# result against a hard-coded expected marker-set listing, once element-wise
# via eval() of an in-source literal and once via a whole-string comparison.
# NOTE(review): eval() here runs on a trusted in-source constant, not external
# input, but ast-based parsing would still be safer if this is ever reworked.
class Simulation(object): def __init__(self): self.markerSetBuilder = MarkerSetBuilder() self.img = IMG() def run(self): print '\n Reading reference genome tree.' treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre') tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True) # get all Finished, Trusted genomes metadata = self.img.genomeMetadata() bacteriaIds = self.img.getGenomesByClade('domain', 'Bacteria', metadata) print '# Bacteria: %d' % len(bacteriaIds) start = time.time() #self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys()) end = time.time() print 'globalGeneCountTable: %.2f' % (end - start) start = time.time() #self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys()) end = time.time() print 'precomputeGenomeSeqLens: %.2f' % (end - start) start = time.time() #self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 5000) end = time.time() print 'precomputeGenomeFamilyPositions: %.2f' % (end - start) #start = time.time() #test = self.img.geneDistTable(metadata.keys(), self.markerSetBuilder.globalGeneCountTable.keys(), spacingBetweenContigs=1e6) #end = time.time() #print 'geneDistTable: %.2f' % (end - start) #t = raw_input('waiting...') start = time.time() #testGenomeId = archaeaIds.pop() #testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId) #binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, 0.97, 0.97, bMarkerSet = True) markerSet = self.markerSetBuilder.buildMarkerSet(bacteriaIds, 0.97, 0.97) end = time.time() print 'buildMarkerSet: %.2f' % (end - start) print len(markerSet.markerSet) test = eval("[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 
'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), 
set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]") print len(test) for ms in markerSet.markerSet: bMatch = False for tms in test: if tms == ms: print ms print tms print '---------' bMatch = True break if not bMatch: print 'BOO!' if str(markerSet.markerSet) == "[set(['TIGR01080', 'pfam08071', 'pfam00900']), set(['pfam08069', 'pfam00312']), set(['pfam00276', 'pfam00573', 'pfam00297']), set(['pfam00333', 'pfam00327', 'pfam03719']), set(['pfam04563', 'pfam04560', 'pfam04983', 'pfam04566', 'pfam04567', 'pfam04565', 'pfam04997', 'pfam00562', 'pfam05000', 'pfam00623', 'pfam01191', 'pfam04561', 'TIGR02389']), set(['pfam00831', 'pfam01868', 'pfam00189', 'pfam00237']), set(['pfam01280', 'pfam00861', 'pfam01655']), set(['pfam01194', 'pfam00380', 'pfam00572']), set(['TIGR00425', 'pfam08068']), set(['pfam01157', 'pfam03874']), set(['pfam00416', 'TIGR01018']), set(['pfam00679', 'pfam03764']), set(['TIGR00344', 'TIGR03683']), set(['pfam01193', 'pfam01000']), set(['pfam00750', 'pfam05746']), set(['pfam00935', 'pfam01667']), set(['pfam00867', 'pfam00752']), set(['pfam01172', 'pfam09377']), set(['pfam03950', 'pfam00749']), set(['pfam00181', 'pfam03947']), set(['pfam00687', 'pfam00466']), set(['TIGR03679', 'TIGR00289']), set(['pfam01198', 'pfam01912']), set(['pfam00673', 'pfam00281']), set(['TIGR00134']), set(['pfam00410']), set(['pfam00411']), set(['pfam01090']), set(['pfam01092']), set(['pfam04919']), set(['TIGR00336']), set(['pfam01864']), set(['TIGR00442']), set(['pfam01866']), set(['pfam01780']), set(['TIGR01046']), set(['pfam00318']), 
set(['pfam00252']), set(['pfam09173']), set(['pfam00238']), set(['pfam01798']), set(['pfam01246']), set(['pfam07541']), set(['pfam00736']), set(['TIGR00522']), set(['pfam01269']), set(['TIGR00329']), set(['pfam01015']), set(['TIGR00392']), set(['pfam00203']), set(['TIGR00398']), set(['pfam01725']), set(['pfam02005']), set(['TIGR00422']), set(['pfam03439']), set(['pfam01351']), set(['pfam01922']), set(['pfam11987']), set(['pfam04127']), set(['TIGR00064']), set(['TIGR00389']), set(['pfam13656']), set(['pfam00298']), set(['TIGR00432']), set(['TIGR03677']), set(['pfam00958']), set(['pfam05221']), set(['pfam00347']), set(['TIGR03685']), set(['pfam03876']), set(['pfam01192']), set(['pfam01984']), set(['pfam00827']), set(['pfam01982']), set(['pfam01981']), set(['TIGR00408']), set(['TIGR00270']), set(['TIGR03665']), set(['pfam02978']), set(['pfam03484']), set(['pfam01201']), set(['TIGR02076']), set(['pfam00832']), set(['pfam00833']), set(['TIGR00419']), set(['pfam00177']), set(['pfam06418']), set(['TIGR00057']), set(['TIGR00549']), set(['pfam13685']), set(['pfam05670']), set(['pfam01849']), set(['TIGR02338']), set(['TIGR00468']), set(['pfam09249']), set(['pfam01287']), set(['pfam00164']), set(['pfam01282']), set(['TIGR03724']), set(['pfam01200']), set(['TIGR02153']), set(['TIGR00670']), set(['pfam00398']), set(['TIGR01213']), set(['pfam06026']), set(['pfam04019']), set(['pfam04010']), set(['pfam00366'])]": print 'Good to go!' else: print 'oh, shit!!!!'
class MarkerSetSelection(object):
    """Evaluate selection of lineage-specific marker sets against simulation results."""

    def __init__(self):
        # summary file produced by the simulation tuning experiments
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'
        # taxonomic rank excluded during leave-one-out evaluation
        self.looRank = 5

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

    def __stabilityTest(self, genomeIds, ubiquityThreshold=0.97, singleCopyThreshold=0.97, stabilityThreshold=0.05):
        """Test stability of marker set for a group of genomes using LOO-testing."""
        # quick escape for lineages that are clearly stable
        if len(genomeIds) > 200:
            return True

        # calculate marker set with each genome left out in turn
        looMarkerGenes = []
        for genomeId in genomeIds:
            looGenomeIds = genomeIds.difference([genomeId])

            # calculate marker genes
            geneCountTable = self.img.geneCountTable(looGenomeIds)
            markerGenes = self.markerSetBuilder.markerGenes(looGenomeIds, geneCountTable, ubiquityThreshold * len(looGenomeIds), singleCopyThreshold * len(looGenomeIds))
            tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
            markerGenes = markerGenes - tigrToRemove

            looMarkerGenes.append(markerGenes)

        # calculate change in marker set for all pairs
        markerSetSize = []
        diffMarkerSet = []
        for i in range(0, len(looMarkerGenes)):
            markerSetSize.append(len(looMarkerGenes[i]))
            for j in range(i + 1, len(looMarkerGenes)):
                symmDiff = looMarkerGenes[i].symmetric_difference(looMarkerGenes[j])
                diffMarkerSet.append(len(symmDiff))

        print(len(genomeIds), mean(diffMarkerSet), mean(markerSetSize))
        return (float(mean(diffMarkerSet)) / mean(markerSetSize)) <= stabilityThreshold

    def __patristicDist(self, tree, taxa1, taxa2):
        """Sum of branch lengths between two taxa via their most recent common ancestor."""
        mrca = tree.mrca(taxon_labels=[taxa1.taxon.label, taxa2.taxon.label])

        if mrca.parent_node is None:
            # MRCA is the root of the tree
            return taxa1.distance_from_root() + taxa2.distance_from_root()
        else:
            # climb from each taxon to the MRCA, summing edge lengths
            dist = taxa1.edge_length
            parentNode = taxa1.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            dist += taxa2.edge_length
            parentNode = taxa2.parent_node
            while parentNode != mrca:
                dist += parentNode.edge_length
                parentNode = parentNode.parent_node

            return dist

    def __distToNodePercentileTest(self, genomeNode, markerSetNode, leaves, percentileTest):
        """True if genome's distance to the marker-set node is below the given percentile of leaf distances."""
        distToBin = self.__distanceToAncestor(genomeNode, markerSetNode)

        distToLeaves = []
        for leaf in leaves:
            distToLeaves.append(self.__distanceToAncestor(leaf, markerSetNode))

        return distToBin < percentile(distToLeaves, percentileTest)

    def __selectMarkerSetNode(self, tree, genomeId, metadata, taxonToGenomeIds):
        """Determine lineage-specific marker set to use for assessing the given genome."""
        # read genomes removed from tree as a result of duplicate sequences
        duplicateSeqs = self.markerSetBuilder.readDuplicateSeqs()

        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # ascend tree to root looking for suitable marker set
        curNode = node.parent_node
        while curNode is not None:
            uniqueId = curNode.label.split('|')[0]

            genomeIds = set()
            for leaf in curNode.leaf_nodes():
                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                duplicateGenomes = duplicateSeqs.get(leaf.taxon.label, [])
                for dup in duplicateGenomes:
                    genomeIds.add(dup.replace('IMG_', ''))

            # remove genome (LOO-style analysis)
            print('Full:', len(genomeIds))
            genomeIds.difference_update([genomeId])
            print('LOO:', len(genomeIds))

            # remove all genomes from the same taxonomic group as the genome of interest
            taxon = metadata[genomeId]['taxonomy'][self.looRank]
            genomeIds.difference_update(taxonToGenomeIds[taxon])
            print('Rank reduced:', len(genomeIds))

            print(uniqueId)
            if len(genomeIds) > 10 and self.__stabilityTest(genomeIds):
                uidSelected = uniqueId
                break

            curNode = curNode.parent_node

        if curNode is None:
            # reached the root so use the universal marker set
            uidSelected = uniqueId

        return uidSelected

    def __bestMarkerSet(self, genomeId, simResults):
        """Get stats for best marker set (minimal combined comp+cont error)."""
        curBest = 1000
        bestUID = None
        for uid, results in simResults[genomeId].items():
            numDescendants, dComp, dCont = results
            if (dComp + dCont) < curBest:
                numDescendantsBest = numDescendants
                dCompBest = dComp
                dContBest = dCont
                bestUID = uid
                curBest = dComp + dCont

        return bestUID, numDescendantsBest, dCompBest, dContBest

    def __workerThread(self, tree, simResults, metadata, taxonToGenomeIds, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            uidSelected = self.__selectMarkerSetNode(tree, testGenomeId, metadata, taxonToGenomeIds)
            numDescendantsSelected, dCompSelected, dContSelected = simResults[testGenomeId][uidSelected]

            # find best marker set
            bestUID, numDescendantsBest, dCompBest, dContBest = self.__bestMarkerSet(testGenomeId, simResults)

            queueOut.put((testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        fout = open('./experiments/markerSetSelection.tsv', 'w')
        fout.write('Genome Id\tSelected UID\t# descendants\tSelected dComp\tSelected dCont\tBest UID\t# descendants\tBest dComp\tBest dCont\tdDescendants\tdComp\tdCont\n')

        itemsToProcess = 0

        dComps = []
        dConts = []
        dCompsPer = []
        dContsPer = []
        bestComp = []
        bestCont = []
        selectedComp = []
        selectedCont = []
        while True:
            testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest = writerQueue.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            itemsToProcess += 1
            statusStr = ' Finished processing %d of %d (%.2f%%) test genomes.' % (itemsToProcess, numTestGenomes, float(itemsToProcess) * 100 / (numTestGenomes))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            dComp = abs(dCompSelected - dCompBest)
            dCont = abs(dContSelected - dContBest)
            dDescendants = abs(numDescendantsSelected - numDescendantsBest)
            fout.write('%s\t%s\t%d\t%.4f\t%.4f\t%s\t%d\t%.4f\t%.4f\t%d\t%.4f\t%.4f\n' % (testGenomeId, uidSelected, numDescendantsSelected, dCompSelected, dContSelected, bestUID, numDescendantsBest, dCompBest, dContBest, dDescendants, dComp, dCont))

            dComps.append(dComp)
            dConts.append(dCont)

            # BUG FIX: the original divided by dCompBest directly, raising
            # ZeroDivisionError when the best completeness error is 0; guard
            # with the same floor already applied to dContBest below
            dCompsPer.append(dComp * 100.0 / max(dCompBest, 0.01))
            dContsPer.append(dCont * 100.0 / max(dContBest, 0.01))

            bestComp.append(dCompBest)
            bestCont.append(dContBest)
            selectedComp.append(dCompSelected)
            selectedCont.append(dContSelected)

        sys.stdout.write('\n')
        fout.close()

        print('')
        print(' General results:')
        print(' Best comp: %.2f +/- %.2f' % (mean(bestComp), std(bestComp)))
        print(' Best cont: %.2f +/- %.2f' % (mean(bestCont), std(bestCont)))
        print(' Selected comp: %.2f +/- %.2f' % (mean(selectedComp), std(selectedComp)))
        print(' Selected cont: %.2f +/- %.2f' % (mean(selectedCont), std(selectedCont)))
        print('')
        print(' Delta comp: %.2f +/- %.2f' % (mean(dComps), std(dComps)))
        print(' Delta cont: %.2f +/- %.2f' % (mean(dConts), std(dConts)))
        print(' Delta comp per error: %.1f +/- %.1f' % (mean(dCompsPer), std(dCompsPer)))
        print(' Delta cont per error: %.1f +/- %.1f' % (mean(dContsPer), std(dContsPer)))

    def __distanceToAncestor(self, leaf, ancestor):
        """Sum of edge lengths from a node up to (exclusive) the given ancestor."""
        dist = 0
        curNode = leaf
        while curNode != ancestor:
            dist += curNode.edge_length
            curNode = curNode.parent_node

        return dist

    def __bestNodeProperties(self, genomeId, tree, bestUID):
        """Tree statistics for the node carrying the best marker set for a genome."""
        # determine location of genome in tree
        node = tree.find_node_with_taxon_label('IMG_' + genomeId)

        # find node of best marker set
        curNode = node.parent_node
        nodesToBin = 0
        distanceToBin = node.edge_length
        distanceToLeaves = []
        while curNode is not None:
            uniqueId = curNode.label.split('|')[0]
            nodesToBin += 1

            if uniqueId == bestUID:
                for leaf in curNode.leaf_nodes():
                    if leaf != node:
                        dist = self.__distanceToAncestor(leaf, curNode)
                        distanceToLeaves.append(dist)
                break

            distanceToBin += curNode.edge_length
            curNode = curNode.parent_node

        return nodesToBin, distanceToBin, mean(distanceToLeaves)

    def __propertiesOfBestMarkerSets(self, tree, simResults):
        """Summarize tree properties of the best marker-set node over all test genomes."""
        numDescendants = []
        nodesToBin = []
        distanceToBin = []
        avgDistanceToLeaf = []
        percDiffs = []
        for genomeId in simResults:
            bestUID, numDescendantsBest, _, _ = self.__bestMarkerSet(genomeId, simResults)
            nodesToBinBest, distanceToBinBest, avgDistanceToLeafBest = self.__bestNodeProperties(genomeId, tree, bestUID)

            numDescendants.append(numDescendantsBest)
            nodesToBin.append(nodesToBinBest)
            distanceToBin.append(distanceToBinBest)
            avgDistanceToLeaf.append(avgDistanceToLeafBest)

            percDiff = abs(distanceToBinBest - avgDistanceToLeafBest) * 100 / distanceToBinBest
            percDiffs.append(percDiff)

        print(' # descendants: %.2f +/- %.2f' % (mean(numDescendants), std(numDescendants)))
        print(' # nodes to bin: %.2f +/- %.2f' % (mean(nodesToBin), std(nodesToBin)))
        print(' Distance to bin: %.2f +/- %.2f' % (mean(distanceToBin), std(distanceToBin)))

        distanceToBin = array(distanceToBin)
        avgDistanceToLeaf = array(avgDistanceToLeaf)
        print(' Distance to bin - average distance to leaf: %.2f +/- %.2f' % (mean(abs(distanceToBin - avgDistanceToLeaf)), std(abs(distanceToBin - avgDistanceToLeaf))))
        print(' Percent difference to average leaf distance: %.2f +/- %.2f' % (mean(percDiffs), std(percDiffs)))
        print('')

    def run(self, numThreads):
        """Compare selected marker-set nodes to the best possible ones over all test genomes."""
        # read reference tree
        print('\n Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # get all genomes with a given taxon label
        metadata = self.img.genomeMetadata()
        taxonToGenomeIds = defaultdict(set)
        for genomeId in metadata:
            for t in metadata[genomeId]['taxonomy']:
                taxonToGenomeIds[t].add(genomeId)

        # read simulation results
        print(' Reading simulation results.')
        simResults = defaultdict(dict)
        with open(self.simFile) as f:
            f.readline()  # skip header
            for line in f:
                lineSplit = line.split('\t')

                simId = lineSplit[0] + '-' + lineSplit[1] + '-' + lineSplit[2] + '-' + lineSplit[3]
                uid = lineSplit[5].split('|')[0].strip()
                numDescendants = int(lineSplit[6])
                comp = float(lineSplit[21])
                cont = float(lineSplit[23])

                simResults[simId][uid] = [numDescendants, comp, cont]

        #print ''
        #print '  Properties of best marker sets:'
        #self.__propertiesOfBestMarkerSets(tree, simResults)

        print(' Evaluating %d test genomes.' % len(simResults))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in simResults:
            workerQueue.put(testGenomeId)
        for _ in range(numThreads):
            workerQueue.put(None)  # one sentinel per worker

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, simResults, metadata, taxonToGenomeIds, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(simResults), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()
        for p in workerProc:
            p.join()

        # 9-tuple sentinel matches __writerThread's unpack
        writerQueue.put((None, None, None, None, None, None, None, None, None))
        writeProc.join()