def parse_lineages(tax_file, fmt='fasta', style=GTDB):
    """
    Build a Taxonomy object from the lineages listed in tax_file.

    tax_file is handed to the lineage generator selected by ``style``:
    the GTDB generator/tree-builder pair when ``style == GTDB``, the
    phylodb pair for any other value. ``fmt`` is accepted but is not
    referenced by this function.

    Every node reachable from the root is given a sequential integer id
    (starting at 1, in treeGenerator order). The returned Taxonomy is
    constructed from the accession -> node map, two None placeholders,
    the tax_file and the root node.
    """
    root = TaxNode('root', None, None)
    tree = {'root': root}
    logger.debug("Parsing %s", tax_file)

    # choose the (tree builder, lineage reader) pair for this style
    if style == GTDB:
        attach_lineage, read_lineages = \
            add_gtdb_lineage_to_tree, generate_gtdb_lineages
    else:
        attach_lineage, read_lineages = \
            add_phylodb_lineage_to_tree, generate_phylodb_lineages

    # grow the tree one lineage at a time, remembering which node each
    # accession ended up at (a repeated accession keeps its last node)
    id_map = {
        acc: attach_lineage(lineage, tree)
        for acc, lineage in read_lineages(tax_file)
    }

    logger.debug("Adding id numbers to %d nodes", len(tree))

    # number the nodes; the counter stays 0 if the generator yields nothing
    numbered = 0
    for numbered, tax_node in enumerate(treeGenerator(root), start=1):
        tax_node.id = numbered
    logger.debug("Added %d id numbers", numbered)

    return Taxonomy(id_map, None, None, tax_file, root)
def writeDumpFiles(rootNode, nodeStream, nameStream):
    """
    Write one row per tree node to each of two output streams.

    For every node produced by treeGenerator(rootNode):
      * nodeStream gets "id | parent id | rank"
      * nameStream gets "id | name | (empty) | scientific name"

    The root node is written as its own parent. A rank of 'domain' is
    written as 'superkingdom'; any other rank is written unchanged if it
    is in edl.taxon.ranks and as "no rank" otherwise.
    """
    node_row = "%s\t|\t%s\t|\t%s\t\n"
    name_row = "%s\t|\t%s\t|\t\t|\tscientific name\t\n"

    for taxon in treeGenerator(rootNode):
        taxon_id = taxon.id
        taxon_name = taxon.name

        # the root has no parent to point at, so it points at itself
        parent_id = taxon.id if taxon == rootNode else taxon.parent.id

        rank = taxon.rank
        if rank == 'domain':
            dump_rank = 'superkingdom'
        else:
            dump_rank = rank if rank in edl.taxon.ranks else "no rank"

        nodeStream.write(node_row % (taxon_id, parent_id, dump_rank))
        nameStream.write(name_row % (taxon_id, taxon_name))
def buildSilvaTree(taxFile, fastaFile, logger):
    """
    Given a text taxonomy file (lineage <tab> id <tab> rank) and a fasta file
    with full lineages as the description:
    Return the root node of a tree built with
    edl.silva.SilvaTaxNode.addToTreeFromString and a mapping from fasta
    record IDs to taxids.

    Args:
        taxFile: taxonomy table; read twice with parseMapFile, keyed on
            column 0, once for the rank (column 2) and once for the taxid
            (column 1, converted with int).
        fastaFile: passed to getOrgsFromSilvaFasta, which is iterated as
            (record id, lineage) pairs.
        logger: logging.Logger-style object; its warning() and info()
            methods are called with lazy %-style arguments.

    Returns:
        (rootNode, taxmap) where taxmap maps each fasta record id to the
        integer id of its node.
    """
    rankMap = parseMapFile(taxFile, keyCol=0, valueCol=2, skipFirst=0)
    silvaTaxidMap = parseMapFile(taxFile,
                                 keyCol=0,
                                 valueCol=1,
                                 valueType=int,
                                 skipFirst=0)

    # create core of tree from taxonomy text file
    silvaTree = {}
    # IDs handed out later for nodes without a taxid start above this value
    maxTaxid = max(silvaTaxidMap.values())
    for (lineage, rank) in rankMap.items():
        node = edl.silva.SilvaTaxNode.addToTreeFromString(
            lineage.strip("; "), silvaTree)
        # translate the rank through rankMapping, keeping unknown ranks as is
        node.rank = rankMapping.get(rank, rank)
        node.ncbi_tax_id = silvaTaxidMap[lineage]
        if not isinstance(node.ncbi_tax_id, int):
            logger.warning("NCBI taxid is not an int: %s (%s)",
                           node.ncbi_tax_id, node.name)

    logger.info("Built tree of %d taxa with the largest ID of %d",
                len(silvaTree), maxTaxid)

    # Add leaves to tree from lineages in fasta file and build mapping
    taxmap = {}
    for (hitid, lineage) in getOrgsFromSilvaFasta(fastaFile):
        node = edl.silva.SilvaTaxNode.addToTreeFromString(lineage, silvaTree)
        taxmap[hitid] = node

    logger.info("Added nodes from fasta file for a total of %d",
                len(silvaTree))

    # any node leads to the root, so start from an arbitrary one
    rootNode = next(iter(silvaTree.values())).getRootNode()

    # make sure every node ends up with an integer id: reuse the taxid
    # from the taxonomy file when the node has one, otherwise allocate
    # the next unused number above the largest taxid seen
    for node in treeGenerator(rootNode):
        if not isinstance(node.id, int):
            if hasattr(node, "ncbi_tax_id"):
                node.id = int(node.ncbi_tax_id)
            else:
                maxTaxid += 1
                node.id = maxTaxid

    logger.info("Cleaning up taxmap")

    # change nodes in taxmap to IDs
    taxmap = {hitid: node.id for hitid, node in taxmap.items()}

    return (rootNode, taxmap)