Beispiel #1
0
def parse_lineages(tax_file, fmt='fasta', style=GTDB):
    """
    Build a Taxonomy from the lineages listed in tax_file.

    The lineage reader and the tree-insertion routine are chosen by
    ``style``: the GTDB pair when ``style == GTDB``, the phylodb pair
    otherwise. Every (accession, lineage) pair produced by the reader is
    inserted into a tree hanging off a fresh 'root' TaxNode, and the node
    returned by the insertion is remembered under its accession. All nodes
    are then numbered 1, 2, 3, ... in the order treeGenerator yields them.

    ``fmt`` is accepted but is not referenced anywhere in this function.

    Returns Taxonomy(id_map, None, None, tax_file, root).
    """
    logger.debug("Parsing %s", tax_file)

    # pick the style-specific helpers; only the chosen pair is looked up
    insert_lineage, read_lineages = (
        (add_gtdb_lineage_to_tree, generate_gtdb_lineages)
        if style == GTDB
        else (add_phylodb_lineage_to_tree, generate_phylodb_lineages)
    )

    root = TaxNode('root', None, None)
    tree = {'root': root}

    # grow the tree and remember which node each accession landed on
    id_map = {
        acc: insert_lineage(lineage, tree)
        for acc, lineage in read_lineages(tax_file)
    }

    logger.debug("Adding id numbers to %d nodes", len(tree))

    # number the nodes sequentially, starting at 1
    numbered = 0
    for numbered, tax_node in enumerate(treeGenerator(root), start=1):
        tax_node.id = numbered

    logger.debug("Added %d id numbers", numbered)

    return Taxonomy(id_map, None, None, tax_file, root)
def writeDumpFiles(rootNode, nodeStream, nameStream):
    """
    Write one line per tree node to each of the two given streams.

    For every node yielded by treeGenerator(rootNode):
      * nodeStream gets "<id> | <parent id> | <rank>" (tab/pipe delimited),
        where the root node is listed as its own parent;
      * nameStream gets "<id> | <name> | | scientific name".

    Ranks are translated before writing: 'domain' becomes 'superkingdom',
    any rank found in edl.taxon.ranks is kept, anything else is written as
    "no rank".
    """
    node_line = "%s\t|\t%s\t|\t%s\t\n"
    name_line = "%s\t|\t%s\t|\t\t|\tscientific name\t\n"

    for current in treeGenerator(rootNode):
        # the root has no parent to point at, so it references itself
        parent_source = current if current == rootNode else current.parent

        rank = current.rank
        if rank == 'domain':
            rank_label = 'superkingdom'
        elif rank in edl.taxon.ranks:
            rank_label = rank
        else:
            rank_label = "no rank"

        nodeStream.write(node_line % (current.id, parent_source.id, rank_label))
        nameStream.write(name_line % (current.id, current.name))
def buildSilvaTree(taxFile, fastaFile, logger):
    """
    Given a text taxonomy file (lineage <tab> id <tab> rank) and a fasta file
    with full lineages as the description:
    Return the root node from a taxonomy of edl.taxon.Node objects and a
    mapping from fasta record IDs to taxids.

    Parameters:
        taxFile: taxonomy table; column 0 is read as the lineage, column 1
            as the integer taxid and column 2 as the rank.
        fastaFile: fasta file handed to getOrgsFromSilvaFasta, which yields
            (record id, lineage) pairs.
        logger: logger-like object; its info() and warning() methods are
            called with lazy %-style arguments.

    Returns:
        (rootNode, taxmap) where taxmap maps each fasta record id to the
        ``id`` of the tree node its lineage resolved to.

    Raises:
        ValueError: from max() when the taxonomy file yields no taxids.
        StopIteration: when neither input produced any tree node.
    """
    rankMap = parseMapFile(taxFile, keyCol=0, valueCol=2, skipFirst=0)
    silvaTaxidMap = parseMapFile(taxFile, keyCol=0, valueCol=1,
                                 valueType=int, skipFirst=0)

    # create core of tree from taxonomy text file
    silvaTree = {}
    maxTaxid = max(silvaTaxidMap.values())
    for lineage, rank in rankMap.items():
        node = edl.silva.SilvaTaxNode.addToTreeFromString(
            lineage.strip("; "), silvaTree)
        # translate the rank name when rankMapping has an entry for it
        node.rank = rankMapping.get(rank, rank)
        node.ncbi_tax_id = silvaTaxidMap[lineage]
        if not isinstance(node.ncbi_tax_id, int):
            logger.warning("NCBI taxid is not an int: %s (%s)",
                           node.ncbi_tax_id, node.name)

    logger.info("Built tree of %d taxa with the largest ID of %d",
                len(silvaTree), maxTaxid)

    # Add leaves to tree from lineages in fasta file and build mapping
    taxmap = {}
    for hitid, lineage in getOrgsFromSilvaFasta(fastaFile):
        taxmap[hitid] = edl.silva.SilvaTaxNode.addToTreeFromString(
            lineage, silvaTree)

    logger.info("Added nodes from fasta file for a total of %d",
                len(silvaTree))

    # any node will do here: they all lead back to the same root
    rootNode = next(iter(silvaTree.values())).getRootNode()

    # make sure everything is OK: every node must end up with an integer id
    for node in treeGenerator(rootNode):
        if isinstance(node.id, int):
            continue
        if hasattr(node, "ncbi_tax_id"):
            # node came from the taxonomy file: reuse its taxid
            node.id = int(node.ncbi_tax_id)
        else:
            # node has no taxid attribute: hand out the next unused number
            maxTaxid += 1
            node.id = maxTaxid

    logger.info("Cleaning up taxmap")

    # change nodes in taxmap to IDs
    for hitid in taxmap:
        taxmap[hitid] = taxmap[hitid].id

    return (rootNode, taxmap)