Beispiel #1
0
def construct_tree(taxon, db, includespecies, taxalist=None):
    """Build a tree of ``node.Node`` objects for the taxonomy below ``taxon``.

    Args:
        taxon: String identifying the root. If it consists only of digits it
            is matched against ``taxonomy.ncbi_id``; otherwise it is matched
            against ``taxonomy.name`` (rows of rank 'species' excluded).
        db: Path of the sqlite database holding the ``taxonomy`` table.
        includespecies: When false, children whose ``node_rank`` is
            'species' are not queried.
        taxalist: Optional path of a text file, one entry per line. When
            given, the lines are passed to ``get_all_included`` and only
            children whose id is in the returned collection are kept.

    Returns:
        The root node, labelled ``<taxon>_<ncbi_id>``, or ``None`` when no
        row matches ``taxon``. If several rows match, every match is
        expanded but only the node of the last one is returned.
    """
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        includelist = None
        if taxalist is not None:
            # NOTE(review): lines are stored verbatim, i.e. including their
            # trailing newline -- confirm get_all_included expects that.
            with open(taxalist, "r") as tlf:
                tl = set(tlf)
            includelist = get_all_included(tl, c)

        stack = []
        done = set()
        rt = None
        nodes = {}  # id is key, value is node
        if taxon.isdigit():
            c.execute("select ncbi_id from taxonomy where ncbi_id = ?",
                      (taxon, ))
        else:
            c.execute(
                "select ncbi_id from taxonomy where name = ? and node_rank != 'species'",
                (taxon, ))
        for row in c:
            root_id = str(row[0])
            stack.append(root_id)
            rt = node.Node()
            rt.label = taxon + "_" + root_id
            rt.data["id"] = root_id
            nodes[root_id] = rt

        # The child query only depends on includespecies, so build it once.
        child_query = "select ncbi_id,name,name_class,edited_name from taxonomy where parent_ncbi_id = ?"
        if not includespecies:
            child_query += " and node_rank != 'species'"

        # Depth-first expansion; `done` guards against expanding an id twice
        # (an id is pushed once per matching row, e.g. one per name class).
        while stack:
            parent_id = stack.pop()
            if parent_id in done:
                continue
            done.add(parent_id)
            c.execute(child_query, (parent_id, ))
            for row in c:
                tid = str(row[0])
                if includelist is not None and tid not in includelist:
                    continue
                stack.append(tid)
                # Only the "scientific name" row creates the tree node.
                if str(row[2]) == "scientific name":
                    nn = node.Node()
                    nn.label = clean_name(str(row[3])) + "_" + tid
                    nn.data["id"] = tid
                    nodes[tid] = nn
                    nn.parent = nodes[parent_id]
                    nodes[parent_id].add_child(nn)
        return rt
    finally:
        # Release the database handle even if a query or file read fails.
        conn.close()
Beispiel #2
0
            os.system(cmd)
    return newalns


if __name__ == "__main__":
    # Exactly one argument is expected: the directory to start walking from.
    if len(sys.argv) != 2:
        # Call form prints the same text on Python 2 and Python 3.
        print("python " + sys.argv[0] + " startdir")
        sys.exit(0)

    cld = sys.argv[1]
    # take off the trailing slash if there is one
    # (endswith also copes with an empty argument, where cld[-1] would raise)
    if cld.endswith("/"):
        cld = cld[:-1]

    count = 0
    tree = node.Node()
    nodes = {}
    firstnode = True

    # build a tree from the directory
    for root, dirs, files in os.walk(cld, topdown=True):
        # Skip anything inside a "clusters" directory itself.
        if "clusters" in root:
            continue
        if "clusters" in dirs:
            # Nodes are keyed by the last path component of the directory.
            root_label = root.split("/")[-1]
            if firstnode:
                # The first directory containing "clusters" becomes the root.
                tree.label = root_label
                firstnode = False
                nodes[root_label] = tree
            nd = nodes[root_label]
            nd.data["dir"] = root
            nd.data["names"] = set()
Beispiel #3
0
def construct_tree_only_ids(baseid, c, ids):
    """Build a taxonomy tree below ``baseid`` restricted to ``ids``.

    Args:
        baseid: ncbi id of the root taxon; converted with ``str()`` before
            use, so it need not be a string.
        c: Database cursor used to query the ``taxonomy`` table.
        ids: Passed to ``get_all_included``; only children whose id is in
            the collection it returns are added to the tree.

    Returns:
        The root ``node.Node``. Nodes without children keep the label
        ``clean_name(name)``; every node with children gets an empty label,
        and non-root nodes with exactly one child are spliced out.
    """
    includelist = get_all_included(ids, c)

    root_id = str(baseid)
    nodes = {}  # id is key, value is node
    rt = node.Node()
    # str() on both sides: plain `baseid + "_"` fails for a non-string id.
    rt.label = root_id + "_" + root_id
    rt.data["id"] = root_id
    nodes[root_id] = rt

    stack = [root_id]
    done = set()
    # Depth-first expansion; `done` guards against expanding an id twice
    # (an id is pushed once per matching row).
    while stack:
        parent_id = stack.pop()
        if parent_id in done:
            continue
        done.add(parent_id)
        c.execute(
            "select ncbi_id,name,name_class,edited_name from taxonomy where parent_ncbi_id = ?",
            (parent_id, ))
        for row in c:
            tid = str(row[0])
            if includelist is not None and tid not in includelist:
                continue
            stack.append(tid)
            # Only the "scientific name" row creates the tree node.
            if str(row[2]) == "scientific name":
                nn = node.Node()
                nn.label = clean_name(str(row[1]))
                nn.data["id"] = tid
                nodes[tid] = nn
                nn.parent = nodes[parent_id]
                nodes[parent_id].add_child(nn)

    # Blank the label of every node that has children.
    for nd in rt.iternodes():
        if len(nd.children) != 0:
            nd.label = ""

    # Splice out unlabelled non-root nodes that have a single child. The
    # traversal is restarted after each change because the tree was modified
    # while being iterated.
    changed = True
    while changed:
        changed = False
        for nd in rt.iternodes():
            if nd.parent is not None and len(nd.children) == 1 and nd.label == "":
                par = nd.parent
                ch = nd.children[0]
                par.remove_child(nd)
                par.add_child(ch)
                changed = True
                break
    return rt
Beispiel #4
0
     sys.stderr.write("here\n")
 while len(diffnms) > 0:
     for j in diffnms:
         going = True
         cn = diffnds[j]
         while going:
             par = cn.parent
             pln = set(par.lvsnms()).intersection(rootnms)
             if len(pln) > 0:
                 amrca = tree_utils.get_mrca_wnms(pln,tree1)
                 #if VERBOSE:
                 #    sys.stderr.write("add at this node"+" "+par.get_newick_repr(False)+" "+amrca.get_newick_repr(False)+"\n")
                 if len(pln) == 1:
                     amrca = tree1.get_leaf_by_name(list(pln)[0])
                     #print "f",amrca.get_newick_repr(True)
                     nn = node.Node()
                     if EDITLEN:
                         nn.length = amrca.length/2.
                         nn.height = amrca.height+amrca.length/2.
                         amrca.length = nn.length
                     amrca.parent.add_child(nn)
                     amrca.parent.remove_child(amrca)
                     nn.add_child(amrca)
                     amrca = nn
                 for k in par.children:
                     if len(set(k.lvsnms()).intersection(rootnms)) > 0:
                         continue
                     else:
                         #tree_utils.set_heights(amrca)
                         #print "a",k.get_newick_repr(True),amrca.length,amrca.get_newick_repr(True),amrca.height
                         if EDITLEN:
def construct_tree_only_ids(baseid, c, ids):
    """Build a constraint tree below ``baseid`` restricted to ``ids``.

    Reads the module-level settings ``noinclude``, ``stopat``, ``useonly``
    and ``useonlylist``, which are defined outside this function.

    Args:
        baseid: ncbi id of the root taxon; converted with ``str()`` before
            use, so it need not be a string.
        c: Database cursor used to query the ``taxonomy`` table.
        ids: Passed to ``get_all_included``; only children whose id is in
            the collection it returns are considered. Children of a taxon
            that is itself in ``ids`` are not attached to the tree.

    Returns:
        The root ``node.Node`` after pruning. Nodes without children keep
        their ncbi id as label and nodes with children get an empty label.
        If the root ends up with exactly one child, that child is returned
        instead.
    """
    includelist = get_all_included(ids, c)

    root_id = str(baseid)
    nodes = {}  # id is key, value is node
    rt = node.Node()
    # str() on both sides: plain `baseid + "_"` fails for a non-string id.
    rt.label = root_id + "_" + root_id
    rt.data["id"] = root_id
    nodes[root_id] = rt

    stack = [root_id]
    done = set()
    # Depth-first expansion; `done` guards against expanding an id twice
    # (an id is pushed once per matching row).
    while stack:
        parent_id = stack.pop()
        if parent_id in done:
            continue
        done.add(parent_id)
        c.execute(
            "select ncbi_id,name_class,name,node_rank from taxonomy where parent_ncbi_id = ?",
            (parent_id, ))
        for row in c:
            tid = str(row[0])
            if includelist is not None and tid not in includelist:
                continue
            stack.append(tid)
            # Only "scientific name" rows take part in building the tree.
            if str(row[1]) != "scientific name":
                continue
            rank = str(row[3])
            if noinclude == False or rank != stopat:
                nn = node.Node()
                nn.label = tid
                nn.data["id"] = tid
                nn.data["rank"] = rank
                nodes[tid] = nn
                skip_constraint = parent_id in ids
                if not skip_constraint:
                    taxon_name = str(row[2])
                    skip_constraint = ("incertae" in taxon_name
                                       or "unidentified" in taxon_name
                                       or "unplaced" in taxon_name)
                if skip_constraint:
                    # should lose these and not constrain: the new node is
                    # dropped and the id is mapped onto the parent's node
                    nodes[tid] = nodes[parent_id]
                else:
                    nn.parent = nodes[parent_id]
                    nodes[parent_id].add_child(nn)
            elif noinclude == True and rank == stopat:
                # Taxa of the stop rank get no node of their own; their id
                # is mapped onto an existing node instead.
                if parent_id in ids:
                    nodes[tid] = nodes[parent_id].parent
                else:
                    nodes[tid] = nodes[parent_id]

    if useonly:  # remove any constraint that isn't in the list
        toremove = set()
        for nd in rt.iternodes():
            # Childless nodes and the root are never removed (the root has
            # no "rank" entry in its data).
            if len(nd.children) == 0 or nd == rt:
                continue
            if nd.data["rank"] not in useonlylist:
                toremove.add(nd)
        for nd in toremove:
            par = nd.parent
            if par is None:
                continue
            # Detach the node and hand its children to its former parent.
            par.remove_child(nd)
            nd.parent = None
            for ch in nd.children:
                par.add_child(ch)
                ch.parent = par

    # Blank the label of every node that has children.
    for nd in rt.iternodes():
        if len(nd.children) != 0:
            nd.label = ""

    # Splice out unlabelled non-root nodes that have a single child. The
    # traversal is restarted after each change because the tree was modified
    # while being iterated.
    changed = True
    while changed:
        changed = False
        for nd in rt.iternodes():
            if nd.parent is not None and len(nd.children) == 1 and nd.label == "":
                par = nd.parent
                ch = nd.children[0]
                par.remove_child(nd)
                par.add_child(ch)
                changed = True
                break

    if len(rt.children) == 1:
        rt = rt.children[0]
    return rt