Example #1
0
# Report how the eukaryote sequences are arranged in the tree named by sys.argv[1].
# `tree`, `eukaryote_seqs` and `target_leaf` are defined earlier in the script (not shown here).
#
# Case 1: there is only one eukaryote sequence in the tree (a singleton).
if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate
    print sys.argv[1] + "\tSingleton"
# Case 2: several eukaryote sequences -- test whether they are monophyletic in the tree.
#print len(eukaryote_seqs)
else:
    try:
        # The first element of the check_monophyly() result is used as the monophyly flag.
        answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name")
        if answer[0] == True:
            # Monophyletic: report the number of eukaryote sequences and the support
            # value stored on their common ancestor.
            ca = tree.get_common_ancestor(eukaryote_seqs)
            print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) 
        elif answer[0] == False:
            # Not monophyletic: split the nodes returned for domain == 'Eukaryote' into
            # the one containing target_leaf and all the others.
            mono_groups = []
            target_group = ''
            for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"):
                if target_leaf in node:
                    target_group = node
                else:
                    mono_groups.append(node)
            # NOTE(review): if no group contains target_leaf, target_group stays '' and
            # this length is 0 -- confirm that case cannot happen.
            size_target_group = len(target_group)
            #get distance
            # Find the other eukaryote group closest to the target group; the large
            # constant is only a starting value for the minimum search.
            shortest_distance = 999999999999999.0
            closest_other_group = ''
            for subtree in mono_groups:
                # topology_only=True is passed, so presumably branch lengths are ignored
                # (see the ete3 get_distance documentation).
                curr_distance = tree.get_distance(target_group, subtree, topology_only=True)
                if curr_distance < shortest_distance:
                    shortest_distance = curr_distance
                    closest_other_group = subtree
            #attempt to calculate distance on a version of the tree in which branches below some support threshold have been deleted
#            closest_leaves = []
Example #2
0
# Read the ML tree given as the first command-line argument, annotate every leaf with
# its taxonomic label, and record how many clades each label forms and how large
# those clades are (to report at the end).
# `parse_taxonomy`, `name_to_tax_info`, `taxa_names`, `labels` and `target_label`
# are defined elsewhere in the script (not shown here).
ml_tree = Tree(sys.argv[1])
for leaf in ml_tree:
    taxonomy = parse_taxonomy(leaf.name)
    # Remember the parsed taxonomy by leaf name so later code can look it up by name.
    name_to_tax_info[leaf.name] = taxonomy
    taxa_names.append(leaf.name)
    leaf.add_feature("tax", taxonomy[target_label])
    # `labels` is used as a set: only its keys matter, the value 1 is a placeholder.
    labels[taxonomy[target_label]] = 1
groups = labels.keys()

# Compute the number of clades per label in the ML tree, and their sizes.
ML_groups = defaultdict(
    list
)  # label -> list of clade sizes; len(list) is the number of clades for that label in the ML tree
for label in groups:
    # One node is returned per clade whose leaves all carry this label.
    for node in ml_tree.get_monophyletic(values=[label], target_attr="tax"):
        # Count the leaves by iterating over the node.
        size_clade = 0
        for leaf in node:
            size_clade += 1
        ML_groups[label].append(size_clade)

treeNum = -1
tree_sample_handle = open(sys.argv[2])
for line in tree_sample_handle:
    treeNum += 1
    tree = Tree(line.rstrip())
    for leaf in tree:
        tax = name_to_tax_info[
            leaf.name]  #this should set up taxonomy correctly...
        leaf.add_feature(
            "tax", tax[target_label]
def extract_subtrees(tree,
                     ali,
                     target_species,
                     ref_species,
                     treedir,
                     outali,
                     olore,
                     oaore,
                     species_groups,
                     restrict_sp=None):
    """
    Extract subtrees from a full gene tree and build their AORe and LORe topologies.

    Each retained subtree yields two constrained trees (AORe and LORe) written in
    newick format, plus the matching multiple alignment written in fasta. Nothing is
    written when either topology could not be built or when the two are identical.

    Args:
        tree (str): tree file in nhx format for the considered gene family
        ali (str): alignment fasta file for the considered gene family
        target_species (list of str): duplicated+outgroup species
        ref_species (list of str): outgroup(s) species
        treedir (str): directory with SCORPiOs constrained gene tree topologies
        outali (str): output directory for the alignment
        olore (str): output directory for the lore topology (should exist)
        oaore (str): output directory for the aore topology (should exist)
        species_groups (list of str): groups of species for the LORe topology
        restrict_sp (list of str, optional): restrict the set of duplicated species to this set
    """

    gene_tree = Tree(tree)

    # Tag the leaves of target species (outgroup + duplicated species; the attribute is
    # called "duplicated" for historical reasons) and walk the clades made only of them.
    tag_duplicated_species(gene_tree.get_leaves(), target_species)

    for clade in gene_tree.get_monophyletic(values=["Y"], target_attr="duplicated"):
        work_tree = clade.copy()

        if restrict_sp:
            kept_leaves = [
                leaf for leaf in work_tree.get_leaves() if leaf.S in restrict_sp
            ]
            # Subtrees with 3 or fewer genes from the restricted set are skipped.
            if len(kept_leaves) <= 3:
                continue
            work_tree.prune(kept_leaves, preserve_branch_length=True)

        if not check_copy_number(work_tree, ref_species):
            continue

        # Build the constrained AORe tree topology.
        gene_list = {leaf.name: leaf.S for leaf in work_tree.get_leaves()}
        gene_names = list(gene_list)

        # Positions of the genes that have a constrained tree file in treedir.
        constrained = [
            position for position, gene in enumerate(gene_names)
            if os.path.isfile(treedir + '/C_' + gene + ".nh")
        ]

        if len(constrained) > 1:
            # More than one constrained tree for the same subtree: skip it.
            continue

        if constrained:
            outgr_gene = gene_names[constrained[0]]
            treefile = treedir + '/C_' + outgr_gene + ".nh"
            ctree_aore = get_scorpios_aore_tree(gene_list, treefile,
                                                ref_species, outgr_gene)
        else:
            dup_sp = set(target_species).difference(ref_species)
            ctree_aore, outgr_gene = check_aore_consistent_tree(
                work_tree, ref_species, dup_sp)

        # Build the constrained LORe tree topology.
        ctree_lore, _ = make_tree_from_groups(work_tree.get_leaves(),
                                              species_groups)

        # Both topologies must have been built successfully.
        if ctree_aore is None or ctree_lore is None:
            continue

        lore_names = {leaf.name for leaf in ctree_lore.get_leaves()}
        aore_names = {leaf.name for leaf in ctree_aore.get_leaves()}
        assert lore_names == aore_names, f"{ctree_aore}, {ctree_lore}"

        # Compare in both directions and keep the larger 'source_edges_in_ref' value.
        aore_vs_lore = ctree_aore.compare(ctree_lore)
        lore_vs_aore = ctree_lore.compare(ctree_aore)
        comp_res = max(aore_vs_lore['source_edges_in_ref'],
                       lore_vs_aore['source_edges_in_ref'])

        # A value of 1 means the two topologies are the same: nothing to write.
        if comp_res == 1:
            continue

        ctree_lore.write(outfile=olore + '/' + outgr_gene + '.nh',
                         format=9,
                         features=["D"])
        ctree_aore.write(outfile=oaore + '/' + outgr_gene + '.nh',
                         format=9,
                         features=["D"])

        leaves = [leaf.name for leaf in work_tree.get_leaves()]
        seq = ut.get_subali(ali, leaves)
        ut.write_fasta(seq, outali + '/' + outgr_gene + '.fa')
Example #4
0
                        # Rebuild the tree from its own newick text (format=1, carrying
                        # all_features and the root node) so the code below works on a
                        # fresh Tree object.
                        t = Tree(
                            t.write(format=1,
                                    features=all_features,
                                    format_root_node=True))
                        # Repeat the search for every WGD listed in wgds_dict.
                        for wgd in wgds_dict:

                            leaves = t.get_leaves()
                            # A single-leaf tree has no internal node to inspect.
                            if len(leaves) == 1:
                                continue

                            # Find all monophyletic teleost groups: tag the leaves of the
                            # species listed for this WGD first.
                            tag_duplicated_species(leaves, wgds_dict[wgd])

                            # All clades of teleost genes;
                            # by definition corrected subtrees will only contain dup. sp
                            subtrees = t.get_monophyletic(
                                values=["Y"], target_attr="duplicated")

                            for subtree in subtrees:

                                # A lone leaf has no children to compare.
                                if subtree.is_leaf():
                                    continue

                                # If there are corrected leaves on each side of the node,
                                # the node counts as corrected.
                                # NOTE(review): the unpacking assumes exactly two children.
                                child1, child2 = subtree.get_children()

                                # Keep only the correction tags that mention this WGD.
                                tags_wgd = [
                                    i for i in COR_TAGS_ALL if wgd in i
                                ]

                                ok_child1 = corr_tag_below_node(
                                    child1, tags_wgd)
Example #5
0
def _branch_style(fgcolor="#000000", size=0, bgcolor=None):
    """Build the NodeStyle used for the branches drawn by get_example_tree.

    Branch lines are black with width 4; ``bgcolor`` is only set when one is given.
    """
    style = NodeStyle()
    if bgcolor is not None:
        style["bgcolor"] = bgcolor
    style["fgcolor"] = fgcolor
    style["size"] = size
    style["vt_line_color"] = "#000000"
    style["hz_line_color"] = "#000000"
    style["vt_line_width"] = 4
    style["hz_line_width"] = 4
    # Line-type value kept unchanged (the earlier note here listed
    # 0 solid, 1 dashed, 2 dotted).
    style["vt_line_type"] = 8
    style["hz_line_type"] = 8
    return style


def _highlight_monophyletic_clades(tt, report):
    """Give every single-colour clade of ``tt`` a background and list it in ``report``.

    For each clade definition in ``all_clad`` the colour at the same position in
    ``collor_list`` is used. Clades with a single leaf are ignored.
    """
    for index, _clad in enumerate(all_clad):
        collor = collor_list[index]
        for clade in tt.get_monophyletic(values=[collor], target_attr="color"):
            members = list(clade)  # iterating a node walks its leaves
            if len(members) > 1:
                tt.get_common_ancestor(members).set_style(_branch_style(bgcolor=collor))
                for leaf in members:
                    report.write(str(leaf.name)+"\t"+" - "+collor+"\n")
                report.write("\n")


def _draw_example_tree(File, adres, file_out_supliment, node_file):
    """Do the actual work of get_example_tree with the two report files already open."""
    # --- Read all domains: one palette entry per distinct domain name ------------
    domain_all_legend = {}
    file_all_domen = os.listdir(adres+"/for_pic/1_tree_nwk/")
    # The macOS folder metadata file is only present on some machines.
    if ".DS_Store" in file_all_domen:
        file_all_domen.remove(".DS_Store")
    file_all_domen.sort()
    i = 0
    for file_domain in file_all_domen:
        with open(adres+"/for_pic/3_domain/"+file_domain, 'r') as handle:
            for line in handle:
                line_ = line.split("\t")
                try:
                    if line_[2] not in domain_all_legend:
                        domain_all_legend[line_[2]] = dic_domain_pic_pic[i]
                        i += 1
                    # Wrap around once the palette is exhausted (">=": the index
                    # equal to the palette size is already out of range).
                    if i >= len(dic_domain_pic_pic):
                        i = 0
                except (IndexError, KeyError):
                    # Line with fewer than three columns, or empty palette: skip it.
                    continue

    # --- Load the tree -----------------------------------------------------------
    with open(adres+"/for_pic/1_tree_nwk/"+File, 'r') as handle:
        tt = Tree(handle.read(), format=0)

    # Every node gets the same plain black style; names longer than one character
    # are listed in the node file.
    style = _branch_style()
    for node in tt.traverse("levelorder"):
        node.img_style = style
        if len(node.name) > 1:
            node_file.write(node.name+"\n")

    # --- Ancestor search: first ancestor_grop prefix that matches a node ---------
    ancestor1 = ""
    for element in ancestor_grop:
        match = next(
            (str(node.name) for node in tt.traverse("postorder")
             if str(node.name).startswith(element)
             and not str(node.name).startswith("PPE")),
            None)
        if match is not None:
            ancestor1 = match
            break
    if not (ancestor1 == ""):
        tt.set_outgroup(ancestor1)
        print(str(ancestor1)+" - предок")
        file_out_supliment.write(str(ancestor1)+"\t"+" - предполагаемый корень"+"\n")
    else:
        print("Не нашел предка")

    # --- Add colours to the clades -----------------------------------------------
    file_out_supliment.write("\n\n\n Выявленные клады\n")
    for leaf in tt:
        node_name = str(leaf.name)
        for index, clad in enumerate(all_clad):
            collor = collor_list[index]
            for element in clad:
                if node_name.startswith(element):
                    leaf.add_features(color=collor)
    # Collect the monophyletic colours.
    _highlight_monophyletic_clades(tt, file_out_supliment)

    # --- Add the domain annotation -----------------------------------------------
    file_out_supliment.write("\n\n\n Легенда доменного состава\n")
    # Both input files are read once here instead of once per tree node.
    sequences_by_id = {}
    with open(adres+"/for_pic/2_MSA/"+File) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences_by_id.setdefault(str(record.id), str(record.seq))
    with open(adres+"/for_pic/3_domain/"+File, 'r') as handle:
        domain_rows = [line.split("\t") for line in handle]

    dic_seq = {}
    dic_domain = {}
    list_legend_domain3 = []  # domains of this tree, in order of first appearance
    for node in tt.traverse("postorder"):
        node_name = str(node.name)
        # Protein sequences (their length sets the width of the drawn line).
        if node_name in sequences_by_id:
            dic_seq.setdefault(node_name, sequences_by_id[node_name])
        # Domain composition.
        motifs = []
        for row in domain_rows:
            if row[0] != node_name:
                continue
            domain = row[2]
            first_seen = domain not in list_legend_domain3
            if first_seen:
                list_legend_domain3.append(domain)
            motifs.append([int(row[3]), int(row[4]), "()", None, 15, "black",
                           domain_all_legend[domain], "arial|9|black|"+domain])
            dic_domain.setdefault(node_name, motifs)
            if first_seen:
                # Each domain is listed once in the supplement legend.
                file_out_supliment.write(domain+"\t"+domain_all_legend[domain]+"\n")

    for element in dic_domain:
        try:
            seqFace = SeqMotifFace(seq=dic_seq[element], motifs=dic_domain[element], seq_format="line")
            (tt & element).add_face(seqFace, 0, "aligned")
        except Exception:
            # Fall back to a plain sequence line without motifs.
            seqFace = SeqMotifFace(seq=dic_seq[element], seq_format="line", gapcolor="red")
            (tt & element).add_face(seqFace, 0, "aligned")
            print("except")

    # --- Draw the legend ---------------------------------------------------------
    circular_style = TreeStyle()
    circular_style.show_leaf_name = False
    circular_style.show_branch_length = True
    circular_style.show_branch_support = True
    circular_style.scale = 75
    circular_style.tree_width = 50

    # Warn about every domain-table line that has no domain column.
    for row in domain_rows:
        if len(row) < 3:
            print("не понял что это за домен")

    # Read the domain legend; characters that are special in newick are replaced
    # by "_" and the trailing newline is dropped.
    legend_chars = str.maketrans(" (),:.", "______", "\n")
    file_domain_legend2 = {}
    with open(adres+"/domain_legend.txt", 'r') as handle:
        for line in handle:
            line_ = line.split("\t")
            file_domain_legend2.setdefault(line_[0], line_[1].translate(legend_chars))

    # Legend with every known domain: a star tree with one leaf per domain.
    tree_domen_all = Tree("("+",".join(file_domain_legend2.values())+");")
    for element, legend_node_name in file_domain_legend2.items():
        try:
            motif = [[10, 90, "()", None, 15, "black", domain_all_legend[element], "arial|9|black|"+element]]
            seqFace = SeqMotifFace(seq=seq_seq, motifs=motif, seq_format="line")
        except Exception:
            # Domain without a palette entry (or face could not be built): skip it.
            continue
        try:
            (tree_domen_all & legend_node_name).add_face(seqFace, 0, "aligned")
        except Exception:
            print("не нашел узел")
    circular_style.layout_fn = layout
    tree_domen_all.render(adres+"/out_legend_all.png", tree_style=circular_style)

    # Legend restricted to the domains present in this tree.
    with open(adres+"/123123123.txt", 'w') as file_domain_out:
        tree_domen = Tree(
            "("+",".join(file_domain_legend2[element] for element in list_legend_domain3)+");")
        for element in list_legend_domain3:
            file_domain_out.write(element+"\n")
            motif = [[10, 90, "()", None, 15, "black", domain_all_legend[element], "arial|9|black|"+element]]
            try:
                seqFace = SeqMotifFace(seq=seq_seq, motifs=motif, seq_format="line")
                (tree_domen & file_domain_legend2[element]).add_face(seqFace, 0, "aligned")
            except Exception:
                # Best effort: a legend entry that cannot be drawn is left out.
                continue
    circular_style.layout_fn = layout
    tree_domen.render(adres+"/out_legend/"+File[:-4]+".png", tree_style=circular_style)

    # --- Remove part of the nodes ------------------------------------------------
    # NOTE(review): nodes are deleted while the tree is being traversed, and nodes
    # that already received a face above receive a second one here; both are kept
    # exactly as before.
    for node in tt.traverse("postorder"):
        try:
            seqFace = SeqMotifFace(seq=dic_seq[str(node.name)], motifs=dic_domain[str(node.name)], seq_format="line")
            (tt & node.name).add_face(seqFace, 0, "aligned")
            # Nodes are dropped unless their name is short or starts with a
            # save_node prefix; a dell_node prefix overrides both.
            keep = len(node.name) < 2
            for element_save in save_node:
                if (node.name).startswith(element_save):
                    keep = True
            for element_dell in dell_node:
                if (node.name).startswith(element_dell):
                    keep = False
            if not keep:
                node.delete()
        except Exception:
            # No sequence or no domains recorded for this node.
            if len(node.name) > 0:
                seqFace = SeqMotifFace(seq=dic_seq[str(node.name)], seq_format="line", gapcolor="red")
                (tt & node.name).add_face(seqFace, 0, "aligned")
                node.delete()

    # --- Special points: nodes listed in the colour file get a red marker --------
    with open(adres+"/for_pic/4_color_node/out_list_gene2.txt", 'r') as handle:
        node_color = {line.replace("\n", "") for line in handle}
    for node in tt.traverse("postorder"):
        if node.name in node_color:
            node.set_style(_branch_style(fgcolor="Red", size=9))

    # Collect the monophyletic colours again on the pruned tree (the clade list is
    # written to the supplement a second time, as before).
    _highlight_monophyletic_clades(tt, file_out_supliment)

    return tt


def get_example_tree(File):
    """Build the annotated, coloured and pruned tree for one input file.

    ``File`` is a file name that is looked up under several directories of the
    current working directory: ``for_pic/1_tree_nwk`` (newick tree),
    ``for_pic/2_MSA`` (fasta sequences) and ``for_pic/3_domain`` (tab-separated
    domain table). The function also reads ``domain_legend.txt`` and
    ``for_pic/4_color_node/out_list_gene2.txt``, and relies on module-level
    settings (``dic_domain_pic_pic``, ``ancestor_grop``, ``all_clad``,
    ``collor_list``, ``save_node``, ``dell_node``, ``seq_seq``, ``layout``).

    Side effects: writes ``out_spliment/<File>``, ``node/<File>``,
    ``123123123.txt``, ``out_legend_all.png`` and ``out_legend/<File minus 4
    characters>.png``, and prints progress messages.

    Changes compared with the previous version: every opened file is now closed
    (the old ``.close`` attribute accesses never called the method), a missing
    ``.DS_Store`` entry no longer raises, the domain palette wraps around instead
    of silently dropping domains once it is exhausted, an unused palette lookup
    that could raise was removed, and the fasta and domain files are read once
    instead of once per tree node.

    Returns:
        The ete3 tree after rooting, colouring, face decoration and node removal.
    """
    adres = os.getcwd()
    with open(adres+"/out_spliment/"+File, 'w') as file_out_supliment, \
            open(adres+"/node/"+File, 'w') as node_file:
        return _draw_example_tree(File, adres, file_out_supliment, node_file)
Example #6
0
def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes, out):
    """
    Scan a gene tree forest for orthologs between duplicated species and the outgroup.

    Three files are written: `out` lists genes without phylogenetic ortholog together
    with candidate outgroup genes, and two companion files hold high-confidence
    orthologs and paralogs, used to optimize the synteny support threshold for
    orthology calls.

    Args:
        forest (str): Name of the gene trees forest file
        duplicated_sp (list of str): List of all duplicated species for the considered WGD
        outgroup (str): Non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): All gene positions for each species
        out (str): Output file to write genes without phylogenetic orthologs

    Returns:
        dict: Orthologs of outgroup genes in each duplicated species

    Note (FIXME): Written to work within scorpios as orthologs and paralogs file names are derived
                  from output file patterns, assuming it contains an '_'.
    """

    ortho = {species: {} for species in duplicated_sp}

    # The companion file names replace the part of the output file name before the first '_'.
    out_prefix = out.split("/")[-1].split('_')[0]
    orthofile = out.replace(out_prefix, "orthologs")
    parafile = out.replace(out_prefix, "paralogs")

    def positioned_genes(clade_leaves, species):
        """Positions of the genes of `species` found among `clade_leaves`."""
        names = [leaf.name for leaf in clade_leaves if leaf.S == species]
        return get_genes_positions(names, species, dict_genes)

    def describe(genes, species):
        """Format genes as 'name_species|chromosome|index' strings."""
        species_tag = species.replace(' ', '.')
        return ['|'.join((g.name + '_' + species_tag, str(g.chromosome), str(g.index)))
                for g in genes]

    with open(out, 'w') as outfile, open(forest, 'r') as infile, open(parafile, 'w') as out_para,\
         open(orthofile, 'w') as out_ortho:

        sys.stderr.write("Browsing gene trees for orthologies with the outgroup...\n")

        for newick in ut.read_multiple_objects(infile):

            tree = Tree(newick.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = list(tree.get_leaves())

            # Tag genes of duplicated species, then walk the clades made only of them.
            tag_duplicated_species(leaves, duplicated_sp)
            subtrees = tree.get_monophyletic(values=["Y"], target_attr="duplicated")

            outgroup_genes = [leaf for leaf in leaves if leaf.S == outgroup]

            for subtree in subtrees:

                # (size of the LCA clade, branch distance, outgroup gene) per candidate
                candidates = []
                subtree_leaves = subtree.get_leaves()

                # First pass: outgroup genes separated from the clade by a speciation
                # (or dubious duplication) node.
                for out_gene in outgroup_genes:
                    lca = tree.get_common_ancestor(subtree, out_gene)
                    topo_distance = len(node2leaves[lca])
                    if org.is_speciation(lca):
                        branch_distance = tree.get_distance(subtree, out_gene)
                        candidates.append((topo_distance, branch_distance, out_gene))

                # Second pass, only without a 'true' ortholog: accept outgroup genes whose
                # LCA with the clade holds nothing but outgroup and duplicated-species genes.
                if not candidates:
                    for out_gene in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, out_gene)
                        if all(gene.duplicated == "Y" or gene.S == outgroup
                               for gene in lca.get_leaves()):
                            topo_distance = len(node2leaves[lca])
                            branch_distance = tree.get_distance(subtree, out_gene)
                            candidates.append((topo_distance, branch_distance, out_gene))

                if candidates:
                    # Best candidate: smallest LCA clade, then shortest branch distance.
                    candidates.sort(key=lambda x: (x[0], x[1]))
                    outgroup_gene = candidates[0][2].name

                    content = []
                    for species in duplicated_sp:
                        genes = positioned_genes(subtree_leaves, species)
                        ortho[species].setdefault(outgroup_gene, []).extend(genes)
                        content += describe(genes, species)

                    all_ortho = [candidate[2].name for candidate in candidates]
                    paralogs = [gene.name for gene in outgroup_genes
                                if gene.name not in all_ortho]

                    if paralogs:
                        paralog = random.choice(paralogs)

                        if paralog in dict_genes[outgroup]\
                           and outgroup_gene in dict_genes[outgroup]:

                            tmp_dict = dict_genes[outgroup]
                            family = ' '.join(content)+'\t'

                            out_ortho.write(family)
                            out_ortho.write('|'.join((str(outgroup_gene),
                                                      str(tmp_dict[outgroup_gene].chromosome),
                                                      str(tmp_dict[outgroup_gene].index),
                                                      str(0), str(0)))+'\n')

                            out_para.write(family)
                            out_para.write('|'.join((str(paralog),
                                                     str(tmp_dict[paralog].chromosome),
                                                     str(tmp_dict[paralog].index),
                                                     str(0), str(0)))+'\n')

                # No ortholog found: write the genes without ortholog along with all
                # outgroup genes of the tree (potential candidates for orthology).
                elif any(gene.name in dict_genes[outgroup] for gene in outgroup_genes):

                    missed_genes = []
                    for species in duplicated_sp:
                        missed_genes += describe(positioned_genes(subtree_leaves, species),
                                                 species)

                    if missed_genes:
                        outfile.write(' '.join(missed_genes)+'\t')

                        outgr_genes = [gene.name for gene in outgroup_genes]

                        # Pairs of outgroup genes at topological distance 1.
                        in_paralogs = [
                            first+'|'+second
                            for first, second in itertools.combinations(outgr_genes, 2)
                            if tree.get_distance(first, second, topology_only=True) == 1
                        ]

                        outgr_write = []
                        genome = dict_genes[outgroup]
                        for gene in outgr_genes:
                            if gene not in genome:
                                continue
                            lca = tree.get_common_ancestor(subtree, gene)
                            branch_distance = tree.get_distance(subtree, gene)
                            topo_distance = len(node2leaves[lca])
                            outgr_write.append('|'.join((str(gene),
                                                         str(genome[gene].chromosome),
                                                         str(genome[gene].index),
                                                         str(topo_distance),
                                                         str(branch_distance))))

                        outfile.write(' '.join(outgr_write)+'\t'+' '.join(in_paralogs)+'\n')

    sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")

    return ortho
Example #7
0
from ete3 import Tree


def find_first_node(tree, name):
    """Return the first node of ``tree`` called ``name``, or None if there is none.

    ``tree & name`` raises when the name is absent; this lookup lets the script
    keep running instead.
    """
    matches = tree.search_nodes(name=name)
    return matches[0] if matches else None


t =  Tree("((((((4, e), i), o),h), u), ((3, 4), (i, june)));")
# we annotate the tree using external data
colors = {"a":"red", "e":"green", "i":"yellow",
          "o":"black", "u":"purple", "4":"green",
          "3":"yellow", "1":"white", "5":"red",
          "june":"yellow"}
# Leaves without an entry in the table get the placeholder colour "none".
for leaf in t:
    leaf.add_features(color=colors.get(leaf.name, "none"))
print(t.get_ascii(attributes=["name", "color"], show_internal=False))


print("Green-yellow clusters:")
# And obtain clusters exclusively green and yellow
for node in t.get_monophyletic(values=["green", "yellow"], target_attr="color"):
   print(node.get_ascii(attributes=["color", "name"], show_internal=False))


#%%

# Finding and saving nodes by their names.
# NOTE(review): no leaf of the tree above is called "C", "H" or "I" (its leaves are
# lowercase letters, digits and "june"), so these are None here -- confirm which
# names were intended.
C = find_first_node(t, "C")
H = find_first_node(t, "H")
I = find_first_node(t, "I")



#%%