Code example #1
File: files_utils.py  Project: DessimozLab/HogProf
def get_tree(taxa, genomes, savename=None):
    """
    Generates a master taxonomic tree from the NCBI taxonomy, grafting on any
    genomes that are missing from the NCBI topology via Entrez lookups.
    :param taxa: iterable of NCBI taxon ids to include in the topology
    :param genomes: set of genome taxon ids that must appear as leaves
    :param savename: optional prefix for the saved mastertree newick/pickle files
    :return: tree_string: a newick string; tree: an ete3 tree object
    """
    ncbi = ete3.NCBITaxa()
    tax = set(taxa)
    genomes = set(genomes)
    tax.remove(0)
    print(len(tax))

    tree = ete3.PhyloTree(name='')
    tree.add_child(name='131567')

    topo = ncbi.get_topology(tax, collapse_subspecies=False)
    tax = set([str(taxid) for taxid in tax])
    tree.add_child(topo)
    orphans = list(genomes - set([x.name for x in tree.get_leaves()]))
    print('missing taxa:')
    print(len(orphans))
    Entrez.email = config_utils.email
    orphans_info1 = {}
    orphans_info2 = {}
    for x in orphans:
        search_handle = Entrez.efetch('taxonomy', id=str(x), retmode='xml')
        record = next(Entrez.parse(search_handle))
        print(record)
        orphans_info1[record['ParentTaxId']] = x
        orphans_info2[x] = [lin['TaxId'] for lin in record['LineageEx']]
    for n in tree.traverse():
        if n.name in orphans_info1:
            n.add_sister(name=orphans_info1[n.name])
            print(n)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree = add_orphans(orphans_info2, tree, genomes)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree_string = tree.write(format=1)
    if savename is None:
        with open(config_utils.datadir + 'mastertree.nwk', 'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + 'mastertree.pkl', 'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    else:
        with open(config_utils.datadir + savename + '_master_tree.nwk',
                  'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + savename + '_master_tree.pkl',
                  'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    return tree_string, tree
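
A minimal, standalone sketch (not part of HogProf) of the core ete3 calls the function above relies on: NCBITaxa.get_topology() prunes the NCBI taxonomy to a set of taxids, and write() serialises the result. The taxids below are arbitrary examples, and the first NCBITaxa() call builds a local copy of the NCBI taxonomy database.

import ete3

ncbi = ete3.NCBITaxa()                    # uses (or downloads) ~/.etetoolkit/taxa.sqlite
taxids = [9606, 10090, 7227]              # human, mouse, fruit fly
topo = ncbi.get_topology(taxids, collapse_subspecies=False)
print(topo.get_ascii(attributes=["sci_name", "rank"]))
tree_string = topo.write(format=1)        # newick string with taxids as node names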
Code example #2
def main(treefile, ingroup, outfile):
    tree = ete3.PhyloTree(treefile, format=2)

    tax_map = get_tax_ids(tree)
    taxon_clade = get_clade_dict(tax_map)


    root_tree(tree, ingroup)
    tree.ladderize(direction=1)

    prepare_node_leave_names(tree)

    tree_string = collapse_groups(tree, taxon_clade)
    tree_string = tree_string.replace('INTERNAL_', "[&label=").replace('_SUPPORT',']')
    write_nexus(outfile, tree_string, tree)
Code example #3
def parse_phylo(phy_fn, phy_id):

	# load input
	phy = ete3.PhyloTree("%s" % (phy_fn))
	logging.info("%s num nodes = %i" % (phy_id,len(phy)))
	# assign species names to tree
	phy.set_species_naming_function(lambda node: node.name.split("_")[0] )
	# resolve polytomies in a random fashion
	phy.resolve_polytomy(recursive=True)
	# check if tree is rooted, apply midpoint root if unrooted
	phy_root = phy.get_tree_root()
	phy_outg = phy_root.get_children()
	is_root  = len(phy_outg) == 2
	if is_root:
		logging.info("%s Tree is rooted, pass" % phy_id)
	else: 
		logging.info("%s Tree is unrooted, apply midpoint root" % phy_id)
		phy_outgroup = phy.get_midpoint_outgroup()
		phy.set_outgroup(phy_outgroup)

	# find evolutionary events (duplications and speciations)
	evev = phy.get_descendant_evol_events(sos_thr=0)

	# create empty array for network edges
	evou    = np.empty((len(evev)*1000, 5), dtype="object")
	evou[:] = np.nan
	# loop through in and out seqs, create edge table with orthologous events
	n = 0
	for ev in evev:
		if ev.etype == "S":
			for ii in ev.in_seqs:
				for oi in ev.out_seqs:
					evou[n,0] = ii
					evou[n,1] = oi
					evou[n,2] = ev.branch_supports[0]
					evou[n,3] = ev.etype
					evou[n,4] = ev.sos
					n = n + 1

	evou_d = pd.DataFrame(evou).dropna()
	evou_d.columns = ["in_gene","out_gene","branch_support","ev_type","sos"]

	return evou_d
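
A self-contained toy version (made-up gene names, not project data) of the species-overlap calls used above: the species is taken from the leaf-name prefix and get_descendant_evol_events() labels each split as a speciation ('S') or duplication ('D'):

import ete3

# toy gene tree; the prefix before "_" is assumed to be the species code
phy = ete3.PhyloTree("((Hsa_A:1,Ptr_A:1):1,(Hsa_B:1,Mmu_B:1):1);")
phy.set_species_naming_function(lambda node: node.name.split("_")[0])

for ev in phy.get_descendant_evol_events(sos_thr=0.0):
    print(ev.etype, sorted(ev.in_seqs), sorted(ev.out_seqs), ev.sos)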
Code example #4
 def __init__(self, TreePath, AlignementPath, uniprotTaxonomy):
     """This class takes the path to the Newick Tree, the fasta alignment from which the tree is derived and the path to the parsed uniprot taxonomy."""
     self.TreePath = TreePath
     self.AlignementPath = AlignementPath
     f = open(self.AlignementPath)
     lines = f.readlines()
     out = []
     for line in lines:
         if line[0] == '>':
             out.append(line.split(' ')[0] + '\n')
         else:
             out.append(line)
     f.close()
     f = open(self.AlignementPath, 'w')
     for o in out:
         f.write(o)
     f.close()
     self.tree = ete3.PhyloTree(newick=TreePath, alignment=AlignementPath)
     self.tree.set_species_naming_function(self.parse_sp_name)
     self.uniprot2ncbi = {}
     self.uniprot2species = {}
     self.ncbiID2species = {}
     self.ncbi = ete3.NCBITaxa()
     f = open(uniprotTaxonomy)
     lines = f.readlines()
     for line in lines:
         s = line.strip().split('\t')
         uniprotID = s[0]
         ncbiID = s[1].split(' ')[0]
         specie = s[2].split(',')[-1]
         self.uniprot2ncbi[uniprotID] = ncbiID
         self.uniprot2species[uniprotID] = specie
         self.ncbiID2species[ncbiID] = specie
     self.treeTaxa = []
     leaves = self.tree.get_leaves()
     for leaf in leaves:
         uniprotID = leaf.name.split('|')[0].split('_')[1]
         ncbiID = self.uniprot2ncbi[uniprotID]
         leaf.name = "%s_%s" % (ncbiID, leaf.name.split('|')[0].split('_')[1])
         # leaf.species = self.uniprot2species[uniprotID]
         self.treeTaxa.append(int(ncbiID))
     self.NCBITaxonomy = self.ncbi.get_topology(self.treeTaxa, intermediate_nodes=True)
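
A hedged, minimal illustration (toy data, unrelated to the uniprot files above) of linking an alignment to a PhyloTree the way the constructor call above does; ete3 accepts either a FASTA file path or the raw FASTA text:

import ete3

nw = "((Seq1:1,Seq2:1):1,Seq3:1);"
aln = """
>Seq1
MAEIPDETIQ
>Seq2
MAEIPDATI-
>Seq3
MAEIPDTLI-
"""
t = ete3.PhyloTree(nw, alignment=aln, alg_format="fasta")
print((t & "Seq1").sequence)              # sequences become available on the leaves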
Code example #5
File: calc_expconv.py  Project: ztzou/conv_cal
def read_ancestral_tree(rst_file_name):
    """Parse a PAML 'rst' output file: read the branch-length tree and the
    node-labelled tree, attach the extant and ancestral sequences to an ete3
    tree, and copy the branch lengths onto it."""
    rst_file = open(rst_file_name)
    flag0 = False
    flag1 = False
    flag2 = True
    species_list = []
    for line in rst_file:
        if (flag2 == True) and line.startswith('('):
            length_tree = ete3.Tree(line.strip())
            flag2 = False
        if flag0 == True:
            species_tree = ete3.PhyloTree(line.strip(), format=8)
            re_root = re.search(r'\)\s+([_\-\.\w]+)\s+;', line)
            if re_root:
                species_tree.name = re_root.group(1)
            for node in species_tree.traverse():
                if node.is_leaf():
                    node.name = '_'.join(node.name.split('_')[1:])
                    species_list.append(node.name)
            line_set = set(species_list + [
                'node',
            ])
            flag0 = False
            flag1 = True
        if (flag1 == True) and (len(line) > 1) and (line.split()[0]
                                                    in line_set):
            cols = line.strip().split()
            if cols[0] in species_list:
                (species_tree & cols[0]).sequence = ''.join(cols[1:])
            else:
                (species_tree & cols[1][1:]).sequence = ''.join(cols[2:])
        if line.startswith("tree with node labels for Rod Page's TreeView"):
            flag0 = True
    for node in species_tree.traverse('preorder'):
        leaves = set(node.get_leaf_names())
        for length_node in length_tree.traverse('preorder'):
            if set(length_node.get_leaf_names()) == leaves:
                node.dist = length_node.dist
    return species_tree
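
A short standalone reminder (toy tree, invented names) of the two ete3 idioms the parser above relies on: tree & "name" fetches a node by its name, and an arbitrary attribute such as .sequence can be attached to it directly:

import ete3

t = ete3.PhyloTree("((A:1,B:1)N1:1,C:1);", format=1)    # format 1 keeps internal node names
(t & "A").sequence = "ATGCCGTT"                          # attach a sequence to a named node
print((t & "N1").get_leaf_names(), (t & "A").sequence)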
Code example #6
def main(inputtree,
         outbase,
         div=True,
         features=None,
         stem_or_crown="crown",
         byrank='',
         byage=None,
         bylist=None,
         bysize=None):
    """byrank: when the rank is included in or equal to 'byrank';
       byage:  collapse any node of age <= byage;
       bylist: read list of nodes from file;
       bysize: collapse oldest nodes with size < bysize."""
    group_feature_rate = def_group_feature_rate(stem_or_crown)

    tree = ete3.PhyloTree(inputtree, format=1, quoted_node_names=False)

    outsuffix = '-stem' if stem_or_crown == 'stem' else ''

    if byrank:
        outsuffix += '-%s' % byrank
    if byage:
        outsuffix += '-age%g' % byage
    if bylist:
        outsuffix += '-list' + op.splitext(op.basename(bylist))[0]
    if bysize:
        outsuffix += '-size%d' % bysize

    outnames = {
        'tsv': (outbase + '%s.tsv' % outsuffix),
        'subtrees': (outbase + '%s.subtrees.nwk' % outsuffix),
        'tree': (outbase + '%s.nwk' % outsuffix)
    }

    for out in outnames.values():
        if op.exists(out):
            logger.error("%r already exists, quitting", out)
            return 1

    columns = [outsuffix.lstrip('-'), 'size', 'branches', 'age',
               'tot_len']  #'crown_age', 'stem_age']
    if div: columns.extend(('div_rate', 'gamma', 'ncbi_sp_sampling'))
    if features: columns.extend(features)

    if byrank or div:
        logger.info("Loading taxonomy")
        ncbi = ete3.NCBITaxa()

        name2taxid = ncbi.get_name_translator(
                            [node.name.replace('_', ' ') if node.is_leaf() \
                                else node.name for node in tree.traverse()])
        # Won't return anything for names not found

        #if rank:
        #taxid2rank = ncbi.get_rank(chain(*name2taxid.values()))
        taxid2name = ncbi.get_taxid_translator(chain(*name2taxid.values()))
    else:
        name2taxid, taxid2name = None, None

    is_leaf_fn = make_is_leaf_fn(byrank, byage, bylist, bysize, name2taxid,
                                 taxid2name)

    with open(outnames['tsv'], 'w') as outtsv, \
         open(outnames['subtrees'], 'w') as outsub:

        outtsv.write('\t'.join(columns) + '\n')

        logger.info("Iterating over found clades")
        for node in tree.iter_leaves(is_leaf_fn):
            outsub.write(
                node.write(features, format=1, format_root_node=True) + '\n')

            # Collapse
            size = len(node)
            branches = len(node.get_descendants())
            _, age = node.get_farthest_leaf()
            tot_len = sum(d.dist for d in node.iter_descendants())
            if stem_or_crown == 'stem':
                age += node.dist
                tot_len += node.dist
            values = [node.name, size, branches, age, tot_len]
            if div:
                div_rate = float(size) / age if age else np.NaN
                gamma_stat = div_gamma(node)

                try:
                    nodetaxids = name2taxid[node.name.replace('_', ' ')]
                    if len(nodetaxids) > 1:
                        nodetaxids = [
                            match_duplicate_taxid(nodetaxids, node, taxid2name,
                                                  ncbi)
                        ]

                except KeyError:
                    # This clade isn't in the taxonomy (example: Atlantogenata)
                    # take descendant nodes and join them
                    valid_tax_children = get_valid_tax_children(
                        node, name2taxid)
                    vtc_names = [
                        vtc.name.replace(' ', '_')
                        for vtc in valid_tax_children
                    ]

                    logger.warning(
                        '%r not found in NCBI Taxonomy. Merging '
                        'the node children %s to get the '
                        'descendant counts.', node.name, vtc_names)

                    nodetaxids = []
                    for vtc_n, vtc in zip(vtc_names, valid_tax_children):
                        vtc_taxids = name2taxid[vtc_n]
                        if len(vtc_taxids) == 1:
                            nodetaxids.append(vtc_taxids[0])
                        else:
                            nodetaxids.append(
                                match_duplicate_taxid(vtc_taxids, vtc,
                                                      taxid2name, ncbi))

                ncbi_sp = list(chain(*(ncbi.get_descendant_taxa(nt,
                                                rank_limit='species') \
                                       for nt in nodetaxids)))
                #collapse_subspecies=True))
                sp_sampling = float(size) / len(ncbi_sp)
                values.extend((div_rate, gamma_stat, sp_sampling))

            if features:
                ft_rates = group_feature_rate(node, features)
                values += ft_rates.tolist()

            outtsv.write('\t'.join(str(v) for v in values) + '\n')

    tree.write(outfile=outnames['tree'],
               format=1,
               is_leaf_fn=is_leaf_fn,
               format_root_node=True)
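
A small standalone sketch (toy tree) of the collapsing idiom used above: the is_leaf_fn hook makes traversal treat any node matching a predicate as a terminal node, so whole clades can be iterated over, and written out, as if they were leaves:

import ete3

t = ete3.Tree("((((a:1,b:1):1,c:1):1,d:1):1,e:1);")

# treat any subtree with at most two real leaves as a single collapsed "leaf"
is_leaf_fn = lambda n: len(n) <= 2

for collapsed in t.iter_leaves(is_leaf_fn=is_leaf_fn):
    print(collapsed.get_leaf_names())
# t.write(is_leaf_fn=is_leaf_fn, ...) writes the newick with those clades collapsed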
Code example #7
import re
from math import modf
import seaborn as sns
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import ete3


def get_nodes(tree, clade):
    subtree = tree & clade
    clade_nodes = [n.name for n in subtree.traverse()]
    return clade_nodes


tree = ete3.PhyloTree('species_trees/MethHikHalo_f30_19000_clean.tree')
for n in tree.traverse():
    if not n.is_leaf():
        n.name = str(int(n.support))

clades = {
    'Halobacteria': '107',
    'Hikarchaea': '93',
    'Methanomicrobia': '98',
    'Archaeoglobales': '101',
    'Methanocellales': '91',
    'Methanosarcinales': '104',
    'Syntrophoarchaea': '62'
}
clades = {v: get_nodes(tree, k) for v, k in clades.items()}
clades = {s: c for c, specs in clades.items() for s in specs}
Code example #8
File: possvm.py  Project: xgrau/possvm-orthology
else:
    outgroup = []

# select clustering method
valid_methods = ["mcl", "louvain", "lpa", "mclw"]
if method in valid_methods:
    clusters_function_string = "clusters_%s" % method
else:
    print("Error, invalid clustering method \'%s\'!" % method)
    print("Valid methods are: %s" % valid_methods)
    sys.exit()

# use species tree reconciliation?
if spstree is not None:
    do_sps_reconciliation = True
    phs = ete3.PhyloTree(spstree)
else:
    do_sps_reconciliation = False

#########################
####### FUNCTIONS #######
#########################

# logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)-5.5s]\t%(message)s",
                    handlers=[logging.StreamHandler()])


def write_tree(phy,
               out,
Code example #9
# # define species phylogeny
# sp = "((((Hsa,Ptr),(Mmu,Mms)),Cfa),Dme);"
# #sp = "((((Hsa,Ptr),(Mmu,Mms)),(Cfa,(Lup,(Can,Cam)))),(Dme,Ano));"
# phs = ete3.PhyloTree(sp)
# phs.set_species_naming_function(lambda node: node.name )
# print(phs)


# gene tree
phy_fn = "/home/xavi/dades/Anotacions/orthofinder_Ano14sps_noclu_9oct19/output/Trees/OG0000058.iqt.treefile"
#phy    = ete3.PhyloTree("%s" % (phy_fn))
#phy.set_species_naming_function(lambda node: node.name.split("_")[0] )

# species tree
phs_fn = "/home/xavi/Documents/auto-orthology/orthofinder_Ano14sp/tree.newick"
phs    = ete3.PhyloTree("%s" % (phs_fn))
phs.set_species_naming_function(lambda node: node.name.split("_")[0] )

# define a dictionary of species-to-species relative ages
# for each species in the phylogeny
def species_age_dict(phs):
		
	# init dict of dicts
	sps_age_dict = dict()
	sps_list     = phs.get_leaf_names()
	age_root     = 0
	for n,i in enumerate(sps_list):

		# init dict for species i
		sps_age_dict[i] = dict()
		for m,j in enumerate(sps_list):
Code example #10
import pandas as pd
import ete3
import sys

treefile = sys.argv[1] 

focus_taxon = sys.argv[2]

column = sys.argv[3]

df = pd.read_csv("taxonomy_orthofinder_selection.csv", sep='\t')
tree = ete3.PhyloTree(treefile, format=2)

df.loc[df[column].apply(lambda x: focus_taxon in x), 'group'] = focus_taxon

for l in tree.iter_leaves():
    sp = l.name.split('..')[0]
    group = df[df['Name'] == sp].iloc[0]['group']
    group = group.replace(' ', '_').replace('"', '')
    l.name = "{}..{}".format(group, l.name)
    
tree.write(outfile=treefile.replace('.treefile', '.{}.treefile'.format(focus_taxon)), format=2)
Code example #11
align = AlignIO.read(child.stdout, "clustal")
print(align)

# convert into PHYLIP format
phylip = "seqs.phy"
with open(phylip, 'w') as out:
    AlignIO.write(align, out, 'phylip')

###
# reconstruct phylogenetic tree
from Bio import Phylo
from Bio.Phylo.Applications import PhymlCommandline, FastTreeCommandline
#cmd = PhymlCommandline(input=phylip, datatype='aa')
cmd = FastTreeCommandline(input=phylip, out=phylip + ".nw")
out_log, err_log = cmd()

tree = Phylo.read(phylip + ".nw", 'newick')  # '_phyml_tree.txt'
Phylo.draw_ascii(tree)

# ete
import ete3

t = ete3.PhyloTree(phylip + ".nw")

# root by mid-point
t.set_outgroup(t.get_midpoint_outgroup())

print(t)
t.show()
t.render(fn + ".svg")
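
The midpoint-rooting step above in isolation (toy tree): get_midpoint_outgroup() returns the node that splits the longest leaf-to-leaf path in half, and set_outgroup() re-roots the tree there:

import ete3

t = ete3.PhyloTree("((A:1,B:2):1,(C:1,(D:1,E:4):2):1);")
t.set_outgroup(t.get_midpoint_outgroup())
print(t.get_ascii())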
Code example #12
            elif node_type == 2:
                get_node_order(node, None)

            elif node_type == 3:

                get_node_order(node, mypartitions[node.name])

        # Now we get the node orders of the tree

        print(",".join(root.order))


def test_tree():
    all_partitions = write_partitions(mytree)
    generate_node_orders(all_partitions)


if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("usage: python generate_orders.py mytree")
        exit(0)

    scr, mytree_file = sys.argv

    with open(mytree_file) as f:
        mytree = ete3.PhyloTree(next(f).strip(), format=1)

    all_partitions = write_partitions(mytree)
    generate_node_orders(all_partitions)
Code example #13
def combine_features(data, dsizes, tree, taxid2sp, prot2taxid, taxa_to_merge):
    """ create a ete3 tree with domain features from jackhmmer
    """
    # motif example from http://etetoolkit.org/docs/latest/tutorial/tutorial_drawing.html#phylogenetic-trees-and-sequence-domains
    #simple_motifs = [
    ## seq.start, seq.end, shape, width, height, fgcolor, bgcolor
    #[10, 60, "[]", None, 10, "black", "rgradient:blue", "arial|8|white|long text clipped long text clipped"],
    #[120, 150, "o", None, 10, "blue", "pink", None],
    #[200, 300, "()", None, 10, "blue", "red", "arial|8|white|hello"],
    #]

    # add domain match
    # and count number of sequences by taxid
    # and get size
    #dsize = dict()
    motifs = dict()
    dcnt_sp = dict()
    for tremolo_dom in data:
        tremolo_dom_start, tremolo_dom_stop = data[tremolo_dom]["QPos"]
        del data[tremolo_dom]["QPos"]
        for prot in data[tremolo_dom]:
            taxid = prot2taxid[prot]
            sp = taxid2sp[taxid]
            sp = sp.replace("(", "").replace(")", "").replace(",", "").replace(";", "")
            taxid2sp[taxid] = sp

            full_domains = data[tremolo_dom][prot].get("Tpos", list())[:]

            target_domains = filter_domain_arch(full_domains)

            dom_arch = list()
            ordered_domains = sorted(target_domains)
            for start, stop, dom in ordered_domains:
                dom_arch.append(dom)
            dom_arch = ";".join(dom_arch)

            if sp not in dcnt_sp:
                dcnt_sp[sp] = dict()
            if dom_arch not in dcnt_sp[sp]:
                dcnt_sp[sp][dom_arch] = list()
            dcnt_sp[sp][dom_arch].append(prot)

            # add domains
            for start, stop, dom in ordered_domains:
                color = get_domain_color(dom)
                motifs.setdefault(sp, dict())\
                        .setdefault(prot, list())\
                        .append([start+1, stop, "[]", None, 10, "black", color, "arial|1|black|{}".format(dom)])
                #motifs[sp][prot].sort()
            for hitnb in data[tremolo_dom][prot]["Hit"]:
                start, stop = data[tremolo_dom][prot]["Hit"][hitnb]["Tali"]
                motifs.setdefault(sp, dict())\
                        .setdefault(prot, list())\
                        .append([start, stop, "o", None, 10, "black", "red", "arial|1|black|HCAdom {}-{}".format(tremolo_dom, hitnb)])

    #print(motifs)

    # merge taxonomic groups
    for taxid_taxa in taxa_to_merge:
        taxid, taxa = taxid_taxa.split(",")
        lnode = tree.search_nodes(name=taxid)
        if len(lnode) > 0:
            taxnode = lnode[0]
            children = [child.name for child in taxnode.children]
            leaves = list()
            for child in taxnode.traverse():
                if child.is_leaf():
                    leaves.append(child)
                    #print(taxid_taxa, child.name)
            taxid2sp[taxid] = taxa
            dcnt_sp[taxa] = dict()
            motifs[taxa] = dict()
            for leafnode in leaves:
                nodeid = leafnode.name
                nodesp = taxid2sp[nodeid]
                #print(taxid_taxa, nodeid, nodesp)
                # merge protein list
                for dom_arch in dcnt_sp[nodesp]:
                    for prot in dcnt_sp[nodesp][dom_arch]:
                        dcnt_sp[taxa].setdefault(dom_arch, list()).append(prot)

                # merge motif
                for prot in motifs[nodesp]:
                    for m in motifs[nodesp][prot]:
                        motifs[taxa].setdefault(prot, list()).append(m)
                # delete obsoletes species
                del dcnt_sp[nodesp]
                del motifs[nodesp]
                del taxid2sp[nodeid]
            #for dom_arch in dcnt_sp[taxa]:
            #for prot in dcnt_sp[taxa][dom_arch]:
            #print(prot, dom_arch)
            for child in children:
                node = tree.search_nodes(name=child)[0]
                node.detach()
            #print(taxnode.get_ascii(show_internal=True))
        else:
            print("Unable to find taxid {} in tree".format(taxid))
            print(lnode)

    for taxid in taxid2sp:
        #print(taxid)
        node = tree.search_nodes(name=taxid)
        if node != []:
            node[0].name = taxid2sp[taxid]
        else:
            print("Unable to find node for taxid {}".format(taxid))

    # expand taxonomic tree by the number of sequences in each taxa
    for node in tree:
        if node.is_leaf():
            if node.name in dcnt_sp:
                node_sp = node.name
                proteins = list()
                features = dict()
                for dom_arch in dcnt_sp[node_sp]:
                    sizes_and_proteins = list()
                    for prot in dcnt_sp[node_sp][dom_arch]:
                        new_name = node_sp + " | " + prot.split("|")[1]
                        if len(dcnt_sp[node_sp][dom_arch]) > 1:
                            new_name += " [+{}]".format(
                                len(dcnt_sp[node_sp][dom_arch]) - 1)
                        sizes_and_proteins.append(
                            (dsizes[prot], new_name, dom_arch, prot))
                    sizes_and_proteins.sort(reverse=True)
                    sizes, names, dom_archs, prots = zip(*sizes_and_proteins)
                    proteins.append(names[0].replace(":", " "))
                    features[names[0].replace(":",
                                              " ")] = (prots[0], dom_archs[0])
                subtree = ete3.PhyloTree("({});".format(", ".join(proteins)))
                node.add_child(subtree)
                for new_node in subtree:
                    prot, dom_arch = features[new_node.name]
                    seq = "G" * dsizes[prot]
                    m = motifs[node_sp][prot]
                    seqFace = SeqMotifFace(seq, seq_format="line", motifs=m)
                    new_node.add_face(seqFace, 0, "aligned")
    #for dom in domain_color:
    #print(dom, domain_color[dom])
    return tree
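
A minimal, standalone SeqMotifFace sketch (toy protein, arbitrary coordinates and colours) using the same motif format as above; ete3's drawing faces need the optional PyQt/treeview dependencies, and render() needs a display or an off-screen Qt setup:

import ete3
from ete3 import SeqMotifFace

t = ete3.PhyloTree("(protA:1,protB:1);")
# each motif: [start, end, shape, width, height, fgcolor, bgcolor, "font|size|color|label"]
motifs = [[10, 60, "[]", None, 10, "black", "blue", "arial|8|white|domX"],
          [80, 95, "o", None, 10, "black", "red", None]]
face = SeqMotifFace("G" * 120, seq_format="line", motifs=motifs)
(t & "protA").add_face(face, 0, "aligned")
# t.render("domains.svg")                 # uncomment to draw; requires PyQt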
Code example #14
def parse_tree_data(args, c):
    # create a phylo tree object
    newick_tree = c['cft.reconstruction:asr_tree']['tripl.file:contents']
    newick_tree_path = str(c['cft.reconstruction:asr_tree']['tripl.file:path'])
    tree = ete3.PhyloTree(newick_tree, format=1)
    # parse out sequences and other sequence metadata
    aa_seqs_dict = create_seqs_dict(
        c['cft.reconstruction:cluster_aa']['bio.seq:set'])
    nt_seqs_dict = create_seqs_dict(
        c['cft.reconstruction:asr_seqs']['bio.seq:set'])
    seqmeta_dict = create_seqmeta_dict(
        c['cft.reconstruction:seqmeta']['tripl.csv:data'])

    # Note that this function is impure; it's mutable over the internal nodes
    def process_node(node):
        node.label = node.id = node.name
        node.nt_seq = nt_seqs_dict[node.name]
        node.aa_seq = aa_seqs_dict[node.name]
        for attr, parser in [
                ['cft.seq:multiplicity', int],
                ['cft.seq:timepoint_multiplicities', listofint],
                ['cft.seq:cluster_multiplicity', int],
                ['cft.seq:cluster_timepoint_multiplicities', listofint],
                ['cft.seq:timepoint', None],
                ['cft.seq:timepoints', listof],
                ['cft.seq:cluster_timepoints', listof],
                ['cft.seq:affinity', float],
                ['cft.tree.node:lbi', float],
                ['cft.tree.node:lbr', float]]:
            seqmeta = seqmeta_dict.get(node.name, {})
            node.__dict__[attr.split(':')[1]] = (parser or (lambda x: x))(
                seqmeta.get(attr)) if seqmeta.get(attr) else None
        node.type = "node"
        if node.is_leaf():
            node.type = "leaf"
        if node.up:
            # get parent info, distance for non root
            node.parent = node.up.name
            node.length = node.get_distance(node.up)
            try:
                node.distance = node.get_distance(args.inferred_naive_name)
            except Exception as e:
                if args.verbose:
                    warnings.warn("Unable to compute distance to naive '" +
                                  str(args.inferred_naive_name) +
                                  "' in file " + newick_tree_path)
                    print("newick tree:", newick_tree)
                raise e
        else:
            # node is root
            node.type = "root"
            node.parent = None
            node.length = 0.0
            node.distance = 0.0
        #import pdb; pdb.set_trace()
        return {
            'id': node.id,
            'label': node.label,
            'type': node.type,
            'parent': node.parent,
            'length': node.length,
            'distance': node.distance,
            'nt_seq': node.nt_seq,
            'aa_seq': node.aa_seq,
            'affinity': node.affinity,
            'lbi': node.lbi,
            'lbr': node.lbr,
            'timepoint': node.timepoint,
            'multiplicity': node.multiplicity,
            'cluster_multiplicity': node.cluster_multiplicity,
            # change this to real list of key value objects for timepoint multiplicities for #56
            'timepoint_multiplicities': [
                {'timepoint': t, 'multiplicity': m}
                for t, m in zip(node.timepoints or [],
                                node.timepoint_multiplicities or [])
            ],
            'cluster_timepoint_multiplicities': [
                {'timepoint': t, 'multiplicity': m}
                for t, m in zip(node.cluster_timepoints or [],
                                node.cluster_timepoint_multiplicities or [])
            ]
        }

    # map through and process the nodes
    return map(process_node, tree.traverse('postorder'))
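
The distance calls inside process_node above, shown on their own (toy tree): get_distance() accepts either a node object or a node name, so the branch length to the parent and the path length to a named 'naive' node are obtained the same way:

import ete3

t = ete3.PhyloTree("((naive:1,B:2)N1:0.5,C:3);", format=1)
b = t & "B"
print(b.get_distance(b.up))               # branch length to the parent (2.0)
print(b.get_distance("naive"))            # path length to the node named 'naive' (3.0)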
Code example #15
File: possvm.py  Project: xgrau/possvm-orthology
def parse_phylo(phy_fn,
                phy_id,
                do_root,
                do_allpairs,
                clusters_function_string,
                outgroup=outgroup,
                do_sps_reconciliation=do_sps_reconciliation):

    # load input
    phy = ete3.PhyloTree("%s" % (phy_fn))
    logging.info("%s num nodes = %i" % (phy_id, len(phy)))
    logging.info("%s clustering function is %s" %
                 (phy_id, clusters_function_string))
    clusters_function = eval(clusters_function_string)
    if do_sps_reconciliation:
        logging.info("%s do species tree reconciliation" % (phy_id))

    # assign species names to tree
    phy.set_species_naming_function(lambda node: node.name.split(split_ch)[0])

    # resolve polytomies (randomly)
    phy.resolve_polytomy(recursive=True)

    # try to find root if unrooted
    if do_root:

        # shall we do it with iterative midpoint rooting?
        if itermidroot is not None:

            niter = itermidroot
            num_evs_per_iter = np.zeros(niter)
            out_nod_per_iter = np.empty(niter, dtype=object)

            # then, iterate to try to find better candidates
            phy_it = phy.copy(method="newick")
            phy_outgroup_it = phy_it.get_midpoint_outgroup()
            phy_it.set_outgroup(phy_outgroup_it)

            # select very short length to shorten every rooting candidate branch (based on in-tree branch distribution)
            dist_lengths = [
                node.dist for node in phy.traverse("postorder")
                if node.dist > 0
            ]
            shrunk_length = np.quantile(dist_lengths, 0.1)

            for roi in range(niter):

                # parse events and re-do clustering
                evs_it, _, _, phy_lis_it = parse_events(
                    phy=phy_it,
                    outgroup=outgroup,
                    do_allpairs=False,
                    min_support_node=min_support_node)
                clu_it = clusters_function(evs=evs_it,
                                           node_list=phy_lis_it,
                                           verbose=False)

                # store number of orthogroups in this particular iteration
                num_evs_per_iter[roi] = len(np.unique(
                    clu_it["cluster"].values))
                out_nod_per_iter[roi] = phy_outgroup_it

                print("%s Iterative midpoint root | %i/%i | n OGs = %i" %
                      (phy_id, roi + 1, niter, num_evs_per_iter[roi]))
                # in subsequent iterations, shrink the previous outgroup branch, and try to find second-best candidate
                phy_outgroup_it.dist = shrunk_length
                phy_outgroup_it = phy_it.get_midpoint_outgroup()
                phy_it.set_outgroup(phy_outgroup_it)

            # select outgroup that minimises number of OGs (more agglomerative)
            phy_outgroup_ix = np.argmin(num_evs_per_iter)

            # outgroup node in iterated tree
            phy_outgroup_from_it = out_nod_per_iter[phy_outgroup_ix]
            phy_outgroup_descendants = [
                t for t in phy_outgroup_from_it.get_leaf_names()
            ]

            # outgroup node in original tree
            phy_outgroup = phy.get_common_ancestor(phy_outgroup_descendants)

            if len(phy_outgroup_descendants) != len(
                    phy_outgroup.get_leaf_names()):
                print(
                    "%s Iterative midpoint root found an impossible root, defaulting to midpoint"
                    % (phy_id))
                phy_outgroup_ix = 0
                phy_outgroup = phy.get_midpoint_outgroup()

            # set outgroup
            # print(phy_outgroup_ix, phy_outgroup)
            logging.info("%s Best root at iteration  | %i/%i | n OGs = %i" %
                         (phy_id, phy_outgroup_ix + 1, niter,
                          num_evs_per_iter[phy_outgroup_ix]))

        # ...or shall we do it with simple midpoint rooting?
        else:

            # set outgroup using normal midpoint rooting
            logging.info("%s Midpoint root" % phy_id)
            phy_outgroup = phy.get_midpoint_outgroup()

        # set root
        phy.set_outgroup(phy_outgroup)

    # ignore rooting
    else:

        logging.info("%s Skip rooting (assume tree is already rooted)" %
                     phy_id)

    # ladderise phylogeny
    phy.ladderize()

    # parse events
    if do_sps_reconciliation:
        evs, eva, phy, phy_lis = parse_events_sps_reconciliation(
            phy=phy, phs=phs, outgroup=outgroup, do_allpairs=do_allpairs)
    else:
        evs, eva, phy, phy_lis = parse_events(
            phy=phy,
            outgroup=outgroup,
            do_allpairs=do_allpairs,
            min_support_node=min_support_node)
    clu = clusters_function(evs=evs, node_list=phy_lis)

    # output from event parsing
    return evs, eva, phy, phy_lis, clu
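
The outgroup-mapping step above reduces to these two ete3 calls, shown here on a toy tree with invented names: get_common_ancestor() accepts a list of leaf names and set_outgroup() re-roots the tree on the returned node:

import ete3

phy = ete3.PhyloTree("((A:1,B:1):1,(C:1,(D:1,E:1):1):1);")
outgroup_node = phy.get_common_ancestor(["D", "E"])
phy.set_outgroup(outgroup_node)
print(phy.get_ascii())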
Code example #16
import ete3


def get_ancestor(nodes):
    ancestor = nodes[0]
    for n in nodes:
        ancestor = ancestor.get_common_ancestor(n)
    return ancestor


tree = ete3.PhyloTree('astral/astral_tree.main_tree', quoted_node_names=False)
treepp = ete3.PhyloTree('astral/astral_tree_pp.new', quoted_node_names=False)

counter = 0
c2supp = {}
for n in tree.traverse():
    if not n.is_leaf():
        kids = [c.name for c in n.get_leaves()]
        pp_n = get_ancestor([treepp.get_leaves_by_name(k)[0] for k in kids])
        c2supp[counter] = "{}/{}".format(round(n.support, 1), pp_n.support)
        n.support = counter
        counter += 1

tree_str = tree.write()
for c, s in c2supp.items():
    tree_str = tree_str.replace(")" + str(c) + ":", ")'" + s + "':")

with open('astral/astral_tree_combined.new', 'w') as out:
    print(tree_str, file=out)
Code example #17
File: process_cft_data.py  Project: matsengrp/olmsted
def parse_tree_data(args, c):
    # create a phylo tree object
    newick = c["newick"]
    tree = ete3.PhyloTree(newick, format=1)
    # parse out sequences and other sequence metadata
    aa_seqs_dict = create_seqs_dict(
        c["cft.reconstruction:cluster_aa"]["bio.seq:set"])
    dna_seqs_dict = create_seqs_dict(
        c["cft.reconstruction:asr_seqs"]["bio.seq:set"])
    seqmeta_dict = create_seqmeta_dict(
        c["cft.reconstruction:seqmeta"]["tripl.csv:data"])

    # Note that this function is impure; it's mutable over the internal nodes
    def process_node(node):
        node.dna_seq = dna_seqs_dict[node.name]
        node.aa_seq = aa_seqs_dict[node.name]
        for attr, parser in [
            ["cft.seq:multiplicity", int],
            ["cft.seq:timepoint_multiplicities", listofint],
            ["cft.seq:cluster_multiplicity", int],
            ["cft.seq:cluster_timepoint_multiplicities", listofint],
            ["cft.seq:timepoint", None],
            ["cft.seq:timepoints", listof],
            ["cft.seq:cluster_timepoints", listof],
            ["cft.seq:affinity", float],
            ["cft.tree.node:lbi", float],
            ["cft.tree.node:lbr", float],
        ]:
            seqmeta = seqmeta_dict.get(node.name, {})
            try:
                value = ((parser or (lambda x: x))(seqmeta.get(attr))
                         if seqmeta.get(attr) else None)
            except ValueError as e:
                value = None
            node.__dict__[attr.split(":")[1]] = value
        node.type = "node"
        if node.is_leaf():
            node.type = "leaf"
        if node.up:
            # get parent info, distance for non root
            node.parent = node.up.name
            node.length = node.get_distance(node.up)
            try:
                node.distance = node.get_distance(args.inferred_naive_name)
            except Exception as e:
                if args.verbose:
                    warnings.warn(
                        "Unable to compute distance to naive '{}' in file {}".
                        format(
                            str(args.inferred_naive_name),
                            str(c["cft.reconstruction:asr_tree"]
                                ["tripl.file:path"]),
                        ))
                    print("newick:", newick)
                raise e
        else:
            # node is root
            node.type = "root"
            node.parent = None
            node.length = 0.0
            node.distance = 0.0
        # import pdb; pdb.set_trace()
        return {
            "id": node.name,
            "type": node.type,
            "parent": node.parent,
            "length": node.length,
            "distance": node.distance,
            "dna_seq": node.dna_seq,
            "aa_seq": node.aa_seq,
            "affinity": node.affinity,
            "lbi": node.lbi,
            "lbr": node.lbr,
            "timepoint_id": node.timepoint,
            "multiplicity": node.multiplicity,
            "cluster_multiplicity": node.cluster_multiplicity,
            # change this to real list of key value objects for timepoint multiplicities for #56
            "timepoint_multiplicities": [
                {"timepoint_id": t, "multiplicity": m}
                for t, m in zip(node.timepoints or [],
                                node.timepoint_multiplicities or [])
            ],
            "cluster_timepoint_multiplicities": [
                {"timepoint_id": t, "multiplicity": m}
                for t, m in zip(node.cluster_timepoints or [],
                                node.cluster_timepoint_multiplicities or [])
            ],
        }

    # map through and process the nodes
    return {n.name: process_node(n) for n in tree.traverse("postorder")}
Code example #18
    sis_up = sis.up
    sis.detach()

    new_node = ete3.PhyloNode()
    sis_up.add_child(new_node)
    new_node.add_child(sis)
    new_node.add_child(anc)


topologies = [(['Picozoa'], ['Rhodelphis', 'Rhodophyta']),
              (['Picozoa'], ['Rhodophyta']), (['Picozoa'], ['Rhodelphis']),
              (['Picozoa'], ['Viridiplantae', 'Glaucophyta']),
              (['Picozoa'], ['Glaucophyta']), (['Picozoa'], ['Viridiplantae']),
              (['Picozoa'], ['Archaeplastida']), (['Picozoa'], ['Telonemia']),
              (['Picozoa'],
               ['Telonemia', 'Rhizaria', 'Stramenopila', 'Alveolata']),
              (['Picozoa'], ['Cryptista']),
              (['Picozoa', 'Cryptista'], ['Rhodophyta', 'Rhodelphis']),
              (['Picozoa', 'Cryptista'], ['Rhodophyta']),
              (['Picozoa', 'Cryptista'], ['Viridiplantae', 'Glaucophyta']),
              (['Picozoa', 'Cryptista'], ['Glaucophyta']),
              (['Picozoa', 'Cryptista'], ['Viridiplantae'])]

tree = ete3.PhyloTree("orig_topology.new", format=0)

with open('all_topologies2test.new', 'w') as out:
    for c1, c2 in topologies:
        # print("({}),({})".format(','.join([c for c in c1]), ','.join([c for c in c2])))
        make_sisters(tree, c1, c2)
        print(tree.write(format=9), file=out)
Code example #19
import glob
import os

import ete3

def get_mono_clades(tree, taxon):
    seeds = set([l for l in tree.get_leaves() if l.clade == taxon])
    nodes = set()
    for s in seeds:
        n = s
        while all([l.clade == taxon for l in n.up.get_leaves()]):
            n = n.up
        nodes.add(n)
    return nodes
    
def get_sister_id(node):
    if node.up.get_children()[0] == node:
        return set([l.clade for l in node.up.get_children()[1]])
    elif node.up.get_children()[1] == node:
        return set([l.clade for l in node.up.get_children()[0]])


with open('sisters_picozoa_sgt.csv', 'w') as out:
    for f in glob.glob("trees_for_fabien_renamed/*.new"):
        clst = os.path.basename(f).replace('.new', '')
        tree = ete3.PhyloTree(f, format=2)
        for l in tree.iter_leaves():
            l.add_feature(pr_name='clade', pr_value=l.name.replace("'", "").split('_')[0])
        clades = get_mono_clades(tree, 'Picozoa')
        for i, n in enumerate(clades, 1):
            sister = get_sister_id(n)
            if sister:
                print("{}\tclade {}\t{}".format(clst, i, ";".join(list(sister))), file=out)
Code example #20
    if all([l in prr for l in anc.get_leaves()]):
        return anc.support
    else:
        return 0.0


steps = ["step{}".format(i) for i in range(11)]
clades = [
    'Picozoa+Rhodelphis+Rhodophyta', 'Rhodelphis+Rhodophyta', 'Archaeplastida',
    "Archaeplastida+Cryptista", 'Amorphea', 'TSAR', 'SAR'
]
df = pd.DataFrame(index=steps, columns=clades)

for f in glob.glob("*.treefile"):
    step = f.replace('.treefile', '')
    tree = ete3.PhyloTree(f)
    df.loc[step, 'Picozoa+Rhodelphis+Rhodophyta'] = check_monophyly_support(
        tree, ['Picozoa', 'Rhodelphis', 'Rhodophyta'])
    df.loc[step, 'Rhodelphis+Rhodophyta'] = check_monophyly_support(
        tree, ['Rhodelphis', 'Rhodophyta'])
    df.loc[step, 'Archaeplastida'] = check_monophyly_support(
        tree, ['Picozoa', 'Archaeplastida'])
    df.loc[step, 'Archaeplastida+Cryptista'] = check_monophyly_support(
        tree, ['Picozoa', 'Archaeplastida', 'Cryptista'])
    df.loc[step, 'Amorphea'] = check_monophyly_support(
        tree, ['Opisthokonta', 'Amoebozoa', 'Apusomonadida', 'Breviatea'])
    df.loc[step, 'TSAR'] = check_monophyly_support(
        tree, ['Telonemia', 'Stramenopila', 'Rhizaria', 'Alveolata'])
    df.loc[step, 'SAR'] = check_monophyly_support(
        tree, ['Stramenopila', 'Rhizaria', 'Alveolata'])
Code example #21
def load_spTree(fileName):
    spTree = ete3.PhyloTree(fileName, sp_naming_function=whole_name)
    for leaf in spTree.iter_leaves():
        if len(leaf.name) != 5:
            leaf.delete()
    return spTree
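
load_spTree() depends on a whole_name helper that is not shown in the snippet; the sketch below is a guess at its intent (use the entire leaf label as the species name) together with a toy call, so treat it as an assumption rather than the project's actual code:

import ete3

def whole_name(name):
    # assumed helper: treat the whole leaf label as the species name
    return name

spTree = ete3.PhyloTree("((HSAPI:1,MMUSC:1):1,(DMELA:1,Contig_12:1):1);",
                        sp_naming_function=whole_name)
print([(l.name, l.species) for l in spTree.iter_leaves()])
# a leaf whose name is not exactly 5 characters long ('Contig_12' here) is the
# kind of entry that the leaf.delete() filter in load_spTree removes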
Code example #22
File: tree.py  Project: jolespin/soothsayer
def create_tree(
        # Base
        newick=None,
        name=None,
        format=0,
        dist=1.0,
        support=1.0,
        quoted_node_names=False,
        # ClusterTree
        text_array=None,
        fdist=None,
        # PhyloTree
        alignment=None,
        alg_format='fasta',
        sp_naming_function=None,
        # PhyloxmlTree
        phyloxml_clade=None,
        phyloxml_phylogeny=None,
        # Constructor
        node_prefix="y",
        into=ete3.Tree,
        prune=None,
        force_bifuraction=True,
        # Keywords
        tree_kws=dict(),
        bifurcation_kws=dict(recursive=True),
):
    """
    Next: Convert to NetworkX
    """
    # Should the tree be converted to skbio
    convert_to_skbio = False
    if into in [skbio.TreeNode]:
        into = ete3.Tree
        convert_to_skbio = True

    # ete3 construction
    if into == ete3.Tree:
        tree = ete3.Tree(newick=newick,
                         format=format,
                         quoted_node_names=quoted_node_names,
                         **tree_kws)
    if into == ete3.ClusterTree:
        if isinstance(text_array, pd.DataFrame):
            text_array = dataframe_to_matrixstring(text_array)
        tree = ete3.ClusterTree(newick=newick,
                                text_array=text_array,
                                fdist=fdist,
                                **tree_kws)
    if into == ete3.PhyloTree:
        tree = ete3.PhyloTree(newick=newick,
                              alignment=alignment,
                              alg_format=alg_format,
                              sp_naming_function=sp_naming_function,
                              format=format,
                              **tree_kws)
    if into == ete3.PhyloxmlTree:
        tree = ete3.PhyloxmlTree(phyloxml_clade=phyloxml_clade,
                                 phyloxml_phylogeny=phyloxml_phylogeny,
                                 **tree_kws)

    # Set base attributes
    for k, v in dict(name=name, dist=dist, support=support).items():
        setattr(tree, k, v)

    # Prune
    if prune is not None:
        tree.prune(prune)

    # Bifurcation
    if force_bifuraction:
        n_internal_nodes = len(
            [*filter(lambda node: node.is_leaf() == False, tree.traverse())])
        n_leaves = len([*filter(lambda node: node.is_leaf(), tree.traverse())])
        if n_internal_nodes < (n_leaves - 1):
            tree.resolve_polytomy(**bifurcation_kws)

    # Node prefix
    if node_prefix is not None:
        tree = name_tree_nodes(tree, node_prefix=node_prefix)
    if not convert_to_skbio:
        return tree
    # skbio
    else:
        return ete_to_skbio(tree, node_prefix=None)
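
The bifurcation step at the end of create_tree() in isolation (toy tree): resolve_polytomy() replaces every multifurcation with a ladder of bifurcations joined by zero-length branches:

import ete3

t = ete3.Tree("((A,B,C,D),E);", format=9)   # one multifurcating node
t.resolve_polytomy(recursive=True)
print(t.write(format=9))                    # now strictly bifurcating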
Code example #23
# input variables
# phy_fn = "set_raxml.newick"
# out_fn = "set_raxml.out_ete"
phy_fn = sys.argv[1]
out_fn = sys.argv[2]

# logging
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s [%(levelname)-5.5s]\t%(message)s",
                    handlers=[
                        logging.FileHandler("%s.log" % out_fn, mode="w"),
                        logging.StreamHandler()
                    ])

# load input
phy = ete3.PhyloTree(phy_fn)
logging.info("Phylogeny = %s" % phy_fn)
logging.info("Nodes = %i" % len(phy))

# assign species names to tree
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
phy_sps = [n.species for n in phy.get_leaves()]
phy_sps_set = set(phy_sps)
phy_seq = [n.name for n in phy.get_leaves()]

# check if tree is rooted, apply midpoint root if unrooted
phy_root = phy.get_tree_root()
phy_outg = phy_root.get_children()
is_root = len(phy_outg) == 2
if is_root:
    pass
Code example #24
            "n_presences": mat_pres.sum(axis=1)
        },
        columns=[
            "orthogroup", "presence", "gain", "loss", "n_gains", "n_losses",
            "n_presences"
        ])

    # output
    return dat, mat_gain, mat_loss, mat_pres


#### MAIN WORK ####

# load input tree
print("# Load tree from %s" % phs_fn)
phs = ete3.PhyloTree("%s" % (phs_fn), format=1)
# assign species names to tree
phs.set_species_naming_function(lambda node: node.name)
# resolve polytomies in a random fashion
#phs.resolve_polytomy(recursive=True)

# load orthoclusters
print("# Load orthoclusters from %s" % ort_fn)
ort = pd.read_csv(ort_fn, sep="\t")
ort = ort[[gene_col, clus_col]]

# obtain species-to-species dictionary of relative ages
print("# Species-to-species relative ages from %s" % phs_fn)
species_orig_dict, species_ages_dict, sps_list, anc_list = do_species_orig_dict(
    phs=phs)
dat, mat_gain, mat_loss, mat_pres = do_ancestral_reconstruction(
Code example #25
# input variables
# phy_fn = "set_raxml.newick"
# out_fn = "set_raxml.out_ete"
phy_fn = sys.argv[1]
out_fn = "%s.out_ete" % phy_fn.split(sep=".")[0]

# logging
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s [%(levelname)-5.5s]\t%(message)s",
                    handlers=[
                        logging.FileHandler("%s.log" % out_fn, mode="w"),
                        logging.StreamHandler()
                    ])

# load input
phy = ete3.PhyloTree("%s" % (phy_fn))
logging.info("Phylogeny = %s" % phy_fn)
logging.info("Nodes = %i" % len(phy))

# assign species names to tree
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
phy_sps = [n.species for n in phy.get_leaves()]
phy_sps_set = set(phy_sps)
phy_seq = [n.name for n in phy.get_leaves()]

# check if tree is rooted, apply midpoint root if unrooted
phy_root = phy.get_tree_root()
phy_outg = phy_root.get_children()
is_root = len(phy_outg) == 2
if is_root:
    pass
Code example #26
File: possom_dev.py  Project: xgrau/auto-orthology
def parse_phylo(phy_fn, phy_id, is_root):

	# load input
	phy = ete3.PhyloTree("%s" % (phy_fn))
	logging.info("%s num nodes = %i" % (phy_id,len(phy)))
	# assign species names to tree
	phy.set_species_naming_function(lambda node: node.name.split(split_ch)[0] )
	# resolve polytomies in a random fashion
	phy.resolve_polytomy(recursive=True)
	# check if tree is rooted, apply midpoint root if unrooted
	# NOT APPLIED: GLOBAL VARIABLE USED INSTEAD
	# phy_root = phy.get_tree_root()
	# phy_outg = phy_root.get_children()
	# is_root  = len(phy_outg) == 2
	if is_root:
		logging.info("%s Tree is rooted, pass" % phy_id)
	else: 
		logging.info("%s Tree is unrooted, apply midpoint root" % phy_id)
		phy_outgroup = phy.get_midpoint_outgroup()
		phy.set_outgroup(phy_outgroup)

	# ladderise phylogeny
	phy.ladderize()

	# list of genes in phylogeny
	phy_lis = phy.get_leaf_names()

	# find evolutionary events (duplications and speciations)
	evev = phy.get_descendant_evol_events(sos_thr=sos)

	# speciation events
	evs    = np.empty((len(evev)*len(evev), 5), dtype="object")
	evs[:] = np.nan
	n = 0
	for ev in evev:
		if ev.etype == "S":
			for ii in ev.in_seqs:
				for oi in ev.out_seqs:
					evs[n,0] = ii
					evs[n,1] = oi
					evs[n,2] = ev.branch_supports[0]
					evs[n,3] = ev.etype
					evs[n,4] = ev.sos
					n = n + 1
	evs = pd.DataFrame(evs).dropna()
	evs.columns = ["in_gene","out_gene","branch_support","ev_type","sos"]

	# duplications
	evd    = np.empty((len(evev)*len(evev), 5), dtype="object")
	# evd[:] = np.nan
	# n = 0
	# for ev in evev:
	# 	if ev.etype == "D":
	# 		for ii in ev.in_seqs:
	# 			for oi in ev.out_seqs:
	# 				evd[n,0] = ii
	# 				evd[n,1] = oi
	# 				evd[n,2] = ev.branch_supports[0]
	# 				evd[n,3] = ev.etype
	# 				evd[n,4] = ev.sos
	# 				n = n + 1
	# evd = pd.DataFrame(evd).dropna()
	# evd.columns = ["in_gene","out_gene","branch_support","ev_type","sos"]

	return evs, evd, phy, phy_lis