def improve_tree_from_align (tree, align, if_tre = None, of_tre = None, a_file = None, prefix = "/tmp/", n_threads = 4, params=None):
    """Run ``iqtree`` on an alignment with ``-g <tree file>`` and return the resulting tree.

    Parameters
    ----------
    tree : tree object or None
        If truthy, it is written (``format=1``) to the input tree file before
        the run. Pass ``None`` to reuse a tree file already on disk.
    align : alignment records or None
        If truthy, written in FASTA format to the alignment file before the
        run. Pass ``None`` to reuse an alignment file already on disk.
    if_tre, of_tre, a_file : str or None
        Paths of the input tree, output tree and alignment files. Each
        defaults to a fixed file name under ``prefix``.
    prefix : str or None
        Prepended verbatim to the default file names (``None`` means ``"./"``).
    n_threads : int
        Value passed to iqtree's ``-nt`` option.
    params : str or None
        Extra iqtree command-line options; a default model/branch-length
        setting is used when ``None``.

    Returns
    -------
    ete3.Tree
        Tree parsed from the first line of the output tree file, with quote
        characters and any ``[&R]`` tag removed.

    Notes
    -----
    Missing inputs are only reported with ``print``; execution continues.
    The files written or produced by iqtree are left on disk.
    """
    if (tree is None) and (if_tre is None):
        print ("ERROR: You must give me a tree object or file")
    if (align is None) and (a_file is None):
        print ("ERROR: You must give me an alignment or file")
    if params is None: params = "-m HKY+G -me 0.05 -blmin 0.000005 -blmax 4"
    if prefix is None: prefix = "./"
    ifl = prefix + "in_iqtre.tre" if if_tre is None else if_tre
    ofl = prefix + "out_iqtre.tre" if of_tre is None else of_tre
    aln = prefix + "seq.aln" if a_file is None else a_file

    if align: SeqIO.write(align, aln, "fasta") ## else it should be present in infile
    if tree:  tree.write(format=1, outfile=ifl) ## to recycle the file make sure tree and align are None

    # NOTE(review): paths and params are interpolated unquoted into a shell
    # command, so they must not contain spaces or shell metacharacters.
    runstr = f"iqtree -g {ifl} -s {aln} {params} -ninit 1 -nt {n_threads} -redo; mv {aln}.treefile {ofl}"
    subprocess.check_output(runstr, shell=True, universal_newlines=True)

    # Close the output file deterministically instead of leaking the handle.
    with open(ofl) as handle:
        treestring = handle.readline().rstrip()
    for unwanted in ("'", '"', "[&R]"):
        treestring = treestring.replace(unwanted, "")
    return ete3.Tree (treestring)
Example #2
0
def check_supersets(tree):
    """Regroup children of ``tree`` according to how their ``mutations`` sets relate.

    Every ordered pair of distinct children ``(c1, c2)`` is examined:

    * if one child's ``mutations`` is a subset of the other's, the subset
      child is detached and re-attached under the other child;
    * afterwards, if the two sets share any element, both children are
      detached and placed under a new intermediate node (built with
      ``instantiate_node``) whose ``mutations`` is the shared set; that node
      is added as a child of ``tree``.

    The function then recurses into ``c1`` unless ``moved`` is already True
    (``or`` short-circuits).

    Returns True if a subset/superset re-attachment happened here or a
    recursive call returned True; returns False for a leaf.

    NOTE(review): the overlap branch restructures the tree but never sets
    ``moved``, and it also runs right after a subset move on the same pair --
    confirm both are intended.
    NOTE(review): ``detach``/``add_child`` are called while both loops iterate
    ``tree.children``; if that attribute is the live child list, iteration may
    skip or revisit nodes -- verify.
    """
    if tree.is_leaf():
        return False

    moved = False
    for c1 in tree.children:
        for c2 in tree.children:
            # Skip pairing a child with itself.
            if c1 == c2:
                continue
            # Nest the child with the smaller (contained) mutation set under the other.
            if c1.mutations.issubset(c2.mutations):
                c1.detach()
                c2.add_child(c1)
                moved = True
            elif c2.mutations.issubset(c1.mutations):
                c2.detach()
                c1.add_child(c2)
                moved = True

            # Shared mutations: hang both children under a new node carrying the shared set.
            overlap = c1.mutations.intersection(c2.mutations)
            if len(overlap) > 0:
                c1.detach()
                c2.detach()
                intermediate = instantiate_node(ete3.Tree(name='NoName'))
                intermediate.mutations = overlap
                intermediate.add_child(c1)
                intermediate.add_child(c2)
                tree.add_child(intermediate)
            # Recursion is skipped once ``moved`` is True because of short-circuiting.
            moved = moved or check_supersets(c1)

    return moved
Example #3
0
    def grab_trees(self, db=None, collapse=True):
        """Read this scan's trees and intervals from the PyTables file.

        Parameters
        ----------
        db : open PyTables file or None
            File to read from. When ``None``, ``self.h5name`` is opened
            read-only here and closed again before returning, even if reading
            fails. A file passed in by the caller is never closed.
        collapse : bool
            When true, every non-leaf descendant whose ``dist`` is below
            ``self.branch_cutoff`` is removed with ``node.delete()``.

        Returns
        -------
        (list of ete3.Tree, list of tuple)
            Trees parsed from ``/<scanName>/trees`` and the first two fields
            of each row of ``/<scanName>/ivals``.
        """
        owns_db = db is None
        if owns_db:
            db = tables.open_file(self.h5name, mode="r")
        try:
            base = "/" + self.scanName
            trees = [
                ete3.Tree(raw.decode('utf-8'))
                for raw in db.get_node(base + "/trees", classname="Array")
            ]
            intvals = [(row[0], row[1])
                       for row in db.get_node(base + "/ivals",
                                              classname="Array")]
        finally:
            # Only close what we opened; previously an exception leaked the handle.
            if owns_db:
                db.close()

        if collapse:
            for tree in trees:
                for node in tree.get_descendants():
                    if not node.is_leaf() and node.dist < self.branch_cutoff:
                        node.delete()

        return trees, intvals
 def load_tree(self, nhx_fn):
     """Parse ``nhx_fn`` with ete3 (``format=1``) into ``self.tree``.

     Any node whose name is the empty string is renamed ``node_<i>``, where
     ``i`` is that node's position in ``self.tree.traverse()``.
     """
     self.tree = ete3.Tree(nhx_fn, format=1)
     for position, current in enumerate(self.tree.traverse()):
         if current.name != "":
             continue
         current.name = "node_" + str(position)
Example #5
0
 def get_tree_with_famsizes(self):
     """Build ``self.tree`` annotated with family sizes, p-values and events.

     ``self.tree`` is a deep copy of ``self.c.tree``. It is walked in parallel
     with the tree parsed from ``self.nwk_famsize_str``: each node gets
     ``fam_size`` (from the part after the first ``_`` of a leaf name, or from
     ``support`` for internal nodes) and ``event = None``; all sizes are also
     collected in ``self.fam_sizes``.

     The tokens found in ``self.branch_pvalue_str`` are then paired with
     ``self.c.cafe_node_id_order``. A node's ``pvalue`` is ``None`` when its
     token is ``-`` or ``self.pvalue`` exceeds ``self.c.family_p_cutoff``;
     otherwise it is the token as a float, and when that value is at most
     ``self.c.branch_p_cutoff`` the node's ``event`` becomes ``'+'`` or ``'-'``
     depending on whether its size is larger or smaller than its parent's.
     """
     self.fam_sizes = []
     size_tree = ete3.Tree(self.nwk_famsize_str)
     self.tree = copy.deepcopy(self.c.tree)

     # Family sizes: both trees are traversed in lockstep.
     paired_nodes = zip(self.tree.traverse(), size_tree.traverse())
     for target, sized in paired_nodes:
         raw_size = sized.name.split('_')[1] if sized.is_leaf() else sized.support
         target.fam_size = int(raw_size)
         self.fam_sizes.append(target.fam_size)
         target.event = None

     # Branch p-values, in the node-id order supplied by self.c.
     pvalue_tokens = re.findall(r'[\d\.]+|-', self.branch_pvalue_str)
     for node_id, token in zip(self.c.cafe_node_id_order, pvalue_tokens):
         matches = self.tree.search_nodes(id=node_id)
         assert len(matches) == 1
         hit = matches[0]
         if token == '-' or self.pvalue > self.c.family_p_cutoff:
             hit.pvalue = None
             continue
         hit.pvalue = float(token)
         if not hit.pvalue <= self.c.branch_p_cutoff:
             continue
         if hit.fam_size > hit.up.fam_size:
             hit.event = '+'
         elif hit.fam_size < hit.up.fam_size:
             hit.event = '-'
Example #6
0
def fit_clustering_model(args):
    """Estimate per-organism clustering parameters and reduce them to global means.

    Per-organism ``lambda`` and ``phi`` values come from
    ``estimate_parameters.estimate_individual_parameters(args)``. When
    ``args["weight"] is True`` the tree in ``args["tree_file"]`` is loaded and
    the means are weighted with ``GSC.GSC_normalised``; otherwise plain
    arithmetic means of the values are used.

    Side effects: stores the results in ``args["lambd"]`` and ``args["phi"]``
    and saves them to ``args["general_parameters_filename"]``.

    Returns
    -------
    (lambd, phi)
        The two global mean parameter values.
    """
    logger.info("Started estimating clustering model parameters "
                "for each organism")
    lambdas, phis = estimate_parameters.estimate_individual_parameters(args)

    logger.info("Computing global mean estimated parameter values")

    if args["weight"] is True:
        logger.info("Importing phylogenetic tree %s", args["tree_file"])
        tree = ete3.Tree(args["tree_file"])
        logger.info("Phylogenetic tree %s imported", args["tree_file"])
        organism_weights = GSC.GSC_normalised(tree)
        lambd = estimate_parameters.compute_weighted_means(
            lambdas, organism_weights)
        phi = estimate_parameters.compute_weighted_means(
            phis, organism_weights)
    else:
        # dict.values() is a view on Python 3; np.mean needs a real sequence.
        lambd = np.mean(list(lambdas.values()))
        phi = np.mean(list(phis.values()))

    args["lambd"], args["phi"] = lambd, phi
    logger.info("Global mean estimated parameter "
                "values: lambda={:.3g}, phi={:.3g}".format(
                    args["lambd"], args["phi"]))
    import_export_parameters. \
        save_general_parameters(lambd, phi, args["general_parameters_filename"])
    logger.info("Global mean estimated parameter values saved in file %s",
                args["general_parameters_filename"])

    return lambd, phi
def distances_between_roots(folder):
    """Measure, for each pair of trees in ``folder``, how far apart their roots are.

    Trees are read from the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` for ``i = 1, 2, ...``
    until a file is missing. For every pair ``(tree1, tree2)`` the first child
    of ``tree2``'s root that is either a leaf or monophyletic in ``tree1`` is
    located in ``tree1``, and the topological distance from ``tree1``'s root
    to that node is recorded.

    Returns
    -------
    list
        One distance per pair for which such a node was found. Pairs where no
        child of ``tree2`` can be located are skipped (previously this raised
        ``NameError`` or silently reused the node from an earlier pair).
    """
    trees = []
    count = 1
    while True:
        path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(path):
            break
        with open(path) as handle:
            trees.append(ete3.Tree(handle.readlines()[1]))
        count += 1

    root_distances = []
    for tree1, tree2 in combinations(trees, 2):
        new_root = None  # reset per pair so a stale node is never reused
        for child in tree2.children:
            if child.is_leaf():
                new_root = tree1.get_leaves_by_name(name=child.name)[0]
                break
            leaf_names = child.get_leaf_names()
            is_monophyletic, _clade_type, _offenders = tree1.check_monophyly(
                leaf_names, 'name', unrooted=False)
            if is_monophyletic:
                new_root = tree1.get_common_ancestor(leaf_names)
                break
        if new_root is None:
            continue
        root_distances.append(tree1.get_distance(new_root, topology_only=True))

    return root_distances
def find_omitted_seqs_in_al(genetreelist,
                            ancestor,
                            rootdir='.',
                            subtreesdir='subtreesGoodQualO2'):
    """Report subtrees whose alignment and tree do not contain the same sequences.

    For each alignment file yielded by ``iter_glob_subtree_files`` (suffix
    ``_genes.fa``) the matching ``.nwk`` tree is loaded and the alignment
    record names are compared with the tree's leaf names. Names present on
    only one side are printed, a warning is printed when both sides have
    extras, and a final ok/bad count is printed. Returns ``None``.
    """
    count_ok = 0
    count_bad = 0
    for alfile, subtree, genetree in iter_glob_subtree_files(
            genetreelist, ancestor, '_genes.fa', rootdir, subtreesdir):
        al = AlignIO.read(alfile, format='fasta')
        subtreefile = alfile.replace('_genes.fa', '.nwk')
        tree = ete3.Tree(subtreefile, format=1)

        al_seqnames = {record.name for record in al}
        tree_leaves = set(tree.get_leaf_names())

        # Message labels are kept as in the original output format.
        al_lacking = al_seqnames - tree_leaves
        tree_lacking = tree_leaves - al_seqnames
        if al_lacking:
            count_bad += 1
            print('%s:AL lacks: %s' % (subtree, ' '.join(al_lacking)))
        elif tree_lacking:
            count_bad += 1
            # Fixed: this branch used to print al_lacking, which is always
            # empty here, so the offending names were never shown.
            print('%s:TREE lacks: %s' % (subtree, ' '.join(tree_lacking)))
        else:
            count_ok += 1

        if al_lacking and tree_lacking:
            print('AL & TREE mismatched!!!')

    print('%s ok; %s bad' % (count_ok, count_bad))
Example #9
0
def root_tree_in_position(tree_file, sp1, sp2):
    """Re-root a tree on the common ancestor of two named nodes and print it.

    The tree is parsed (``format=1``) from the first line of ``tree_file``.

    * If the root does not have exactly two children, a message is printed,
      the tree is rooted on its first leaf, printed, and the function stops.
    * Otherwise the nodes named ``sp1`` and ``sp2`` are looked up, the tree is
      rooted on their common ancestor and the Newick string is printed.

    Every failure (node not found, cannot root there) is reported with
    ``print``. The function always returns ``None``.
    """
    with open(tree_file) as f:
        t = ete3.Tree(f.readline().strip(), format=1)

    if len(t.get_children()) != 2:
        print(
            "Tree is not rooted. Please, use this tree as input and run this script again"
        )
        t.set_outgroup(t.get_leaves()[0])
        print(t.write(format=1))
        return None

    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    try:
        nsp1 = t & sp1
    except Exception:
        print("Could not find %s in tree" % sp1)
        return None
    try:
        nsp2 = t & sp2
    except Exception:
        print("Could not find %s in tree" % sp2)
        return None

    ca = t.get_common_ancestor(nsp1, nsp2)
    try:
        t.set_outgroup(ca)
    except Exception:
        print("Cannot root there! Is it already rooted in that branch?")
        return None

    print(t.write(format=1))
Example #10
0
def read_newick(newick_path):
	"""Read the file at ``newick_path`` and return its stripped content as an ``ete3.Tree``."""
	# ``with`` closes the file even if reading raises.
	with open(newick_path) as newick_file:
		newick = newick_file.read().strip()

	return ete3.Tree(newick)
def recursive_generate_topologies_bifurcating_unique(neighbour_list = None, tree_list = None, tree_id_set_ete = None):
    """
    This generates all possible bifurcating trees for the neighbour_list of *non-repeating* taxa.
    It uses the ete3 method get_topology_id to ensure that only unique topology trees are kept.
    It only works in reasonable time if len(neighbour_list) is <=7 or perhaps 8.

    Parameters
    ----------
    neighbour_list : list of str, optional
        Taxa (or already-joined Newick fragments) still to be paired.
        Defaults to ``['1', '2', '3']``.
    tree_list : list, optional
        Accumulator that receives the Newick strings; a fresh list is created
        per top-level call when omitted.
    tree_id_set_ete : set, optional
        Accumulator of topology ids already seen; a fresh set is created per
        top-level call when omitted.

    Returns
    -------
    list of str
        ``tree_list`` (the same object when one was passed in).
    """
    # Fresh accumulators per call: mutable defaults would leak results
    # from one top-level call into the next.
    if neighbour_list is None:
        neighbour_list = ['1', '2', '3']
    if tree_list is None:
        tree_list = []
    if tree_id_set_ete is None:
        tree_id_set_ete = set()

    if len(neighbour_list) == 2:
        new_tree = "(%s,%s);" %(neighbour_list[0], neighbour_list[1])
        new_tree_ete_id = ete3.Tree(newick = new_tree).get_topology_id()
        if new_tree_ete_id not in tree_id_set_ete:
            tree_list.append(new_tree)
            tree_id_set_ete.add(new_tree_ete_id)
    else:
        for i in range(len(neighbour_list)):
            for j in range(i):
                # j < i always holds here, so element i is removed first.
                remaining = list(neighbour_list)
                left = remaining.pop(remaining.index(neighbour_list[i]))
                right = remaining.pop(remaining.index(neighbour_list[j]))
                # NOTE(review): the original recursed through
                # `recursive_generate_uniquetopologies`, which is not defined
                # in the visible source; assumed to be a stale name for this
                # function (identical keyword arguments) -- confirm.
                recursive_generate_topologies_bifurcating_unique(
                    ["(%s,%s)" %(left, right)] + remaining,
                    tree_list = tree_list,
                    tree_id_set_ete = tree_id_set_ete)
                if len(tree_list) % 1000 == 0:
                    print("Generated %d trees" %(len(tree_list)))
    return tree_list
def parse_and_clean_tree(n_pairs, args):
    """Process the probabilities files that have not been handled yet.

    The file list is ``args["final_probabilities_filename"]`` followed by the
    files found in ``args["subclade_clustering_dir"]`` matching
    ``args["final_probabilities_pattern"]``. If
    ``load_processed_files_list`` reports that some of them are still
    unprocessed, the labelled tree is loaded and ``process_child_node`` is run
    over every node name (level order) through ``MyMultiProcess``.
    Returns ``None``.
    """
    probability_files = [args["final_probabilities_filename"]]
    probability_files = probability_files + file_utilities.get_file_list(
        args["subclade_clustering_dir"],
        args["final_probabilities_pattern"],
        verbose=True)
    n_files = len(probability_files)

    processed_files = load_processed_files_list(n_files, args)
    pending = n_files - np.sum(processed_files)
    if not pending > 0:
        return

    logger.info(
        "%s subclade-specific conserved clustering "
        "probabilities files have to be parsed",
        pending)

    processed_files = manager.list(processed_files)
    tree = ete3.Tree(args["labelled_tree_file"], format=1)
    subclades_to_process = []
    for node in tree.traverse('levelorder'):
        subclades_to_process.append(node.name)

    worker_pool = MyMultiProcess(threads=args["threads"],
                                 target=process_child_node,
                                 input=subclades_to_process,
                                 args=[tree, processed_files, n_pairs, args],
                                 destroy=True)
    worker_pool.run()
Example #13
0
def preorder(m):
    """Time one complete preorder traversal with the library named by ``m``.

    ``m`` is one of ``'dendropy'``, ``'biophylo'``, ``'treeswift'`` or
    ``'ete3'``; any other value trips an assertion. The tree is parsed from
    the module-level ``treestr`` / ``treeio`` before the clock starts, so only
    the traversal itself is timed. Returns the elapsed time as
    ``time()`` difference.
    """
    if m == 'dendropy':
        parsed = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        for _ in parsed.preorder_node_iter():
            pass
        ended = time()
    elif m == 'biophylo':
        parsed = Phylo.read(treeio, 'newick')
        began = time()
        for _ in parsed.find_clades(order='preorder'):
            pass
        ended = time()
    elif m == 'treeswift':
        parsed = read_tree_newick(treestr)
        began = time()
        for _ in parsed.traverse_preorder():
            pass
        ended = time()
    elif m == 'ete3':
        parsed = ete3.Tree(treestr,format=1)
        began = time()
        for _ in parsed.traverse(strategy='preorder'):
            pass
        ended = time()
    else:
        assert False, "Invalid tool: %s"%m
    elapsed = ended - began
    return elapsed
Example #14
0
def mrca(m):
    """Time a most-recent-common-ancestor query over all leaves with library ``m``.

    ``m`` is one of ``'dendropy'``, ``'biophylo'``, ``'treeswift'`` or
    ``'ete3'``; any other value trips an assertion. Parsing (from the
    module-level ``treestr`` / ``treeio``) happens before the clock starts;
    collecting the leaves and the ancestor query are both timed. Returns the
    elapsed time as a ``time()`` difference.
    """
    if m == 'dendropy':
        parsed = dendropy.Tree.get(data=treestr, schema='newick')
        began = time()
        tips = {leaf.taxon for leaf in parsed.leaf_node_iter()}
        parsed.mrca(taxa=tips)
        ended = time()
    elif m == 'biophylo':
        parsed = Phylo.read(treeio, 'newick')
        began = time()
        tips = parsed.get_terminals()
        parsed.common_ancestor(tips)
        ended = time()
    elif m == 'treeswift':
        parsed = read_tree_newick(treestr)
        began = time()
        tips = {str(leaf) for leaf in parsed.traverse_leaves()}
        parsed.mrca(tips)
        ended = time()
    elif m == 'ete3':
        parsed = ete3.Tree(treestr,format=1)
        began = time()
        tips = parsed.get_leaf_names()
        parsed.get_common_ancestor(tips)
        ended = time()
    else:
        assert False, "Invalid tool: %s"%m
    elapsed = ended - began
    return elapsed
Example #15
0
    def test_mark_dups_below_v2(self):
        """Exercise ``mark_dups_below_v2`` of a ``t2o.HogWriter`` on three gene trees.

        The writer is built from the ``species_tree()`` fixture with empty
        id mappings. After each call the ``dups_below`` attribute of the
        returned tree's nodes is checked.
        """
        seq_ids = defaultdict(str)
        sp_ids = defaultdict(str)
        sp_tree, sp_tree_node_names = species_tree()
        hogw = t2o.HogWriter(sp_tree, sp_tree_node_names, seq_ids, sp_ids)
        empty_set = set()

        # Case 1: the gene_tree() fixture -- every internal node must end up
        # with an empty dups_below set.
        gt = gene_tree()
        gt = hogw.mark_dups_below_v2(gt)
        self.assertTrue(
            all(n.dups_below == empty_set for n in gt.traverse()
                if not n.is_leaf()))

        # Case 2: the gene_tree_dup_at_root_N12() fixture.
        gt = gene_tree_dup_at_root_N12()
        gt = hogw.mark_dups_below_v2(gt)
        # root has a duplication
        self.assertEqual({'N12'}, gt.dups_below)
        # none of the other nodes do
        self.assertTrue(
            all(n.dups_below == empty_set for n in gt.traverse()
                if not (n.is_leaf() or n.is_root())))

        # Case 3: a tree built inline; the root is expected to report {'N3'}.
        # NOTE(review): this Newick literal has five '(' but only four ')'.
        # Presumably a closing parenthesis is missing (after "7_7)"?) --
        # confirm against the intended topology.
        gt = ete3.Tree("((1_0, (12_0, 7_7),(5_0,(12_1, 6_0)));")
        gt = hogw.mark_dups_below_v2(gt)
        self.assertEqual({'N3'}, gt.dups_below)
Example #16
0
def from_ClusterNode(root):
    """
    Build an ETE tree that mirrors a scipy ClusterNode hierarchy.

    Parameters
    ----------
    root : scipy.cluster.hierarchy.ClusterNode instance or None
        Top node of the hierarchy to convert.

    Returns
    -------
    tree : ete3.Tree instance or None
        Node named after ``root.get_id()`` with ``dist`` set to ``root.dist``
        and the converted left and right subtrees (in that order) as
        children; ``None`` when ``root`` is ``None``.
    """
    if root is None:
        return None

    converted = ete3.Tree(name=root.get_id(), dist=root.dist)

    # Left subtree first, then right, each converted recursively.
    for fetch_subtree in (root.get_left, root.get_right):
        subtree = fetch_subtree()
        if subtree:
            converted.add_child(from_ClusterNode(subtree))

    return converted
Example #17
0
def main(argv):
    """Build and display a tree from a PRISM adversary file and state file.

    Parameters
    ----------
    argv : list of str
        ``[program, adversary_file, state_file]``. With any other length a
        usage message is printed and nothing else happens.

    Every 4-field line of the adversary file becomes an
    ``AdversaryTransition``, grouped by its first field. The starting state is
    the id (text before ``:``) of the last line in the state file that
    contains ``(1,1,1,0,0,0,4,4,4,false,0)``. If no such line exists a message
    is printed and the function returns (previously this raised
    ``NameError``). Otherwise the tree is built with ``buildTreeDepthFirst``
    and shown. Returns ``None``.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if len(argv) != 3:
        print("invalid arguments")
        print("please specify the prism adversary file location and then the prism state file")
        return

    adversaryTransitionDict = {}
    with open(argv[1], 'r') as fin:
        for line in fin:
            lineItems = line.split()
            if len(lineItems) == 4:
                a = AdversaryTransition(lineItems[0], lineItems[1],
                                        lineItems[2], lineItems[3])
                adversaryTransitionDict.setdefault(lineItems[0], []).append(a)

    startingState = None
    with open(argv[2], 'r') as fin:
        for line in fin:
            if '(1,1,1,0,0,0,4,4,4,false,0)' in line:
                startingState = line.split(':')[0]

    if startingState is None:
        print("could not find state (1,1,1,0,0,0,4,4,4,false,0) in %s" % argv[2])
        return

    print(startingState)
    t = ete3.Tree()
    buildTreeDepthFirst(adversaryTransitionDict, t, startingState)
    t.show()
Example #18
0
 def validate_newick(newick):
     """Return True if ete3 can build a tree from ``newick``, False if it raises ``NewickError``."""
     try:
         ete3.Tree(newick)
     except NewickError:
         is_valid = False
     else:
         is_valid = True
     return is_valid
Example #19
0
def allTopos(branches, _topos=None, _topo_IDs=None):
    """Enumerate the distinct topologies obtainable by repeatedly joining two branches.

    ``branches`` must hold between 4 and 8 entries (asserted on every call).
    Each pair of branches is merged into a nested two-element list; once only
    three branches remain the nested list is converted with ``listToNwk``,
    parsed by ete3 and kept if its ``get_topology_id()`` has not been seen.
    Otherwise the function recurses on the reduced list.

    ``_topos`` and ``_topo_IDs`` are the accumulators shared by the recursive
    calls; if either is ``None`` both are started afresh. Returns the list of
    ``ete3.Tree`` objects collected.
    """
    if _topos is None or _topo_IDs is None:
        _topos, _topo_IDs = [], set()
    assert 4 <= len(
        branches) <= 8, "Please specify between 4 and 8 unique taxon names."

    n_branches = len(branches)
    for first in range(n_branches - 1):
        for second in range(first + 1, n_branches):
            # Merge branch `second` into branch `first` on a private copy.
            joined = list(branches)
            joined[first] = [joined[first], joined.pop(second)]

            if len(joined) != 3:
                # Still unresolved: keep joining.
                _topos = allTopos(joined, _topos, _topo_IDs)
                continue

            candidate = ete3.Tree(listToNwk(joined))
            topo_id = candidate.get_topology_id()
            if topo_id not in _topo_IDs:
                _topos.append(candidate)
                _topo_IDs.add(topo_id)
    return _topos
Example #20
0
def parse_newick(newick_file: str,
                 remove_zero_edges: bool = True,
                 epsilon: float = _EPSILON) -> NumpyRootedTree:
    """Parse a Newick tree into a ``NumpyRootedTree``.

    Nodes are ordered leaves first, then internal nodes, each group in
    postorder. Heights are the largest root distance minus each node's own
    root distance. When ``remove_zero_edges`` is true the result is passed
    through ``_remove_zero_edges_func`` with ``epsilon``.
    """
    ete_tree = ete3.Tree(newick_file)

    # Partition the postorder walk; equivalent to a stable sort on "is internal".
    leaves, internals = [], []
    for node in ete_tree.traverse("postorder"):
        (leaves if node.is_leaf() else internals).append(node)
    nodes = leaves + internals

    position_of = {node: position for position, node in enumerate(nodes)}
    # The last node has no parent entry.
    parent_indices = np.array([position_of[node.up] for node in nodes[:-1]])

    root_distances = np.array(
        [ete_tree.get_distance(node) for node in nodes],
        dtype=DEFAULT_FLOAT_DTYPE_NP)  # TODO: Optimise
    heights = max(root_distances) - root_distances

    taxon_count = (len(nodes) + 1) // 2
    taxon_names = [node.name for node in nodes[:taxon_count]]
    result = NumpyRootedTree(
        heights=heights,
        parent_indices=parent_indices,
        taxon_set=DictTaxonSet(taxon_names),
    )
    if remove_zero_edges:
        result = _remove_zero_edges_func(result, epsilon=epsilon)

    return result
Example #21
0
    def generate_history(self):
        """Replay the growth of ``self.T`` and return one snapshot per added node.

        Starting from an empty ``ete3.Tree``, every node of ``self.T`` except
        the first in level order is deep-copied, stripped of its children and
        attached under the node of the growing tree whose name equals the
        copied node's parent name. After each addition a deep copy of the
        growing tree is wrapped in ``Tree(..., name=None, params=None)`` and
        appended to the returned list.
        """
        snapshots = []
        growing = ete3.Tree()

        # Skip the first node yielded by the level-order walk.
        originals = itertools.islice(self.T.traverse("levelorder"), 1, None)
        for original in originals:
            # Work on a copy so the original tree is left untouched.
            clone = copy.deepcopy(original)
            for child in clone.get_children():
                clone.remove_child(child)

            # First node in the growing tree that carries the parent's name.
            parent_matches = growing.iter_search_nodes(name=clone.up.name)
            next(parent_matches).add_child(clone)

            frozen = copy.deepcopy(growing)
            snapshots.append(Tree(frozen, name=None, params=None))

        return snapshots
def _leaf_phyla(leaf_names):
    """Return the set of phylum ids (as strings) looked up for ``leaf_names``."""
    # The lookup key is the part of each leaf name before the first '_'.
    # A list is used because pandas 2.x rejects set indexers in .loc.
    prefixes = sorted({leaf.split('_')[0] for leaf in leaf_names})
    taxids = genbank_summary.loc[prefixes, 'taxid']
    phyla = lineages.loc[taxids, 'phylum'].values.astype('int').astype('str')
    return set(phyla.tolist())


def assess_separation_ranger(folder):
    """Tell whether any tree in ``folder`` splits phyla 1117 and 1224 at its root.

    Trees are read from the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` for ``i = 1, 2, ...``
    until a file is missing. Phyla are resolved through the module-level
    ``genbank_summary`` and ``lineages`` tables.

    Returns
    -------
    int
        1 if some tree contains exactly the phyla ``'1117'`` and ``'1224'``
        and each of its first two root children holds a single phylum;
        0 otherwise (including when no tree file exists).
    """
    trees = []
    count = 1
    while True:
        path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(path):
            break
        with open(path) as handle:
            trees.append(ete3.Tree(handle.readlines()[1]))
        count += 1

    for tree in trees:
        if _leaf_phyla(tree.get_leaf_names()) != {'1117', '1224'}:
            continue

        child1_phyla = _leaf_phyla(tree.children[0].get_leaf_names())
        child2_phyla = _leaf_phyla(tree.children[1].get_leaf_names())
        if len(child1_phyla) == len(child2_phyla) == 1:
            return 1

    return 0
 def __init__(self, newick):
   """Parse ``newick`` with ete3 (``format = 0``), unroot it and build from it.

   ``self.all_nodes`` is reset to an empty list, the size of the parsed tree
   is printed, and ``self.rec_build`` is called on the tree; its result is
   asserted to equal ``None``.
   """
   parsed = ete3.Tree(newick, format = 0)
   parsed.unroot()
   self.all_nodes = []
   leaf_count = len(parsed)
   print("Number of leaves in the ete3 tree: " + str(leaf_count))
   build_result = self.rec_build(parsed, True)
   assert(build_result == None)
def assess_tree_balance_ranger(folder):
    """Collect root-balance ratios and reconciliation event counts for ``folder``.

    For ``i = 1, 2, ...`` the tree on the second line of
    ``<folder>/<folder>.optResolution<i>.ranger_input`` is loaded until a file
    is missing. Its balance ratio is the size of the smaller root child
    divided by the size of the larger one. Tree ``i`` is paired with
    reconciliation files ``(i-1)*20+1 .. i*20``
    (``<folder>/<folder>.reconciliation<j>``); for each of them the ratio is
    recorded together with the number of transfer and duplication lines.

    Returns
    -------
    dict
        ``balance_ratios``, ``num_transfers``, ``num_duplications`` (parallel
        lists), plus ``num_internal_nodes`` and ``dtl_cost`` taken from the
        last reconciliation file read.

    Raises
    ------
    ValueError
        If no reconciliation was read (previously a ``NameError``).
    """
    ratios = []
    num_transfers = []
    num_duplications = []
    reconciliation = None
    count = 1
    sub_count = 1
    while True:
        tree_path = '%s/%s.optResolution%i.ranger_input' % (folder, folder, count)
        if not os.path.isfile(tree_path):
            break
        with open(tree_path) as handle:
            tree = ete3.Tree(handle.readlines()[1])
        children = sorted(tree.children, key=len)
        ratio = len(children[0]) / float(len(children[1]))
        # sub_count keeps increasing across trees: 20 reconciliations per tree.
        while sub_count <= count * 20:
            with open('%s/%s.reconciliation%i' % (folder, folder, sub_count)) as handle:
                reconciliation = handle.read()
            ratios.append(ratio)
            num_transfers.append(len(re.findall(
                r'Transfer, Mapping --> \S+, Recipient --> \S+$', reconciliation, re.M)))
            num_duplications.append(len(re.findall(
                r'Duplication, Mapping --> \S+$', reconciliation, re.M)))
            sub_count += 1

        count += 1

    if reconciliation is None:
        raise ValueError('no ranger input/reconciliation files found for %s' % folder)

    num_internal_nodes = len(re.findall(r'^m\d+ = LCA', reconciliation, re.M))
    dtl_cost = re.search(r'The minimum reconciliation cost is: (\d+)', reconciliation).group(1)

    return {'balance_ratios':ratios, 'num_transfers':num_transfers, 'num_duplications':num_duplications,
            'num_internal_nodes':num_internal_nodes, 'dtl_cost':int(dtl_cost)}
Example #25
0
class NewickTreeDatum(Data):
    """``Data`` subclass describing values whose Python type is ``ete3.Tree``.

    NOTE(review): how the ``Data`` base class consumes these attributes is not
    visible here; the comments below describe only what each attribute holds.
    """
    # String identifier of this datum type.
    type_name = "newicktree"
    # Python class of the values this datum represents.
    native_type = ete3.Tree
    # Builds an ete3.Tree from its argument, always parsing with format=1.
    cast_function = lambda x: ete3.Tree(x, format=1
                                        )  # TODO: support other formats
    # Serialises a tree via its write() method with default arguments.
    db_cast_function = lambda x: x.write()
    # Text field holding the value.
    value = models.TextField()
Example #26
0
def shuffle_main(args):
    """Shuffle topology, branch lengths and/or leaf labels of a tree and write it out.

    ``args`` provides ``infile`` and ``format`` (passed to ``read_tree`` /
    ``write_tree``) and the switches ``topology``, ``branch_length`` and
    ``label``:

    * ``topology``: replace the tree by a randomly populated tree with the
      same number of leaves, branch lengths sampled from the shuffled
      originals (root set to 0) and the original leaf names.
    * ``topology`` or ``branch_length``: reassign shuffled branch lengths to
      all non-root nodes.
    * ``label``: randomly permute the leaf names.

    Finally the distance between the original and the shuffled tree is
    reported with ``print_rf_dist`` and the result is written with
    ``write_tree``. Returns ``None``.
    """
    tree = read_tree(args.infile, args.format)
    # Keep an independent snapshot: branch lengths and labels are shuffled in
    # place, so a plain alias would be compared with itself below.
    # NOTE(review): assumes read_tree returns an ete3 tree (which has copy()).
    tree_original = tree.copy()
    if args.topology:
        num_leaf = len(tree.get_leaves())
        new_tree = ete3.Tree()
        new_tree.populate(size=num_leaf)
        num_new_branch = sum(1 for _ in new_tree.traverse())
        branch_lengths = get_shuffled_branch_lengths(nodes=tree)
        population = branch_lengths + branch_lengths  # Random tree may have more branches
        new_branch_lengths = random.sample(population=population,
                                           k=num_new_branch)
        new_branch_lengths[0] = 0  # Root node
        for n, new_branch_length in zip(new_tree.traverse(),
                                        new_branch_lengths):
            n.dist = new_branch_length
        leaf_names = tree.get_leaf_names()
        for leaf, leaf_name in zip(new_tree.get_leaves(), leaf_names):
            leaf.name = leaf_name
        tree = new_tree
    # Logical "or" (was bitwise "|"): same result for booleans.
    if args.topology or args.branch_length:
        nodes = [n for n in tree.traverse() if not n.is_root()]
        branch_lengths = get_shuffled_branch_lengths(nodes)
        for node, new_bl in zip(nodes, branch_lengths):
            node.dist = new_bl
    if args.label:
        leaf_names = tree.get_leaf_names()
        random.shuffle(leaf_names)
        for leaf, new_name in zip(tree.get_leaves(), leaf_names):
            leaf.name = new_name
    print_rf_dist(tree1=tree_original, tree2=tree)
    write_tree(tree, args, format=args.format)
Example #27
0
def treeCheck(treePath, alnf, optionTree):
    """
    Check that the species tree is usable.

    An empty ``treePath`` is returned untouched. If the path does not exist,
    ``buildSpeciesTree(treePath, alnf)`` is tried; if the path exists, the
    file must be loadable by ete3. On failure in either case a warning is
    logged, the path is reset to "" and the option is set to False.

    @param1 treePath: tree's path
    @param2 alnf: alignment's path
    @param3 optionTree: Boolean (treerecs option)
    @return1 treePath: tree's path ("" when the tree cannot be used)
    @return2 optionTree: Boolean (treerecs option)
    """
    logger = logging.getLogger("main")
    if treePath == "":
        return treePath, optionTree

    if not os.path.exists(treePath):
        try:
            treePath = buildSpeciesTree(treePath, alnf)
        except Exception:
            logger.warning("The path for the species tree doesn't exist.")
            logger.warning("Duplication option will be set to False.")
            optionTree = False
            treePath = ""
    else:
        # ete3 signals an unreadable tree by raising, not by returning a
        # falsy object, so the parse itself has to be guarded.
        try:
            usable = bool(ete3.Tree(treePath))
        except Exception:
            usable = False
        if not usable:
            logger.warning("The species tree is corrupted.")
            logger.warning("Duplication option will be set to False.")
            optionTree = False
            treePath = ""

    return treePath, optionTree
Example #28
0
def gene_tree():
    """Return the three-leaf tree ``((1_0, 5_1),4_3);`` annotated for tests.

    Every node carries the feature ``dup = False``; the parent of leaf
    ``4_3`` additionally carries ``sp_node = 'N3'``.
    """
    tree = ete3.Tree("((1_0, 5_1),4_3);")  # (C. elegans, Drosophila),Danio
    for node in tree.traverse():
        node.add_feature('dup', False)
    parent_of_4_3 = (tree & '4_3').up
    parent_of_4_3.add_feature('sp_node', 'N3')
    return tree
Example #29
0
def gather_tree_family_genus(genus_dict, tree_path=None):
    '''Average genus sizes per family using the MEGAN genus tree.

    Parameters
    ----------
    genus_dict : dict
        Maps a genus name to a sequence whose first element is its size.
    tree_path : str, optional
        Newick file (read with ``format=1``) whose leaves are genera and whose
        leaf parents are families. Defaults to the MEGAN genus tree path that
        was previously hard-coded.

    Returns
    -------
    (family_final, genus_family_map)
        ``family_final`` maps each family to ``[mean size, 0]`` over its
        genera found in ``genus_dict``, or ``[50, 0]`` when none was found;
        ``genus_family_map`` maps each leaf name to its parent's name.
    '''
    if tree_path is None:
        tree_path = "/ebio/abt6_projects9/metagenomic_controlled/Programs/metagenomics_pipeline/data/megan_genus_tree_10_2_2018.tre"
    # Marker stored for a family whose first-seen genus has no known size;
    # it is filtered out again before averaging.
    placeholder = 49.99999

    megan_tree = ete3.Tree(tree_path, format=1)
    # Every leaf is a genus; its closest ancestor is taken as the family.
    genus_family_map = {
        leaf.name: leaf.get_ancestors()[0].name
        for leaf in megan_tree.get_leaves()
    }

    family_average = {family: [] for family in set(genus_family_map.values())}
    for genus, fam in genus_family_map.items():
        try:
            size = genus_dict[genus][0]
        except KeyError:
            if len(family_average[fam]) > 0:
                print(family_average[fam])
            else:
                family_average[fam].append(placeholder)
        else:
            family_average[fam].append(size)

    family_final = {}
    for family, sizes in family_average.items():
        known_sizes = [size for size in sizes if size != placeholder]
        if known_sizes:
            family_final[family] = [np.mean(known_sizes), 0]
        else:
            # No genus of this family has a known size: fall back to 50.
            family_final[family] = [50, 0]
    return family_final, genus_family_map
Example #30
0
def test_scoring(GSC_function):
    """Test a function implementing the GSC scoring algorithm (GSCfunction) on the tree used as an example in the paper.
    Note: the weights are assessed as before the normalisation to average=1 step described in the paper.

    Parameters
    ----------
    GSC_function : callable
        Called with an ``ete3.Tree``; must return a mapping from leaf name to
        score.

    Returns
    -------
    bool
        True when every returned score equals the expected one exactly.

    Raises
    ------
    ValueError
        If ``GSC_function`` is not callable.
    """
    # Validate first: reading __name__ beforehand raised AttributeError for
    # most non-callables, so this ValueError could never be reached.
    function_name = getattr(GSC_function, "__name__", repr(GSC_function))
    if not callable(GSC_function):
        raise ValueError(
            "The provided input function {} is not a function.".format(
                function_name))

    import ete3

    newick_tree = "(D:80,(C:50,(A:20,B:20)two:30)three:30);"
    tree = ete3.Tree(newick_tree, format=1)

    scores = GSC_function(tree)

    correct_scores = {'A': 43.75, 'C': 62.5, 'B': 43.75, 'D': 80.0}

    correct = 0

    for key in sorted(scores):
        print("Leaf {}: Correct score: {} - Score obtained by function {}: {}".
              format(key, correct_scores[key], function_name, scores[key]))
        if correct_scores[key] == scores[key]:
            correct += 1

    print("{}% scores were correctly predicted ({} out of {})".format(
        float(correct) / len(scores) * 100, correct, len(scores)))

    return correct == len(scores)