Example #1
0
def get_tree_object_in_newick(tree, id_to_sample_dict=None):
    """Return a newick string (written with format=1) mirroring a binary tree.

    `tree` is the root of a hierarchy whose nodes expose `left`, `right`,
    `dist`, `id` and `is_leaf()`. Every child in the mirrored tree gets half
    of its source parent's `dist` as branch length. Leaves are named through
    `id_to_sample_dict` when it is given and non-empty, otherwise by their
    raw id; internal nodes are named 'Int<id>'.
    """
    newick_root = Tree()
    newick_root.dist = 0
    newick_root.name = "root"

    # Depth-first walk over (source node, mirrored node) pairs.
    pending = [(tree, newick_root)]
    while pending:
        source_parent, mirror_parent = pending.pop()
        branch_length = source_parent.dist / 2.0

        for source_child in (source_parent.left, source_parent.right):
            if not source_child:
                continue

            mirror_child = Tree()
            mirror_child.dist = branch_length

            if not source_child.is_leaf():
                mirror_child.name = 'Int' + str(source_child.id)
            elif id_to_sample_dict:
                mirror_child.name = id_to_sample_dict[source_child.id]
            else:
                mirror_child.name = source_child.id

            mirror_parent.add_child(mirror_child)
            pending.append((source_child, mirror_child))

    # Nodes whose children are all zero-length leaves get those children
    # reordered by name (descending); every other node is left untouched.
    for inner in newick_root.traverse("preorder"):
        if inner.is_leaf():
            continue

        offspring = inner.get_children()
        if any(not kid.is_leaf() or kid.dist > 0 for kid in offspring):
            continue

        inner.children = sorted(offspring, key=lambda kid: kid.name, reverse=True)

    return newick_root.write(format=1)
def createPseudonodes(node):
    """Recursively group children of multifurcating nodes by dominant taxon.

    Children are processed first (post-order). For a node with more than two
    children, each child is classified as 'eukaryota' when its `taxonomy`
    mapping holds an 'eukaryota' value >= 0.5, and as 'prokaryota' otherwise.
    When both classes occur, every class with more than one member is moved
    under a newly created intermediate node ("pseudonode").

    Args:
        node: tree node exposing `is_leaf()`, `get_children()`,
            `add_child()`, `remove_child()`; children expose `dist` and
            `taxonomy`.

    Returns:
        The same `node`, modified in place.
    """
    if node.is_leaf():
        return node
    for child in node.get_children():
        createPseudonodes(child)
    if len(node.get_children()) > 2:
        dDominantTaxon2Children = {}
        for child in node.get_children():
            sDominantTaxon = 'prokaryota'
            if 'eukaryota' in child.taxonomy and child.taxonomy[
                    'eukaryota'] >= 0.5:
                sDominantTaxon = 'eukaryota'
            dDominantTaxon2Children.setdefault(sDominantTaxon,
                                               []).append(child)
        if len(dDominantTaxon2Children) > 1:
            for lDominantTaxonChildren in dDominantTaxon2Children.values():
                if len(lDominantTaxonChildren) > 1:
                    newChild = Tree()
                    # Half of the shortest branch currently below `node`.
                    # This is re-evaluated per group, so it also sees any
                    # pseudonode added for a previous group.
                    newChild.dist = min(
                        ch.dist for ch in node.get_children()) / 2.
                    for child in lDominantTaxonChildren:
                        child.dist -= newChild.dist
                        # Detach from the old parent BEFORE attaching to the
                        # pseudonode: remove_child() resets child.up to None,
                        # which would otherwise wipe the parent pointer that
                        # add_child() has just set.
                        node.remove_child(child)
                        newChild.add_child(child)
                    node.add_child(newChild)
    return node
Example #3
0
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist, \
    prefix_output = None):
    """Cluster `items` with single linkage and return the result as a Tree.

    Args:
        items: objects handed to `dist_matrix` together with `dist_fn`.
        cache_clustering_file: open file object holding a pickled `[Y, Z]`
            pair from a previous run. When given, the distance matrix and
            linkage are loaded from it instead of being recomputed.
        dist_fn: distance function forwarded to `dist_matrix`.
        prefix_output: string prepended to "clustering_dump.pkl" to form the
            path of the cache written after a fresh computation. `None` is
            treated as an empty prefix.

    Returns:
        Root `Tree` node named "root". Every child is named after the id of
        its cluster node and placed at half of its parent's cluster distance.
    """
    if not cache_clustering_file:
        print("Generating distance matrix...")
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)

        print("Linkage clustering...")
        sys.stdout.flush()
        Z = linkage(Y, "single")  # average, complete = max, single = min ?

        print("Dumping clustering information into cache file")
        sys.stdout.flush()
        # `prefix_output` defaults to None, which cannot be concatenated with
        # a string; fall back to an empty prefix in that case.
        dump_path = (prefix_output or "") + "clustering_dump.pkl"
        # Binary mode is the portable choice for pickles, and the context
        # manager guarantees the handle is flushed and closed.
        with open(dump_path, "wb") as dump_file:
            cPickle.dump([Y, Z], dump_file)

    else:
        print("Loading clustering cache from '%s'" % cache_clustering_file.name)
        # NOTE(review): unpickling executes arbitrary code; only load cache
        # files that come from a trusted source.
        Y, Z = cPickle.load(cache_clustering_file)

    print("Converting into ETE tree...")
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"

    # Walk (cluster node, tree node) pairs so cluster nodes never have to be
    # used as dictionary keys (they are not guaranteed to be hashable).
    to_visit = [(T, root)]
    while to_visit:
        node, tree_node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                tree_node.add_child(ch)
                to_visit.append((ch_node, ch))

    return root
Example #4
0
def createNode():
    """Build a fresh domain node with its mandatory fields already set.

    The node is named 'placeholder', has branch length 0 and carries the
    features pos=0 and event='SPECIATION'.
    """
    placeholder = Tree()
    placeholder.name = 'placeholder'
    placeholder.dist = 0
    placeholder.add_features(pos=0, event='SPECIATION')
    return placeholder
def ASR_parser(args):
    """Turn an IQ-TREE ancestral reconstruction into a collapsed tree/forest.

    Reads the newick tree at `args.tree` and the "name,count" lines of
    `args.counts`, maps the reconstructed sequences onto the tree, reroots it
    on `args.naive`, recomputes branch lengths as hamming distances to the
    parent, and writes three outputs based on `args.outbase`: an SVG
    rendering ('.svg'), a newick tree ('.tree') and a pickled
    CollapsedForest ('.p').

    Args:
        args: namespace providing `tree`, `counts`, `asr_seq`, `leaf_seq`,
            `naive`, `name`, `colormap`, `idmap` and `outbase`.

    Raises:
        TreeFileParsingError: if the input tree cannot be read.
    """
    try:
        import cPickle as pickle
    except ImportError:
        # Only a missing module should trigger the fallback.
        import pickle
    from GCutils import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree, format=1)
    except Exception as e:
        print(e)
        raise TreeFileParsingError('Could not read the input tree. Is this really newick format?')

    # Each line is "<sequence name>,<count>"; blank lines are skipped.
    counts = {}
    with open(args.counts) as counts_fh:
        for line in counts_fh:
            if not line.strip():
                continue
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])

    tree.add_feature('frequency', 0)       # Placeholder will be deleted when rerooting
    tree.add_feature('sequence', 'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, args.leaf_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree, pattern=args.naive)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    iqtree_tree = CollapsedTree(tree=tree, name=args.name)
    # Add colors:
    if args.colormap is not None:
        # NOTE(review): unpickling executes arbitrary code; the colormap and
        # idmap files must come from a trusted source.
        with open(args.colormap, 'rb') as fh:
            colormap = pickle.load(fh)
        with open(args.idmap, 'rb') as fh:
            id_map = pickle.load(fh)
        # Reverse the id_map:
        id_map = {cs: seq_id for seq_id, cell_ids in id_map.items() for cs in cell_ids}
        # Expand the colormap and map to sequence ids:
        colormap_seqid = dict()
        for key, color in colormap.items():
            # A key is either one cell id (a string) or a collection of cell
            # ids. A lone string is wrapped so that an unknown id is skipped
            # instead of being iterated character by character.
            cell_ids = (key,) if isinstance(key, str) else key
            for cell_id in cell_ids:
                if cell_id in id_map:
                    colormap_seqid[id_map[cell_id]] = color
        colormap = colormap_seqid
    else:
        colormap = None
    iqtree_tree.render(args.outbase + '.svg', colormap=colormap)
    iqtree_forest = CollapsedForest(forest=[iqtree_tree], name=args.name)
    # Dump tree as newick:
    iqtree_forest.write_random_tree(args.outbase+'.tree')
    print('number of trees with integer branch lengths:', iqtree_forest.n_trees)

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(iqtree_forest, f)

    print('Done parsing IQ-TREE tree')
Example #6
0
def build_conv_topo(annotated_tree, vnodes):
      """Build a topology in which the convergent subtree appears twice.

      Works on a deep copy of `annotated_tree`. The common ancestor of all
      nodes tagged T=True is detached and deep-copied; the original (A) is
      pruned to its leaves tagged C=False, the copy (B) to its leaves tagged
      C=True, and both are hung under a new "dup_point" node that takes the
      ancestor's place. Finally every node's ND feature is renumbered in
      postorder.

      Args:
          annotated_tree: tree whose nodes carry the features T, C and ND
              that are read below.
          vnodes: container of ND values; copied nodes whose ND is not in it
              (and that are not the copy's root) get a near-zero branch.

      Returns:
          The root of the rebuilt tree.
      """

      tconv = annotated_tree.copy(method="deepcopy")
      # Tag leaves (L=1) and mark every node as "original" (COPY=0).
      for n in tconv.iter_leaves():
        n.add_features(L=1)
      for n in tconv.traverse():
        n.add_features(COPY=0)
      # get the most recent ancestral node of all the convergent clades
      l_convergent_clades = tconv.search_nodes(T=True)
      common_anc_conv=tconv.get_common_ancestor(l_convergent_clades)

      # duplicate it at its same location (branch length = 0). we get
      # a duplicated subtree with subtrees A and B (A == B)

      dist_dup = common_anc_conv.dist  # NOTE(review): not used afterwards
      if not common_anc_conv.is_root():
        # Insert the duplication point as a sister so it stays attached to
        # the ancestor's parent once the ancestor is detached below.
        dup_point = common_anc_conv.add_sister(name="dup_point",dist=0.000001)
        dup_point_root = False
      else:
        # The ancestor is the root: the duplication point becomes a new,
        # free-standing node.
        dup_point = Tree()
        dup_point_root = True
        dup_point.dist=0.000001
      # NOTE(review): dup_point_root is assigned but never read in this function.

      dup_point.add_features(ND=0,T=False, C=False, Cz=False)

      # Detach first so the deep copy only contains the ancestor's subtree.
      common_anc_conv.detach()
      common_anc_conv_copy = common_anc_conv.copy(method="deepcopy")

      # tag duplicated nodes:

      for n in common_anc_conv_copy.traverse():
        n.COPY=1
        # Shrink branches of copied nodes whose ND is not listed in vnodes.
        if n.ND not in vnodes and not n.is_root():
            n.dist=0.000001

      # prune A (the original subtree) down to its leaves tagged C=False
      l_leaves_to_keep_A = common_anc_conv.search_nodes(COPY=0, C=False, L=1)
      #logger.debug("A: %s",l_leaves_to_keep_A)
      common_anc_conv.prune(l_leaves_to_keep_A, preserve_branch_length=True)

      # prune B (the copy) down to its leaves tagged C=True
      l_leaves_to_keep_B = common_anc_conv_copy.search_nodes(COPY=1, C=True, L=1)
      #logger.debug("B : %s", l_leaves_to_keep_B)
      common_anc_conv_copy.prune(l_leaves_to_keep_B, preserve_branch_length=True)


      # Hang both pruned subtrees under the duplication point (copy first).
      dup_point.add_child(common_anc_conv_copy)
      dup_point.add_child(common_anc_conv)

      tconv = dup_point.get_tree_root()

      # Renumber node ids in postorder over the rebuilt tree.
      nodeId = 0
      for node in tconv.traverse("postorder"):
          node.ND = nodeId
          nodeId += 1

      return tconv
Example #7
0
def birth(tree, node):
    """Attach two new, non-extinct, zero-length children to `node`.

    `tree` itself is not inspected; it is returned unchanged so callers can
    write `tree = birth(tree, node)`.
    """
    for _ in range(2):
        offspring = Tree()
        offspring.dist = 0
        offspring.add_features(extinct=False)
        node.add_child(offspring)
    return tree
Example #8
0
def initialise(rate):
    """Create a tree, apply one birth event and age the surviving leaves.

    A new non-extinct root with branch length 0.0 is created, a randomly
    chosen leaf of it goes through `birth`, and every non-extinct leaf is
    then extended by one waiting time drawn from
    `random.expovariate(rate)`.
    """
    root = Tree()
    root.add_features(extinct=False)
    root.dist = 0.0

    # The random draw is kept (even for a fresh tree) so the RNG stream
    # consumed by this function stays the same.
    chosen = random.choice(root.get_leaves())
    root = birth(root, chosen)

    current_leaves = root.get_leaves()
    waiting_time = random.expovariate(rate)
    for tip in current_leaves:
        if tip.extinct:
            continue
        tip.dist += waiting_time
    return root
Example #9
0
def createTree(n):
    """Return a zero-length root with `n` children built by `createNode`.

    Child number i is named "g0_<i>" and has both its 'position' feature and
    its `pos` attribute set to i. Children are linked by writing `up` and
    `children` directly.
    """
    root = Tree()
    root.dist = 0

    for index in range(n):
        leaf = createNode()
        leaf.add_feature('position', index)
        leaf.pos = index
        leaf.name = "g0_" + str(index)
        # Manual linking, equivalent in intent to attaching `leaf` below root.
        leaf.up = root
        root.children.append(leaf)

    return root
Example #10
0
def ASR_parser(args):
    """Turn an IgPhyML ancestral reconstruction into a collapsed forest.

    Reads the newick tree at `args.tree` and the "name,count" lines of
    `args.counts`, maps the reconstructed sequences onto the tree, reroots
    it, recomputes branch lengths as hamming distances to the parent, renders
    the collapsed tree to `args.outbase + '.svg'` and pickles the resulting
    CollapsedForest to `args.outbase + '.p'`.

    Args:
        args: namespace providing `tree`, `counts`, `asr_seq`, `naive` and
            `outbase`.

    Raises:
        TreeFileParsingError: if the input tree cannot be read.
    """
    try:
        import cPickle as pickle
    except ImportError:
        # Only a missing module should trigger the fallback.
        import pickle
    from gctree import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree)
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
        # are not turned into a parsing error.
        raise TreeFileParsingError(
            'Could not read the input tree. Is this really newick format?')

    # Each line is "<sequence name>,<count>"; blank lines are skipped.
    counts = {}
    with open(args.counts) as counts_fh:
        for line in counts_fh:
            if not line.strip():
                continue
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])

    tree.add_feature('frequency',
                     0)  # Placeholder will be deleted when rerooting
    tree.add_feature('sequence',
                     'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    igphyml_tree = CollapsedTree(tree=tree)
    igphyml_tree.render(args.outbase + '.svg')
    igphyml_forest = CollapsedForest(forest=[igphyml_tree])
    print('number of trees with integer branch lengths:',
          igphyml_forest.n_trees)

    # check for unifurcations at root
    unifurcations = sum(
        collapsed.tree.frequency == 0 and len(collapsed.tree.children) == 1
        for collapsed in igphyml_forest.forest)
    if unifurcations:
        print(
            'WARNING: {} trees exhibit unifurcation from root, which is not possible under current model. Such nodes will be ommitted from likelihood calculation'
            .format(unifurcations))

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(igphyml_forest, f)

    print('Done parsing IgPhyML tree')
Example #11
0
def build_tree(sequences, parents, counts=None, naive='naive'):
    """Assemble a rooted tree from name->sequence and child->parent maps.

    One node is created per entry of `sequences` (feature 'sequence'; and,
    only when `counts` is given, feature 'frequency' taken from `counts` or
    0). Nodes are linked according to `parents`; a name without a parent
    becomes the root. If `naive` is not None, the first node whose name
    contains `naive` must be a childless child of the root and is made the
    new root. Ambiguous bases are resolved by `disambiguate`, and branch
    lengths are set to the hamming distance to the parent's sequence.
    """
    # Stage 1: disconnected nodes, indexed by name.
    nodes = {}
    for name in sequences:
        fresh = Tree()
        fresh.name = name
        fresh.add_feature('sequence', sequences[name])
        if counts is not None:
            fresh.add_feature('frequency',
                              counts[name] if name in counts else 0)
        nodes[name] = fresh

    # Stage 2: wire children to parents; a parentless name is the root.
    for name in sequences:
        if name not in parents:
            tree = nodes[name]
        else:
            nodes[parents[name]].add_child(nodes[name])

    # Stage 3: reroot on naive.
    if naive is not None:
        naive_id = [label for label in nodes if naive in label][0]
        naive_node = nodes[naive_id]
        assert len(naive_node.children) == 0
        assert naive_node in tree.children
        tree.remove_child(naive_node)
        naive_node.add_child(tree)
        tree = naive_node

    # Stage 4: make random choices for ambiguous bases.
    tree = disambiguate(tree)

    # Stage 5: branch lengths; nothing sits above the root.
    tree.dist = 0
    for descendant in tree.iter_descendants():
        descendant.dist = gctree.hamming_distance(descendant.sequence,
                                                  descendant.up.sequence)

    return tree
Example #12
0
def build_tree(sequences, parents, counts=None, naive='naive'):
    """Assemble a rooted tree from name->sequence and child->parent maps.

    One node is created per entry of `sequences`, carrying the features
    'nuc_seq', 'aa_seq' (via `local_translate`) and 'frequency' (from
    `counts` when available, else 0). Nodes are linked according to
    `parents`. If `naive` is not None, the first node whose name contains
    `naive` must be childless and becomes the new root; a unifurcation left
    behind at its former parent is removed. Ambiguous bases are resolved by
    `disambiguate`, and branch lengths are set to the hamming distance to
    the parent's nucleotide sequence.
    """
    # Stage 1: disconnected nodes, indexed by name.
    nodes = {}
    for name in sequences:
        fresh = Tree()
        fresh.name = name
        fresh.add_feature('nuc_seq', sequences[name])
        fresh.add_feature('aa_seq', local_translate(sequences[name]))
        has_count = counts is not None and name in counts
        fresh.add_feature('frequency', counts[name] if has_count else 0)
        nodes[name] = fresh

    # Stage 2: wire children to parents; a parentless name is the root.
    for name in sequences:
        if name not in parents:
            tree = nodes[name]
        else:
            nodes[parents[name]].add_child(nodes[name])

    # Stage 3: reroot on naive.
    if naive is not None:
        naive_id = [label for label in nodes if naive in label][0]
        naive_node = nodes[naive_id]
        assert len(naive_node.children) == 0
        former_parent = naive_node.up
        former_parent.remove_child(naive_node)
        naive_node.add_child(former_parent)
        # Drop the former parent when rerooting left it with a single child.
        if len(former_parent.children) == 1:
            former_parent.delete(prevent_nondicotomic=False)
            survivor = former_parent.children[0]
            survivor.dist = hamming_distance(survivor.nuc_seq,
                                             naive_node.nuc_seq)
        tree = naive_node

    # Stage 4: make random choices for ambiguous bases.
    tree = disambiguate(tree)

    # Stage 5: branch lengths; nothing sits above the root.
    tree.dist = 0
    for descendant in tree.iter_descendants():
        descendant.dist = hamming_distance(descendant.nuc_seq,
                                           descendant.up.nuc_seq)

    return tree
Example #13
0
def partitionTreeSet(N):
    """Return a tuple of trees built recursively for the value `N`.

    For N == 1 the result is a single one-node tree. Otherwise, for every k
    in range(lam(N)), each tree produced for N-(k+1) is paired with each tree
    produced for k+1 under a new root. Every node carries `value` and `name`
    features (name is str(value)) and a face from `styleFace` on its
    branch top.
    """
    if N == 1:
        single = Tree(";", format=100)
        single.add_features(value=N, name=str(N))
        single.add_face(styleFace(single.name), column=0,
                        position="branch-top")
        return (single,)

    template = Tree(";", format=100)
    template.dist = 1

    combined = []
    for k in range(lam(N)):
        left_trees = partitionTreeSet(N - (k + 1))
        right_trees = partitionTreeSet(k + 1)

        for left_tree in left_trees:
            for right_tree in right_trees:
                left_tree.dist = 1
                right_tree.dist = 1

                joined = template.copy()
                joined.dist = 1
                joined.add_features(value=N, name=str(N))
                # Copies are attached so the sub-trees can be reused in
                # other pairings.
                joined.add_child(left_tree.copy())
                joined.add_child(right_tree.copy())
                joined.add_face(styleFace(joined.name), column=0,
                                position="branch-top")

                combined.append(joined)

    return tuple(combined)
Example #14
0
def parse_union_tree(history_1, history_2, base_tree_path, debug=False):
    """Merge two label histories defined over one base tree into a united tree.

    The base tree is read from `base_tree_path` (format=1) and its root is
    renamed "_baseInternal_30". Every branch (parent -> node) of the base
    tree is visited in preorder. The extra nodes that `history_1` and
    `history_2` place along that branch are added to the united tree one at
    a time as "internal_<k>" nodes, always taking the history whose pending
    node has the smaller computed distance first; then the base node itself
    is added with the remaining branch length. Every united node carries the
    features `history_1_label` and `history_2_label`.

    Args:
        history_1: tree whose nodes expose `.label`; nodes are looked up by
            the names used in the base tree.
        history_2: same as `history_1`, for the second history.
        base_tree_path: newick input handed to `Tree(..., format=1)`.
        debug: when true, progress is printed.

    Returns:
        The root of the united tree.

    Raises:
        IndexError: when a name lookup (`search_nodes(...)[0]`) finds no node.
    """
    base_tree = Tree(base_tree_path, format=1)
    # add for debugging
    base_tree.get_tree_root().name = "_baseInternal_30"
    united_tree = Tree()
    united_tree.dist = 0  # initialize distance to 0
    united_tree.get_tree_root().name = history_1.get_tree_root(
    ).name  # set the name of the root
    united_tree.add_feature("history_1_label", history_1.get_tree_root().label)
    united_tree.add_feature("history_2_label", history_2.get_tree_root().label)
    # Running counter used to name the nodes inserted along base branches.
    union_nodes_number = 0
    for original_node in base_tree.traverse(
            "preorder"
    ):  # traverse the tree in pre-order to assure that for any visited node, its parent from the base branch is already in the united tree
        original_parent = original_node.up
        if original_parent != None:  # will be none only in the case the original node is the root
            if debug:
                print("handled branch: (", original_node.name, ",",
                      original_parent.name, ")")
            curr_union_parent = united_tree.search_nodes(
                name=original_parent.name)[0]
            # --- locate where history 1 starts on this branch ---
            hist_1_done = True
            hist_1_curr_child = None
            hist_1_parent = history_1.search_nodes(name=original_parent.name)[
                0]  # need to check names consistency across the 3 trees
            for child in hist_1_parent.children:
                if len(base_tree.search_nodes(name=child.name)) == 0 and len(
                        child.search_nodes(name=original_node.name)
                ) > 0:  # if the child is a root in a tree that holds the original child node, then this child must be on the branch of interest
                    hist_1_curr_child = child
                    hist_1_done = False
                    break
            if hist_1_done:
                # No extra node on this branch: use the base node's
                # counterpart in history 1.
                hist_1_curr_child = history_1.search_nodes(
                    name=original_node.name)[0]
            hist_1_current_label = hist_1_curr_child.label

            # --- same lookup for history 2 ---
            hist_2_done = True
            hist_2_curr_child = None
            hist_2_parent = history_2.search_nodes(name=original_parent.name)[
                0]  # need to check names consistency across the 3 trees
            for child in hist_2_parent.children:
                if len(base_tree.search_nodes(name=child.name)) == 0 and len(
                        child.search_nodes(name=original_node.name)
                ) > 0:  # if the child is a root in a tree that holds the original child node, then this child must be on the branch of interest
                    hist_2_curr_child = child
                    hist_2_done = False
                    break
            if hist_2_done:
                hist_2_curr_child = history_2.search_nodes(
                    name=original_node.name)[0]
            hist_2_current_label = hist_2_curr_child.label

            # Consume the pending nodes of both histories along this branch.
            while not hist_1_done or not hist_2_done:

                # A finished history keeps an infinite distance so the other
                # history is always chosen in the comparison below.
                hist_1_dist = float("inf")
                hist_2_dist = float("inf")
                if not hist_1_done:  # if there is a node closer to the original node in history 1 -> add it to the united tree first
                    hist_1_dist = hist_1_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)
                if not hist_2_done:
                    hist_2_dist = hist_2_curr_child.get_distance(
                        original_parent.name) - curr_union_parent.get_distance(
                            original_parent.name)

                if debug:
                    if not hist_1_done:
                        print("history 1 has current child of ",
                              original_parent.name, ": ",
                              hist_1_curr_child.name, " with label: ",
                              hist_1_current_label,
                              " and distance from parent is: ", hist_1_dist)
                    if not hist_2_done:
                        print("history 2 has current child of ",
                              original_parent.name, ": ",
                              hist_2_curr_child.name, " with label: ",
                              hist_2_current_label,
                              " and distance from parent is: ", hist_2_dist)

                # first, check if now the two current children have the same name, and if this name is in the base tree - exit
                if hist_1_curr_child.name == hist_2_curr_child.name and len(
                        base_tree.search_nodes(
                            name=hist_1_curr_child.name)) > 0:
                    break

                # else, at least one of the histories has more than one step to go before reaching the bottom of the branch
                if hist_1_dist < hist_2_dist:  # add the node from history 1 and travel down to the next node in history 1
                    if debug:
                        print(
                            "adding child from history 1 which precedes to the one from history 2"
                        )
                        print("the label of the added node in history 1 is: ",
                              hist_1_curr_child.label)
                        print(
                            "the label of the added node in histroy 2 remains like papa: ",
                            hist_2_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_1_dist,
                        support=None)
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_curr_child.label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_current_label)
                    # Step down history 1; it is finished once the node just
                    # added does not have exactly one child.
                    hist_1_parent = hist_1_curr_child
                    if len(hist_1_parent.children) == 1:
                        hist_1_curr_child = hist_1_parent.children[0]
                    else:
                        hist_1_done = True
                    if debug:
                        print("united tree is now: \n", united_tree)
                        if hist_1_done:
                            print(
                                "history 1 on the handled branch is complete")
                        else:
                            print(
                                "history 1 on the handled branch isn't complete yet"
                            )

                else:  # add the node from history 2 and travel down to the next node in history 2
                    if debug:
                        print(
                            "adding child from history 2 which precedes to the one from history 1"
                        )
                        print("the label of the added node in history 2 is: ",
                              hist_2_curr_child.label)
                        print(
                            "the label of the added node in history 1 remains like papa: ",
                            hist_1_current_label)
                    curr_union_parent = curr_union_parent.add_child(
                        child=None,
                        name="internal_" + str(union_nodes_number),
                        dist=hist_2_dist)  # added as a new branch
                    curr_union_parent.add_feature("history_1_label",
                                                  hist_1_current_label)
                    curr_union_parent.add_feature("history_2_label",
                                                  hist_2_curr_child.label)
                    # Step down history 2, mirroring the history 1 branch above.
                    hist_2_parent = hist_2_curr_child
                    if len(hist_2_parent.children) == 1:
                        hist_2_curr_child = hist_2_parent.children[0]
                    else:
                        hist_2_done = True
                    if debug:
                        print("united tree is now: \n", united_tree)
                        if hist_2_done:
                            print(
                                "history 2 on the handled branch is complete")
                        else:
                            print(
                                "history 2 on the handled branch isn't complete yet"
                            )
                union_nodes_number += 1

            # now add the original node as the child of the current parent
            original_dist = original_node.dist
            # Branch length still left after the nodes inserted on this branch.
            residual = original_dist - curr_union_parent.get_distance(
                united_tree.search_nodes(name=original_parent.name)[0])
            curr_union_parent = curr_union_parent.add_child(
                child=None, name=original_node.name, dist=residual)
            curr_union_parent.add_feature(
                "history_1_label",
                history_1.search_nodes(name=original_node.name)[0].label)
            curr_union_parent.add_feature(
                "history_2_label",
                history_2.search_nodes(name=original_node.name)[0].label)

    return united_tree
        t = Tree()
        t.populate(N_LEAVES, random_branches=True, branch_range=(0.5, 1.5))
        print(get_tree_length(t))
        rescale_tree(
            t,
            float(EXPECTED_N_MUTATIONS_PER_BRANCH) * float(N_BRANCHES) /
            float(GENOME_LENGTH))
        print(get_tree_length(t))
        with open(f"{OUTPUT_FOLDER}/tree_{i}.tree", "w") as tree_out:
            tree_out.write(t.write())

    # create a root genome with given root frequencies, using an empty tree
    # this also does a phastSim simulation, but ignore it
    null_tree = Tree()
    null_tree.populate(0, names_library=["ref"])
    null_tree.dist = 0.0

    with open(f"{OUTPUT_FOLDER}/null_tree.tree", "w") as null_tree_file:
        null_tree_file.write(null_tree.write())

    os.system(f"""phastSim \
        --rootGenomeLength {GENOME_LENGTH} \
        --rootGenomeFrequencies {ROOT_GENOME_FREQUENCIES_STRING.replace("+", " ")} \
        --treeFile {OUTPUT_FOLDER}/null_tree.tree \
        --outpath {OUTPUT_FOLDER}/ \
        --outputFile my_ref \
        --createFasta {PHASTSIM_OPTIONS.replace("+", " ")} \
        --seed {np.random.randint(1000000000)}
        """)

    reference = SeqIO.read(f"{OUTPUT_FOLDER}/my_ref.fasta", format="fasta")
Example #16
0
def evolveAlongTree(host, guest, reverseMap, rootSequence, hmmfile,
                    emissionProbs, transmat):
    """
    Evolves a root sequence along an entire host tree, taking into account the domain level 
    events present in the guest tree (duplication, loss, speciation)

    Every host node receives a 'sequence' feature. Host nodes without an
    entry in reverseMap are evolved with evolveSequence; for the others the
    mapped guest nodes are temporarily cut out of the guest tree, processed
    with domainOrder, and then reconnected.

    Args:
        host (Tree)       : The host tree (ete3 format) inside which the guest tree evolved
        guest (Tree)      : The guest tree over which to evolve a sequence 
        reverseMap (dict) : mapping from nodes in the host node -> guest nodes 
        rootSequence (str): Initial sequence to evolve. Should contain sequence with ONE domain
        hmmfile (str )    : path to hmmfile used to identify domains
        emissionProbs     : matrix with dimensions (n x 20) where n is the length of 
                            the domain. Each row contains the probability of each 
                            aa appearing at that position (in pfam hmm order) 
        transmat          : passed through unchanged to evolveSequence and domainOrder
    """

    for node in host.traverse():
        node.add_feature('sequence', "")

    for hostNode in host.traverse():
        tempSequence = rootSequence if hostNode == host else hostNode.up.sequence

        #No events occured at this node
        if hostNode not in reverseMap:
            hostNode.sequence = evolveSequence(tempSequence, 0.05, hostNode.dist, \
                                    emissionProbs, hmmfile, transmat)
            continue

        allGuestNodes = reverseMap[hostNode]
        allGuestNodesSet = set(allGuestNodes)
        upAncestors, leafChildren = {}, {}

        try:
            # Cut the guest nodes mapped to this host node out of the guest
            # tree, remembering the severed links so they can be restored.
            for guestNode in allGuestNodes:
                if guestNode.up not in allGuestNodesSet:
                    upAncestors[guestNode] = guestNode.up
                    #pass positional information on from the previous species
                    if guestNode.up is not None:
                        guestNode.add_feature('position', guestNode.up.position)
                    guestNode.up = None
                if guestNode.children != [] and guestNode.children[
                        0] not in allGuestNodesSet:
                    leafChildren[guestNode] = guestNode.children
                    guestNode.children = []

            if hostNode != host:
                # Temporary root gathering the detached guest subtrees.
                t = Tree()
                t.dist = 0
                # Must be a real list: dict.keys() is a view on Python 3.
                t.children = list(upAncestors)
                for guestNode in upAncestors:
                    guestNode.up = t

            else:
                t = guest

            #Actually do the work
            tempSequence = domainOrder(tempSequence, .75, hmmfile, emissionProbs,
                                       t, hostNode.name, transmat)
            hostNode.sequence = tempSequence
        finally:
            #Reconnect all root and leaf nodes to the rest of the guest tree,
            #even if the processing above failed
            for node in upAncestors:
                node.up = upAncestors[node]
            for node in leafChildren:
                node.children = leafChildren[node]
Example #17
0
def birthDeathTree(birthRate, deathRate, treeHeight):
    """
    Generates a tree topology according to the birth-death model.

    Lineages are processed first-in first-out. For each lineage an event
    time is drawn with stats.exp; if it falls within the lineage's remaining
    height the lineage either splits into two children or is marked lost,
    otherwise the remaining height is credited to its branch. Lost nodes and
    single-child nodes are removed afterwards, and the surviving nodes are
    named "h0", "h1", ... in traversal order.

    Args:
        birthRate (float): birth rate 
        deathRate (float): death rate
        treeHeight (float): height budget given to the root lineage; each
                            event subtracts its event time from the budget
                            passed on to the children

    Returns:
        Tree: root of the generated tree. If the root never split, it is
              returned as a single node named "h0".
    """
    from collections import deque

    birthRate = float(birthRate)
    deathRate = float(deathRate)

    host = Tree()
    host.dist = 0
    # FIFO queue of (node, remaining height); deque gives O(1) pops.
    lineages = deque([(host, treeHeight)])

    while lineages:
        #waiting time is exp(b + d), P(b) = b/(b+d), P(d) = 1 - P(b)
        node, height = lineages.popleft()
        eventTime = stats.exp(1. / (1. / birthRate + 1. / deathRate))

        #event occurs
        if eventTime <= height:
            #duplication
            if np.random.random() < birthRate / (birthRate + deathRate):
                left = node.add_child(dist=eventTime)
                right = node.add_child(dist=eventTime)
                lineages.append((left, height - eventTime))
                lineages.append((right, height - eventTime))
            #loss: Remove from queue, delete node later (cleanup process)
            else:
                # A negative branch length marks the node as lost.
                node.dist *= -1
        #If no event occurs, credit remaining branch length to this node
        else:
            node.dist += height

    if not host.children:
        host.name = "h0"
        return host

    #remove lost nodes (iterate over a snapshot: the tree is modified here)
    for node in list(host.traverse()):
        if node.dist < 0:
            node.up.remove_child(node)

    #remove nodes with only one child (ensure full binary tree)
    for node in list(host.traverse()):
        if len(node.children) == 1:
            #This is the root node
            if node.up is None:
                host.children[0].dist += host.dist
                host = host.children[0]
                host.up = None
            else:
                # Splice the node out, passing its branch length to its child.
                parent = node.up
                child = node.children[0]
                child.dist += node.dist
                child.up = parent
                parent.remove_child(node)
                parent.children.append(child)

    nameCounter = 0
    for node in host.traverse():
        node.name = "h" + str(nameCounter)
        nameCounter += 1

    return host
Example #18
0
def parse_union_tree(history_1, history_2, base_tree_path, debug=False):
    """Merge two mapping histories of one base tree into a single united tree.

    The base tree is loaded from ``base_tree_path`` and walked in pre-order.
    For every branch (parent -> child) of the base tree, the single-child
    nodes that each history holds on that branch are interleaved by their
    distance from the branch's parent and appended to the united tree as
    ``internal_<k>`` nodes. The base child is then appended with the residual
    branch length. Every united node carries a ``history_1_label`` and a
    ``history_2_label`` feature.

    Args:
        history_1: Tree whose nodes expose ``label``; its nodes are matched
            to the base tree by name.
        history_2: Second tree of the same kind.
        base_tree_path: Newick source passed to ``Tree(..., format=1)``.
        debug: When true, print progress for every handled branch and dump
            the united tree at the end. It does not affect the result.

    Returns:
        The united tree.

    Raises:
        SystemExit: With code 1 when the base child cannot be found in
            ``history_2``, when both histories reach the base child while the
            merge loop is still running, or when the residual branch length
            is negative.
        IndexError: When a name lookup in ``history_1`` or in the united tree
            finds no node.
    """
    base_tree = Tree(base_tree_path, format=1)
    base_tree.get_tree_root().name = "root"
    united_tree = Tree()
    united_tree.dist = 0  # initialize distance to 0
    united_tree.get_tree_root().name = history_1.get_tree_root().name
    united_tree.add_feature("history_1_label", history_1.get_tree_root().label)
    united_tree.add_feature("history_2_label", history_2.get_tree_root().label)
    union_nodes_number = 0
    # Pre-order guarantees that the parent of any visited node has already
    # been added to the united tree.
    for original_node in base_tree.traverse("preorder"):
        original_parent = original_node.up
        if original_parent is None:  # only the root has no parent
            continue

        parent_name = original_parent.name.rstrip()
        node_name = original_node.name.rstrip()
        if debug:
            print("handled branch: (", original_node.name, ",",
                  original_parent.name, ")")

        # Top of the handled branch inside the united tree; the residual
        # branch length is measured from it once the merge loop is over.
        branch_top = united_tree.search_nodes(name=parent_name)[0]
        curr_union_parent = branch_top

        # --- locate the first node below the parent in history 1 ---
        hist_1_done = True
        hist_1_curr_child = None
        # need to check names consistency across the 3 trees
        hist_1_parent = history_1.search_nodes(name=parent_name)[0]
        for child in hist_1_parent.children:
            # A child that is absent from the base tree, has exactly one
            # child and contains the base child below it is a mapping node
            # created by breaking the handled branch.
            if (len(base_tree.search_nodes(name=child.name)) == 0
                    and len(child.get_children()) == 1
                    and len(child.search_nodes(name=original_node.name)) > 0):
                hist_1_curr_child = child
                hist_1_done = False
                break
        if hist_1_done:
            hist_1_curr_child = history_1.search_nodes(name=node_name)[0]
        hist_1_current_label = hist_1_curr_child.label

        # --- locate the first node below the parent in history 2 ---
        hist_2_done = True
        hist_2_curr_child = None
        # need to check names consistency across the 3 trees
        hist_2_parent = history_2.search_nodes(name=parent_name)[0]
        for child in hist_2_parent.children:
            if (len(base_tree.search_nodes(name=child.name)) == 0
                    and len(child.get_children()) == 1
                    and len(child.search_nodes(name=original_node.name)) > 0):
                hist_2_curr_child = child
                hist_2_done = False
                break
        if hist_2_done:
            try:
                hist_2_curr_child = history_2.search_nodes(name=node_name)[0]
            except IndexError:
                # An empty search result is the only failure expected here;
                # report it instead of exiting silently.
                print("error! node", node_name, "was not found in history 2")
                raise SystemExit(1)
        hist_2_current_label = hist_2_curr_child.label

        original_dist = original_node.dist

        while not hist_1_done or not hist_2_done:

            # both have reached the original child
            if (hist_1_curr_child.name == hist_2_curr_child.name
                    and hist_1_curr_child.name == original_node.name):
                print(
                    "error! original child wasn't recognized in the end of the loop"
                )
                raise SystemExit(1)

            # A finished history competes with the full length of its own
            # branch to the base child; an unfinished one competes with the
            # distance of its pending node below the current united parent.
            hist_1_dist = history_1.search_nodes(name=node_name)[0].dist
            hist_2_dist = history_2.search_nodes(name=node_name)[0].dist
            if not hist_1_done:
                hist_1_dist = hist_1_curr_child.get_distance(
                    original_parent.name) - curr_union_parent.get_distance(
                        original_parent.name)
            if not hist_2_done:
                hist_2_dist = hist_2_curr_child.get_distance(
                    original_parent.name) - curr_union_parent.get_distance(
                        original_parent.name)

            if debug:
                if not hist_1_done:
                    print("history 1 has current child of ",
                          original_parent.name, ": ",
                          hist_1_curr_child.name, " with label: ",
                          hist_1_current_label,
                          " and distance from parent is: ", hist_1_dist)
                if not hist_2_done:
                    print("history 2 has current child of ",
                          original_parent.name, ": ",
                          hist_2_curr_child.name, " with label: ",
                          hist_2_current_label,
                          " and distance from parent is: ", hist_2_dist)

            # first, check if now the two current children have the same
            # name, and if this name is in the base tree - exit
            if hist_1_curr_child.name == hist_2_curr_child.name and len(
                    base_tree.search_nodes(name=hist_1_curr_child.name)) > 0:
                break

            # else, at least one of the histories has more than one step to
            # go before reaching the bottom of the branch
            if hist_1_dist < hist_2_dist:
                # add the node from history 1 and travel down to the next
                # node in history 1
                if debug:
                    print(
                        "adding child from history 1 which precedes to the one from history 2"
                    )
                    print("the label of the added node in history 1 is: ",
                          hist_1_curr_child.label)
                    print(
                        "the label of the added node in history 2 remains like papa: ",
                        hist_2_current_label)
                curr_union_parent = curr_union_parent.add_child(
                    child=None,
                    name="internal_" + str(union_nodes_number),
                    dist=hist_1_dist,
                    support=None)
                curr_union_parent.add_feature("history_1_label",
                                              hist_1_curr_child.label)
                curr_union_parent.add_feature("history_2_label",
                                              hist_2_current_label)
                hist_1_parent = hist_1_curr_child
                if len(hist_1_parent.children) == 1:
                    hist_1_curr_child = hist_1_parent.children[0]
                    if hist_1_curr_child.name == original_node.name:
                        hist_1_done = True
                else:
                    # two children only occur when reaching a junction from
                    # the base tree
                    hist_1_done = True
                if debug:
                    if hist_1_done:
                        print("history 1 on the handled branch is complete")
                    else:
                        print(
                            "history 1 on the handled branch isn't complete yet"
                        )
            else:
                # add the node from history 2 and travel down to the next
                # node in history 2
                if debug:
                    print(
                        "adding child from history 2 which precedes to the one from history 1"
                    )
                    print("the label of the added node in history 2 is: ",
                          hist_2_curr_child.label)
                    print(
                        "the label of the added node in history 1 remains like papa: ",
                        hist_1_current_label)
                curr_union_parent = curr_union_parent.add_child(
                    child=None,
                    name="internal_" + str(union_nodes_number),
                    dist=hist_2_dist)  # added as a new branch
                curr_union_parent.add_feature("history_1_label",
                                              hist_1_current_label)
                curr_union_parent.add_feature("history_2_label",
                                              hist_2_curr_child.label)
                hist_2_parent = hist_2_curr_child
                if len(hist_2_parent.children) == 1:
                    hist_2_curr_child = hist_2_parent.children[0]
                    if hist_2_curr_child.name == original_node.name:
                        hist_2_done = True
                else:
                    hist_2_done = True
                if debug:
                    if hist_2_done:
                        print("history 2 on the handled branch is complete")
                    else:
                        print(
                            "history 2 on the handled branch isn't complete yet"
                        )
            # The counter advances after every added node regardless of
            # `debug`, so internal node names stay unique and the result
            # does not depend on the debug flag.
            union_nodes_number += 1

        # now add the original node as the child of the current parent
        travelled = curr_union_parent.get_distance(branch_top)
        residual = original_dist - travelled
        if residual < 0:
            print("error on residual computation for branch leading to ",
                  original_node.name)
            print("residual: ", residual)
            print("original_dist: ", original_dist)
            print(
                "curr_union_parent.get_distance(united_tree.search_nodes(name=original_parent.name.rstrip())[0]): ",
                travelled)
            raise SystemExit(1)
        curr_union_parent = curr_union_parent.add_child(
            child=None, name=original_node.name, dist=residual)
        # NOTE(review): the labels below are read from the *parent's* node in
        # each history, not from the added child's -- confirm this is intended.
        curr_union_parent.add_feature(
            "history_1_label",
            history_1.search_nodes(name=parent_name)[0].label)
        curr_union_parent.add_feature(
            "history_2_label",
            history_2.search_nodes(name=parent_name)[0].label)

    if debug:
        for node in united_tree.traverse("postorder"):
            print("node=", node.name)
            print("label in hist1=", node.history_1_label)
            print("label in hist2=", node.history_2_label)
            print("branch length=", node.dist)
    return united_tree
Example #19
0
def __gen_tree(**kwargs):
    """
    Run one attempt at simulating a birth-death tree (internal helper).

    Unlike the user-facing `gen_tree()`, this function may fail: depending
    on the parameters and on the random draws, every lineage can go extinct
    before a stopping criterion is met. In that case `None` is returned and
    the caller is expected to retry.

    No defaults are applied and no argument is validated here. The keys read
    from `kwargs` are `seed`, `birth`, `death`, `max_time`, `min_leaves`,
    `lam`, `prune` and `labels`; they are documented in `gen_tree()`.
    """

    # Seed the random number generators.
    utils.set_seeds(kwargs["seed"])

    # Events are drawn at the combined birth + death rate. The birth rate is
    # rescaled to a probability so that one uniform draw decides between
    # speciation and extinction; the death rate is not needed after this.
    total_rate = kwargs["birth"] + kwargs["death"]
    birth_prob = kwargs["birth"] / total_rate

    # The root starts extant with a zero-length branch, so the first event
    # always applies to it.
    sim_tree = Tree()
    sim_tree.dist = 0.0
    sim_tree.extinct = False

    # `elapsed` accumulates the waiting times of all simulated events. The
    # loop ends either with a tree or with `sim_tree` set to None once no
    # extant leaf is left.
    elapsed = 0.0
    while True:
        extant = __extant(sim_tree)

        # Waiting time until the next event, at a rate proportional to the
        # number of extant leaves.
        waiting_time = random.expovariate(len(extant) * total_rate)

        # An event that would happen after `max_time` is not carried out.
        elapsed += waiting_time
        max_time = kwargs["max_time"]
        if max_time and elapsed > max_time:
            break

        # The affected lineage is picked at random and always ends here; on
        # a birth it is replaced by its children.
        chosen = np.random.choice(extant)
        chosen.extinct = True
        if np.random.random() <= birth_prob:
            # At least two children, plus a Poisson(`lam`) surplus that
            # allows hard polytomies. Their branches start at zero and are
            # extended together with all other extant leaves below.
            n_children = 2 + np.random.poisson(kwargs["lam"])
            for _ in range(n_children):
                offspring = Tree()
                offspring.dist = 0
                offspring.extinct = False

                chosen.add_child(offspring)

        # Refresh the extant leaves (new children included, chosen node
        # excluded) and extend each branch by the waiting time, capped at
        # `max_time` when one is given.
        extant = __extant(sim_tree)
        for leaf in extant:
            grown = leaf.dist + waiting_time
            leaf.dist = min(grown, max_time) if max_time else grown

        # Every lineage died out: report the failure explicitly with None
        # rather than retrying here, and let the caller decide how often to
        # try again.
        if not extant:
            sim_tree = None
            break

        # Stop as soon as either requested criterion is satisfied.
        min_leaves = kwargs["min_leaves"]
        if (min_leaves and len(extant) >= min_leaves) or (
                max_time and elapsed >= max_time):
            break

    # Reject trees that ended up with two or fewer extant leaves.
    # NOTE(review): the original comment speaks of requiring "at least one
    # speciation event"; the `<= 2` threshold is kept exactly as it was.
    if sim_tree and len(__extant(sim_tree)) <= 2:
        sim_tree = None

    # Optionally drop extinct lineages; ete3's `prune()` receives the nodes
    # to keep.
    if kwargs["prune"] and sim_tree:
        sim_tree.prune(__extant(sim_tree))

    # Optionally label the tree before handing it back.
    if kwargs["labels"] and sim_tree:
        label_tree(sim_tree, kwargs["labels"], seed=kwargs["seed"])

    return sim_tree
Example #20
0
def disambiguate(tree: Tree, random_state=None) -> Tree:
    """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
    subtrees of consecutive ambiguity codes.

    The tree is modified in place: node sequences are rewritten so that every
    site holds a member of ``gctree.utils.bases``, the temporary ``cv``
    feature is removed again, the root's ``dist`` is set to 0 and every other
    node's ``dist`` is set to the Hamming distance to its parent's sequence.

    Args:
        tree: Tree whose nodes expose a ``sequence`` string.
        random_state: State passed to ``random.setstate``. When ``None``, the
            global ``random`` generator is seeded with the tree's newick
            string (``tree.write(format=1)``) instead.

    Returns:
        The same ``tree`` object that was passed in.
    """
    # Fix the state of the global `random` generator before any choice is made.
    if random_state is None:
        random.seed(tree.write(format=1))
    else:
        random.setstate(random_state)
    for node in tree.traverse():
        for site, base in enumerate(node.sequence):
            # Only sites holding something other than a plain base need work.
            if base not in gctree.utils.bases:

                # Boundary of the ambiguous subtree at this site: real leaves
                # and nodes whose base at `site` is already resolved.
                def is_leaf(node):
                    return (node.is_leaf()) or (node.sequence[site]
                                                in gctree.utils.bases)

                # First pass of Sankoff: compute cost vectors
                for node2 in node.traverse(strategy="postorder",
                                           is_leaf_fn=is_leaf):
                    base2 = node2.sequence[site]
                    # Start from a copy of the per-code vector so the shared
                    # table is not modified by the accumulation below.
                    node2.add_feature("cv", code_vectors[base2].copy())
                    if not is_leaf(node2):
                        # For each of the 5 candidate states, add the cheapest
                        # cost over every child's states.
                        for i in range(5):
                            for child in node2.children:
                                node2.cv[i] += min([
                                    sum(v) for v in zip(
                                        child.cv, cost_adjust[
                                            gctree.utils.bases[i]])
                                ])
                # Second pass: Choose base and adjust children's cost vectors
                if not node.is_root():
                    # Account for the base the parent holds at this site.
                    node.cv = [
                        sum(v) for v in zip(
                            node.cv, cost_adjust[node.up.sequence[site]])
                    ]
                # traverse evaluates is_leaf(node) after yielding node.
                # Resolving base makes is_leaf true; must get order before
                # making changes.
                preorder = list(
                    node.traverse(strategy="preorder", is_leaf_fn=is_leaf))
                for node2 in preorder:
                    # Nodes that are already resolved at this site are kept.
                    if node2.sequence[site] in gctree.utils.bases:
                        continue
                    # Pick uniformly at random among the minimum-cost states.
                    min_cost = min(node2.cv)
                    base_index = random.choice([
                        i for i, val in enumerate(node2.cv) if val == min_cost
                    ])
                    new_base = gctree.utils.bases[base_index]
                    # Adjust child cost vectors
                    if not is_leaf(node2):
                        for child in node2.children:
                            child.cv = [
                                sum(v)
                                for v in zip(child.cv, cost_adjust[new_base])
                            ]
                    # Write the chosen base into position `site`.
                    node2.sequence = (node2.sequence[:site] + new_base +
                                      node2.sequence[(site + 1):])
    # Drop the temporary cost vectors; failures to delete are ignored.
    for node in tree.traverse():
        try:
            node.del_feature("cv")
        except (AttributeError, KeyError):
            pass
    # Recompute branch lengths from the resolved sequences.
    tree.dist = 0
    for node in tree.iter_descendants():
        node.dist = gctree.utils.hamming_distance(node.up.sequence,
                                                  node.sequence)
    return tree
Example #21
0
def get_tree_object_in_newick(tree, id_to_sample_dict=None):
    """Return a newick string built from a binary tree object.

    `tree` and its descendants are expected to expose `left`, `right`,
    `dist`, `id` and `is_leaf()`. Each child in the output gets half of its
    parent's `dist` as branch length. Leaves are named after their `id`,
    translated through `id_to_sample_dict` when one is given; internal nodes
    are left unnamed and get a support value of 0.
    """

    root_copy = Tree()
    root_copy.dist = 0
    root_copy.name = "root"

    # Depth-first walk over the source tree. Every stack entry pairs a source
    # node with the node that represents it in the copy.
    pending = [(tree, root_copy)]

    while pending:
        source, copy = pending.pop()
        half_dist = source.dist / 2.0

        for source_child in (source.left, source.right):
            if not source_child:
                continue

            child_copy = Tree()
            child_copy.dist = half_dist

            if not source_child.is_leaf():
                # Internal nodes used to be labelled 'Int<id>' for interface
                # purposes. That is no longer required, but a branch support
                # of 0 is still set so the dendrogram is exported in a
                # standard tree format.
                child_copy.support = 0.0
            elif id_to_sample_dict:
                child_copy.name = id_to_sample_dict[source_child.id]
            else:
                child_copy.name = source_child.id

            copy.add_child(child_copy)
            pending.append((source_child, child_copy))

    for node in new_tree_nodes(root_copy) if False else root_copy.traverse("preorder"):
        if node.is_leaf():
            continue

        offspring = node.get_children()

        # Only nodes whose children are all zero-distance leaves are reordered.
        if any(not kid.is_leaf() or kid.dist > 0 for kid in offspring):
            continue

        # order the children by name, descending
        node.children = sorted(offspring, key=lambda x: x.name, reverse=True)

    return root_copy.write(format=2)
#!python3
import os
import argparse
import pandas as pd
from ete3 import Tree

if __name__ == '__main__':
    # Rescale all branch lengths of a newick tree by the distance from the
    # root to its closest leaf, and write the result next to the input file.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        '-n',
        '--nwk',
        dest="nwk",
        type=str,
        required=False,
        default="../DataEmpirical/PrimatesBinaryLHTShort/rootedtree.nwk")
    options = cli.parse_args()
    tree_path = options.nwk

    tree = Tree(tree_path, format=1)

    # get_closest_leaf() yields (leaf, distance); only the distance is used.
    closest = tree.get_closest_leaf()
    root_age = closest[1]

    tree.dist = tree.dist / root_age
    for branch in tree.iter_descendants():
        scaled = branch.dist / root_age
        print("{0}: {1}".format(branch.name, scaled))
        branch.dist = scaled

    tree.write(format=1, outfile=tree_path + ".scaled.nwk")