Beispiel #1
0
def insert(scores, threshold, length_f=None):
    """Attach each sufficiently well-scoring query to its target node.

    Parameters
    ----------
    scores : dict
        Maps a query name to a record that is indexed with the keys
        ``'score'`` and ``'node'``.
    threshold : float
        Score cut-off. A query is inserted when its score is greater than
        or equal to this value.
    length_f : function, optional
        Callable that supplies the branch length of the new node. It is
        invoked as:

            f(TreeNode, str, float) -> float

        with the node being inserted at, the name of the query being
        inserted, and the query's score at that node.

        When omitted, every inserted node gets a branch length of 0.0.
    """
    def _zero_length(target, name, score):
        return 0.0

    branch_length = _zero_length if length_f is None else length_f

    for name, hit in scores.items():
        if hit['score'] >= threshold:
            new_length = branch_length(hit['node'], name, hit['score'])
            new_tip = skbio.TreeNode(name=name, length=new_length)
            hit['node'].append(new_tip)
def recursive_subdivide(node, k, count, id_1, tree_node):
    """Split a cell into four quadrants, recursing while it holds > k points.

    Every call (including ones that stop immediately) registers a bin label
    ``"H<count>"`` in the module-level ``bin_id`` list and ``samples``
    container. When the cell holds more than ``k`` points it is divided into
    south-west, north-west, south-east and north-east quadrants, in that
    order; each quadrant is recorded in the module-level ``bin_1`` list,
    mirrored as a ``skbio.TreeNode`` subtree, and subdivided in turn.

    Parameters
    ----------
    node : Node
        Cell to split; read through ``x0``, ``y0``, ``width``, ``height`` and
        ``points``, and given a ``children`` list when it is split.
    k : int
        Largest number of points a cell may hold without being split.
    count : int
        Depth counter; incremented once on entry and passed unchanged to all
        four child calls.
    id_1 : str
        Identifier prefix accumulated along the path to this cell.
    tree_node : skbio.TreeNode
        Tree node that receives the four quadrant subtrees.
    """
    count += 1
    id_1 += str(count)

    bin_label = "H" + str(count)
    bin_id.append(bin_label)
    samples[bin_label] = ""

    # Small enough: leave this cell as a leaf.
    if len(node.points) <= k:
        return

    print(type(node.width))
    half_w = node.width/2
    half_h = node.height/2

    # (tag, x origin, y origin) of each quadrant, in processing order.
    quadrants = (
        ("sw", node.x0, node.y0),
        ("nw", node.x0, node.y0+half_h),
        ("se", node.x0+half_w, node.y0),
        ("ne", node.x0+half_w, node.y0+half_h),
    )

    cells = []
    subtrees = []
    for tag, x0, y0 in quadrants:
        members = contains(x0, y0, half_w, half_h, node.points)
        cell_id = id_1 + tag + ";"
        cell = Node(x0, y0, half_w, half_h, members, cell_id)
        subtree = skbio.TreeNode(name=str(count) + tag)
        for pt in members:
            bin_1.append((pt.sample_id, count, cell.get_id()))
            subtree.extend([skbio.TreeNode(name=pt.sample_id)])
        recursive_subdivide(cell, k, count, cell_id, subtree)
        cells.append(cell)
        subtrees.append(subtree)

    tree_node.extend(subtrees)
    node.children = cells
Beispiel #3
0
def _0(ff: TSVTaxonomyFormat) -> skbio.TreeNode:
    """Build a taxonomy tree from a tab-separated taxonomy file.

    The first line of the file is skipped as a header. For every following
    row, the first column is an identifier and the second column is a
    ``;``-separated lineage. Each lineage level becomes a node (length 1)
    below the node of the previous level, reusing an existing child with the
    same name when there is one; the identifier is appended as a node
    (length 1) below the last level. All lineages hang off a node named
    ``'root'`` with length 0, which is returned.
    """
    tree_root = skbio.TreeNode('root', length=0)
    with ff.open() as fh:
        rows = csv.reader(fh, delimiter='\t')
        next(rows)  # skip header
        for row in rows:
            feature_id, lineage = row[:2]
            cursor = tree_root
            for taxon in lineage.split(';'):
                # Explicit None check: do not rely on node truthiness.
                existing = next(
                    (kid for kid in cursor.children if kid.name == taxon),
                    None)
                if existing is None:
                    existing = skbio.TreeNode(taxon, length=1)
                    cursor.append(existing)
                cursor = existing

            cursor.append(skbio.TreeNode(feature_id, length=1))

    return tree_root
    def subdivide(self, count):
        """Subdivide from ``self.root`` and return the mirrored skbio tree.

        Creates a ``skbio.TreeNode`` named ``"root"``, runs
        ``recursive_subdivide`` on ``self.root`` with ``self.threshold`` as
        the split limit, a starting counter of 0 and an empty identifier
        prefix, and returns the populated tree node.

        NOTE(review): the ``count`` argument is not used; the recursion
        always starts from 0. Confirm with callers before changing.
        """
        # NOTE(review): this frame, indexed by the 'index' column of the
        # module-level ``df``, is local and is not read again in this method.
        index_frame = pd.DataFrame()
        index_frame['index'] = df['index']
        index_frame = index_frame.set_index('index')

        tree_root = skbio.TreeNode(name="root")
        recursive_subdivide(self.root, self.threshold, 0, "", tree_root)
        return tree_root
Beispiel #5
0
def tree_to_matrix(tree, label, with_repr=False):
    """Build a label-by-node boolean membership matrix from a tree.

    For each entry of ``label`` (in order) a boolean indicator vector of
    length ``len(label)`` is created, and the same position is switched on
    in the vector of every ancestor of that leaf whose name does not end
    with ``"_"``. Labels that are not tip names of ``tree`` are first
    appended to ``tree`` as new nodes, and a message is printed.

    The vectors are then collected in level order of the tree, filtered by
    ``remove_same_vect``, and rows that are True everywhere are discarded.

    Parameters
    ----------
    tree : skbio.TreeNode
        Tree to read; it is modified in place when labels are missing.
    label : sequence
        Leaf names, in the order that defines the matrix rows.
    with_repr : bool, optional
        Not used by this function.

    Returns
    -------
    tuple
        ``(A, names)`` where ``A`` is the transposed boolean matrix of the
        kept node vectors and ``names`` is the array of the kept node names.
    """
    n_labels = len(label)
    # Snapshot of the tip names; not refreshed when nodes are appended below.
    tip_names = {tip.name for tip in tree.tips()}
    membership = {}
    # Order in which names enter ``membership``; used by remove_same_vect.
    insertion_order = []

    for col, leaf in enumerate(label):
        indicator = np.zeros(n_labels, dtype=bool)
        indicator[col] = True
        membership[leaf] = indicator
        insertion_order.append(leaf)

        if leaf not in tip_names:
            # Add the node when it is not already in the tree.
            tree.append(skbio.TreeNode(name=leaf))
            print("The feature {} i not in the leaves of the tree".format(
                leaf))

        for ancestor in tree.find(leaf).ancestors():
            anc_name = ancestor.name
            if anc_name[-1] == "_":
                continue
            if anc_name not in membership:
                membership[anc_name] = np.zeros(n_labels, dtype=bool)
                insertion_order.append(anc_name)
            membership[anc_name][col] = True

    rows, names = [], []
    for node in tree.levelorder():
        node_name = node.name
        if node_name in membership and node_name not in names:
            names.append(node_name)
            rows.append(membership[node_name])

    to_keep = remove_same_vect(rows, names, insertion_order)
    matrix = np.array(rows)
    # A node covering every label carries no information; drop it.
    to_keep[np.all(matrix, axis=1)] = False

    return matrix[to_keep].T, np.array(names)[to_keep]
Beispiel #6
0
def generate_new_phylogeny(phylogeny_fp, pivot_mapping_dct):
    """Replace each mapped clade of the phylogeny by one 'cluster' node.

    The tree is obtained from ``average_branch_lengths(phylogeny_fp)``. For
    every key of ``pivot_mapping_dct`` other than ``-1``, the lowest common
    ancestor of the mapped tips is located, a new node named
    ``'cluster<key>'`` with length ``lca.avg`` is appended to the LCA's
    parent, and the LCA is removed from that parent.

    Parameters
    ----------
    phylogeny_fp
        Passed unchanged to ``average_branch_lengths``.
    pivot_mapping_dct : dict
        Maps a cluster index to the collection of tips forming that cluster.
        The key ``-1`` is skipped.

    Returns
    -------
    The modified tree.

    Raises
    ------
    ValueError
        If the number of tips under an LCA differs from the number of mapped
        tips, or if the same LCA (by ``dummy_name``) is reached twice.
    """
    tree = average_branch_lengths(phylogeny_fp)
    seen_lca_names = set()

    for idx, members in pivot_mapping_dct.items():
        if idx == -1:
            continue

        node_lca = tree.lca(members)
        n_lca_tips = len(list(node_lca.tips()))
        if n_lca_tips != len(members):
            raise ValueError("The LCA has different tips than expected",
                             n_lca_tips,
                             len(members))

        cluster_node = skbio.TreeNode('cluster' + str(idx))
        cluster_node.length = node_lca.avg
        lca_parent = node_lca.parent

        if node_lca.dummy_name in seen_lca_names:
            raise ValueError("node_lca was observed before!",
                             node_lca.dummy_name, node_lca, idx)
        seen_lca_names.add(node_lca.dummy_name)

        lca_parent.append(cluster_node)
        lca_parent.remove(node_lca)

    return tree
Beispiel #7
0
def majority_consensus(trees, cutoff=0.5):
    """Compute the majority consensus tree.

    When a taxon is designated as an outgroup, this algorithm is equivalent to
    tree popping described in The Mathematics of Phylogenetics. Since a clade
    is the subset of a split that does not contain the root, all sets of
    terminal labels obtained via the tips method when traversing the tree
    correspond to the half of the split without the root. Thus operating on the
    clades is implicitly operating on the equivalent split.

    The overall approach of the algorithm is to first tally the clades and
    then to progressively build the consensus tree by applying the clades in
    decreasing order of frequency (as long as they are compatible with all
    clades already displayed on the tree). The proof presented in The
    Mathematics of Phylogenetics is considerably more complex than the more
    intuitive algorithm used here. Whereas that proof required finding the
    minimum spanning tree, this approach instead uses the idea that as long as
    clades are applied in order of size (which is guaranteed by the principle
    that a parent clade will appear at least as many times in counts as its
    subclades and subsequent sorting on counts then size), then the right node
    onto which to apply a clade is always the smallest one that is compatible.

    This is actually equivalent to finding the minimum spanning tree of the
    nodes colored by each clade (blue for those in the clade and red for those
    not in the clade). The tree induced by a node whose terminals are a
    superset of the clade clearly spans that clade. Thus, by iterating through
    all the nodes in the consensus tree sorted by decreasing size, the last
    node that meets this criterion induces the minimal spanning tree (MST) for
    the blue nodes (those within the clade). The MST for the red nodes (those
    not within the clade and containing the root) intersects with this MST at
    a single node via the tree popping theorem. By maintaining the
    directionality of the parent-child relationships away from the root, we
    know this is the node we have already identified. To see why, assume
    another node in the blue MST is the node of intersection. The parent of
    this node must also be blue because otherwise the MST is not minimal. The
    parent must also lie on the red MST since by construction following the
    chain of parents creates a path to the root and this path to the root must
    be on the red MST as trees have no cycles. Thus, we have a contradiction of
    the blue and red MSTs only intersecting at a single node, so the
    intersection node must be the previously identified node.

    Branch lengths of the consensus nodes are the mean of the lengths of the
    matching nodes in the input trees. If any contributing node has a missing
    (or zero) length, the consensus length of that node is None.

    Parameters
    ----------
        trees: list of skbio TreeNodes
            Input trees from which to compute the consensus.
        cutoff: float
            The minimum fraction of trees a clade must be displayed on to
            contribute to the consensus. Clades whose frequency is below the
            cutoff are dropped; a frequency equal to the cutoff is kept.
            Whatever the cutoff, a clade is only added to the consensus if it
            is compatible with all clades already displayed. Clades are added
            in order of decreasing number of counts in the input trees.

    Returns
    -------
        root: TreeNode
    """
    # Count nodes and record tip names
    counts, tip_names = {}, set()
    for tree in trees:
        tip_names.update([tip.name for tip in tree.tips()])
        for node in tree.traverse():
            if not node.is_tip():
                tip_set = frozenset([tip.name for tip in node.tips()])
                counts[tip_set] = counts.get(tip_set, 0) + 1
    counts = sorted(counts.items(),
                    key=lambda x: (x[1], len(x[0])),
                    reverse=True)  # Sort by count then size

    # Make count list and node dictionary
    # Initialize consensus nodes with a tip for each unique taxon label and a
    # root node containing all these tips (the tips are instantiated first,
    # and then the root node containing them is created).
    consensus_nodes = {
        frozenset([name]): skbio.TreeNode(name=name)
        for name in tip_names
    }
    root = skbio.TreeNode(children=list(consensus_nodes.values()))
    consensus_nodes[frozenset([node.name for node in root.children])] = root

    # Add nodes. The first entry of counts is skipped: after sorting it is
    # expected to be the clade of all tips, which the root already represents.
    for tip_set1, count in counts[1:]:
        if count / len(trees) < cutoff:
            break

        # Find parent
        for tip_set2 in sorted(consensus_nodes, key=len, reverse=True):
            compatible = (tip_set1 <= tip_set2 or tip_set2 <= tip_set1
                          or not (tip_set1 & tip_set2))
            if not compatible:
                break  # Break search over parents
            if tip_set2 >= tip_set1:
                parent_set = tip_set2  # Smallest superset since sorted by size
        if not compatible:
            # Each clade should be compatible with all others in the
            # consensus tree
            continue
        parent_node = consensus_nodes[parent_set]

        # Construct current node
        children = []
        for node in parent_node.children:
            for tip in node.tips(include_self=True):
                if tip.name in tip_set1:
                    children.append(node)
                    # Compatibility requires all tips of a subnode are
                    # included if one is
                    break
        node = skbio.TreeNode(
            children=children, support=count /
            len(trees))  # Instantiation automatically updates parents
        consensus_nodes[tip_set1] = node
        parent_node.append(node)

    # Get branch lengths
    bl_sums = {}

    def add_branch_length(key, length):
        """Record one more observation of ``length`` for clade ``key``.

        A missing or zero length turns the running sum into None, and None
        then stays None for this clade. Previously a later tree with a real
        length for the same clade raised TypeError (None + float).
        """
        seen, bl_sum = bl_sums.get(key, (0, 0))
        if bl_sum is None or not length:
            bl_sum = None
        else:
            bl_sum = bl_sum + length
        bl_sums[key] = (seen + 1, bl_sum)

    for tree in trees:
        for node in tree.traverse():
            if node.is_tip():
                continue
            tip_set = frozenset([tip.name for tip in node.tips()])

            # Add clade
            if tip_set in consensus_nodes:
                add_branch_length(tip_set, node.length)

                # Add terminal children of clade
                # The lengths of terminal children are only included if all
                # the non-terminal children are clades displayed in the
                # consensus tree. This is expressed by checking this condition
                # in a for loop and proceeding to the else clause only if the
                # loop isn't broken
                for child_node in filter(lambda x: not x.is_tip(),
                                         node.children):
                    if not frozenset([tip.name for tip in child_node.tips()
                                      ]) in consensus_nodes:
                        break
                else:
                    for child_tip in filter(lambda x: x.is_tip(),
                                            node.children):
                        add_branch_length(frozenset([child_tip.name]),
                                          child_tip.length)

    # Set branch lengths
    for tip_set, (count, bl_sum) in bl_sums.items():
        node = consensus_nodes[tip_set]
        node.length = bl_sum / count if bl_sum else None

    return root
Beispiel #8
0
def build_tree_recurse(gene_tree, path):
    """Graft the event sub-trees stored under ``path`` onto ``gene_tree``.

    The directory ``path`` is processed in this order:

    1. If the last ``_``-separated token of ``path`` gives a non-empty id
       suffix, ``gene_tree.txt`` (newick) is read as a sub-tree and the first
       line of ``event.txt`` (``node_name,distance,event,index``) says where
       to attach it: a new node is inserted between the matching node of
       ``gene_tree`` and its parent, and the sub-tree becomes the second
       child of that new node. Every node name of the sub-tree then gets the
       id suffix appended.
    2. Entries of ``path`` whose name ends in ``_<digits>`` are visited in
       increasing numeric order; directories among them are processed
       recursively against the current tree.
    3. ``ils_*`` and ``s_*`` files rename the node named on their first line
       with an ``_i`` / ``_s`` tag.
    4. ``loss_*`` files (``node_name,distance,index``) insert a new loss node
       between the matching node and its parent and tag the sibling's name
       according to the event tag found in the parent's name.

    Parameters
    ----------
    gene_tree : skbio.TreeNode
        Tree into which the sub-tree read from ``path`` is grafted.
    path : str
        Directory holding the event files described above.

    Returns
    -------
    list
        The loss nodes created in ``path`` and in all visited
        sub-directories.

    Raises
    ------
    AttributeError
        If no node matching an event or loss record is found (the lookup
        leaves ``child`` as None and ``child.parent`` is then accessed).
    """
    loss_nodes = []
    current_tree = gene_tree
    path_splited = path.split('_')
    _id = '_' + path.split('_')[-1] if len(path_splited) > 1 else ''
    parent_name_splited = gene_tree.name.split('_')
    _parent_id = '_' + parent_name_splited[-1] if len(
        parent_name_splited) > 1 else ''
    # A two-character token containing 'l' (the code later uses the tags
    # dl/tl/il/sl) overrides the parent id with the token preceding it.
    for i in range(len(parent_name_splited)):
        if ('l' in parent_name_splited[i]
                and len(parent_name_splited[i]) == 2):
            _parent_id = '_' + parent_name_splited[i - 1] if i > 1 else ''
            break

    if (_id):
        subtree_path = os.path.join(path, 'gene_tree.txt')
        with open(subtree_path) as f:
            subtree = skbio.read(f, format='newick', into=skbio.tree.TreeNode)
        current_tree = subtree
        event_path = os.path.join(path, 'event.txt')
        # Only the first line is needed; the handle is closed right after.
        with open(event_path) as f:
            line = f.readline()
        splited = line.strip().split(',')
        node_name = splited[0]
        distance = float(splited[1])
        event_name = '_' + splited[2][0]
        event_index = '_' + splited[3]

        new_dt_node = skbio.TreeNode()
        child = None
        # Locate the attachment node: exact name, or the name part before a
        # '_dl' / '_tl' / '_il' / '_sl' tag.
        for node in gene_tree.traverse():
            if node.name == (node_name + _parent_id):
                child = node
                break
            elif (node.name):
                if '_dl' in node.name:
                    if node.name.split('_dl')[0] == (node_name + _parent_id):
                        child = node
                        break
                elif '_tl' in node.name:
                    if node.name.split('_tl')[0] == (node_name + _parent_id):
                        child = node
                        break
                elif '_il' in node.name:
                    if node.name.split('_il')[0] == (node_name + _parent_id):
                        child = node
                        break
                elif '_sl' in node.name:
                    if node.name.split('_sl')[0] == (node_name + _parent_id):
                        child = node
                        break
        # Splice new_dt_node between child and its parent, splitting the
        # branch at `distance` above child.
        parent = child.parent
        new_dt_node.name = node_name + event_name + _id
        new_dt_node.length = child.length - distance
        new_dt_node.parent = parent
        new_dt_node.children.append(child)
        child.length = distance
        child.parent = new_dt_node
        for i in range(len(parent.children)):
            if (parent.children[i].name == child.name):
                del parent.children[i]
                break
        parent.children.append(new_dt_node)
        new_dt_node.children.append(subtree)
        subtree.parent = new_dt_node
        for node in subtree.traverse():
            node.name = node.name + _id

    # Recurse into numbered entries, in increasing numeric order.
    files = os.listdir(path)
    files_end_with_digit = []
    for f in files:
        if f.split('_')[-1].isdigit():
            files_end_with_digit.append(f)
    files = files_end_with_digit
    files = sorted(files, key=lambda x: int(x.split('_')[-1]))
    for f in files:
        file_path = os.path.join(path, f)
        if os.path.isdir(file_path):
            loss_nodes += build_tree_recurse(current_tree, file_path)

    files = os.listdir(path)

    # 'ils_*' files: tag the named node with '_i'.
    for f in files:
        file_name = f.split('_')
        if (file_name and file_name[0] == 'ils'):
            _index = '_' + file_name[1]
            ils_path = os.path.join(path, f)
            with open(ils_path) as file_:
                line = file_.readline()
            splited = line.split(',')
            node_name = splited[0]
            for node in current_tree.traverse():
                if node.name == (node_name + _id):
                    node.name = node_name + '_i' + _index + '_id' + _id
                    break

    # 's_*' files: tag the named node with '_s'.
    for f in files:
        file_name = f.split('_')
        if (file_name and file_name[0] == 's'):
            _index = '_' + file_name[1]
            s_path = os.path.join(path, f)
            with open(s_path) as file_:
                line = file_.readline()
            splited = line.split(',')
            node_name = splited[0]
            for node in current_tree.traverse():
                if node.name == (node_name + _id):
                    node.name = node_name + '_s' + _index + '_id' + _id
                    break

    # 'loss_*' files: insert a loss node above the named node.
    for f in files:
        file_name = f.split('_')
        if (file_name and file_name[0] == 'loss'):
            loss_path = os.path.join(path, f)
            with open(loss_path) as file_:
                line = file_.readline()
            splited = line.split(',')
            node_l_name = splited[0]
            node_l_distance = float(splited[1])
            node_index = int(splited[2])

            new_l_node = skbio.TreeNode()
            child = None
            print('start')
            for node in current_tree.traverse():
                # NOTE(review): when node.name is empty, `splited` keeps its
                # previous value (from the file line or an earlier node).
                if (node.name):
                    print('n=', node.name)
                    splited = node.name.split('_')
                if (_id == ''):
                    if (splited[0] == node_l_name):
                        child = node
                        break
                else:
                    if node.name == (node_l_name + _id):
                        child = node
                        break
                    elif (splited[0] == node_l_name
                          and ('_' + splited[-1]) == _id):
                        child = node
                        break
                    elif '_dl' in node.name:
                        if node.name.split('_dl')[0] == (node_l_name + _id):
                            child = node
                            break
                        else:
                            splited = node.name.split('_dl')[0].split('_')
                    elif '_tl' in node.name:
                        if node.name.split('_tl')[0] == (node_l_name + _id):
                            child = node
                            break
                        else:
                            splited = node.name.split('_tl')[0].split('_')
                    elif '_il' in node.name:
                        if node.name.split('_il')[0] == (node_l_name + _id):
                            child = node
                            break
                        else:
                            splited = node.name.split('_il')[0].split('_')
                    elif '_sl' in node.name:
                        if node.name.split('_sl')[0] == (node_l_name + _id):
                            child = node
                            break
                        else:
                            splited = node.name.split('_sl')[0].split('_')
                    if (splited[0] == node_l_name
                            and ('_' + splited[-1]) == _id):
                        child = node
                        break
            print(node_l_name + _id)
            print('end')
            parent = child.parent
            # Name the loss node and tag the sibling of `child` according to
            # the event tag (_s_, _d_, _t_, _i_) found in the parent's name.
            if '_s_' in parent.name:
                new_l_node.name = node_l_name + '_l' + '_id' + _id
                if (parent.children[0].name == child.name):
                    parent.children[
                        1].name += '_sl' + '_ind_' + parent.name.split(
                            '_s_')[1].split('_')[0] + '_' + str(node_index)
                else:
                    parent.children[
                        0].name += '_sl' + '_ind_' + parent.name.split(
                            '_s_')[1].split('_')[0] + '_' + str(node_index)
            elif '_d_' in parent.name:
                new_l_node.name = node_l_name + '_l' + '_id' + _id
                if (parent.children[0].name == child.name):
                    parent.children[
                        1].name += '_dl' + '_ind_' + parent.name.split(
                            '_d_')[1].split('_')[0] + '_' + str(node_index)
                else:
                    parent.children[
                        0].name += '_dl' + '_ind_' + parent.name.split(
                            '_d_')[1].split('_')[0] + '_' + str(node_index)
            elif '_t_' in parent.name:
                new_l_node.name = node_l_name + '_l' + '_id' + _id
                if (parent.children[0].name == child.name):
                    parent.children[
                        1].name += '_tl' + '_ind_' + parent.name.split(
                            '_t_')[1].split('_')[0] + '_' + str(node_index)
                else:
                    parent.children[
                        0].name += '_tl' + '_ind_' + parent.name.split(
                            '_t_')[1].split('_')[0] + '_' + str(node_index)
            elif '_i_' in parent.name:
                new_l_node.name = node_l_name + '_l' + '_id' + _id
                if (parent.children[0].name == child.name):
                    parent.children[
                        1].name += '_il' + '_ind_' + parent.name.split(
                            '_i_')[1].split('_')[0] + '_' + str(node_index)
                else:
                    parent.children[
                        0].name += '_il' + '_ind_' + parent.name.split(
                            '_i_')[1].split('_')[0] + '_' + str(node_index)

            # Splice new_l_node between child and its parent.
            new_l_node.length = child.length - node_l_distance
            new_l_node.parent = parent
            new_l_node.children.append(child)
            child.length = node_l_distance
            child.parent = new_l_node
            for i in range(len(parent.children)):
                if (parent.children[i].name == child.name):
                    del parent.children[i]
                    break
            parent.children.append(new_l_node)
            loss_nodes.append(new_l_node)

    return loss_nodes