コード例 #1
0
ファイル: newick.py プロジェクト: squarednob/scikit-bio
def _newick_to_tree_node(fh, convert_underscores=True):
    """Parse a newick token stream into a tree of ``TreeNode`` objects.

    Parameters
    ----------
    fh : object
        Source handed unchanged to ``_tokenize_newick``.
    convert_underscores : bool, optional
        Forwarded unchanged to ``_tokenize_newick``.

    Returns
    -------
    TreeNode
        The root node, returned when the terminating ``;`` is reached while
        only the root remains on the parse stack.

    Raises
    ------
    NewickFormatError
        If a branch length is not numeric, if parentheses are unbalanced, if
        a node would receive children twice, or if the token stream ends
        without a well-formed ``;`` terminator.
    """
    # Tokens that can never be a label. '' is the initial value of
    # ``last_token``. An exact-match tuple is used instead of a substring
    # test so that a multi-character label such as "(," is not mistaken for
    # structure.
    non_label_tokens = ('', '(', ',', ')', ':')

    tree_stack = []
    current_depth = 0
    last_token = ''
    next_is_distance = False
    root = TreeNode()
    tree_stack.append((root, current_depth))
    for token in _tokenize_newick(fh, convert_underscores=convert_underscores):
        # Check for a label: the previous token is only known to be a label
        # (rather than a length) once the token after it has been seen.
        if last_token not in non_label_tokens:
            if not next_is_distance:
                tree_stack[-1][0].name = last_token
            else:
                next_is_distance = False
        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)

        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
        elif token == ')':
            # A ')' at depth 0 has no matching '('. Without the depth check
            # the loop below would pop the root as well and fail with an
            # IndexError instead of a format error.
            if len(tree_stack) < 2 or current_depth < 1:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.append(node)
            # Nodes were popped last-to-first; restore the input order.
            children.reverse()
            parent = tree_stack[-1][0]
            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children
            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
コード例 #2
0
ファイル: taxonomy.py プロジェクト: margaret/onecodex
def generate_skbio_tree(classification, existing_tree=None):
    """Build (or extend) a taxonomy tree from one classification's results.

    Parameters
    ----------
    classification : object
        Must provide ``results()`` returning a mapping whose ``'table'``
        entry is an iterable of dict-like rows. Each row is read for
        ``'tax_id'`` and ``'parent_tax_id'``, and optionally ``'name'`` and
        ``'rank'``.
    existing_tree : skbio.tree.TreeNode, optional
        Tree to extend in place. When omitted, a new root named ``'1'`` is
        created.

    Returns
    -------
    skbio.tree.TreeNode
        The root of the (possibly extended) tree.

    Raises
    ------
    AssertionError
        If some rows never found their parent in the tree.
    """
    from skbio.tree import MissingNodeError, TreeNode

    rows = classification.results()['table']

    tree = existing_tree
    if tree is None:
        tree = TreeNode(name='1', length=1)
        tree.tax_name = 'Root'
        tree.rank = 'no rank'

    # Nodes whose parent is not in the tree yet, grouped by that parent's id.
    unlinked = defaultdict(list)

    for row in rows:
        tax_id = row['tax_id']

        # Rows already represented in the tree are skipped.
        try:
            tree.find(tax_id)
        except MissingNodeError:
            pass
        else:
            continue

        parent_id = row['parent_tax_id']
        try:
            parent = tree.find(parent_id)
            # Waiting children are only claimed once the parent is known to
            # be in the tree, so no sub-trees are built inside ``unlinked``.
            children = _merge_unlinked(tax_id, unlinked)
        except MissingNodeError:
            parent = children = None

        node = TreeNode(name=tax_id, length=1, children=children)
        node.tax_name = row.get('name', '')
        node.rank = row.get('rank', 'no rank')

        if parent is None:
            # Park the node until its parent shows up.
            unlinked[parent_id].append(node)
        else:
            parent.append(node)

    assert not unlinked, 'some unlinked nodes were not included in the tree'

    return tree
コード例 #3
0
ファイル: distance.py プロジェクト: margaret/onecodex
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """Compute a UniFrac beta-diversity distance matrix.

    UniFrac takes the relatedness of community members into account. The
    weighted variant uses abundances, the unweighted variant uses presence.

    Parameters
    ----------
    classifications : sequence
        Classification objects; each is passed to ``generate_skbio_tree`` and
        the whole sequence to ``beta_counts``.
    weighted : bool, optional
        Selects ``'weighted_unifrac'`` (True) or ``'unweighted_unifrac'``.
    field : str, optional
        Must be a member of ``ACCEPTABLE_FIELDS``; forwarded to
        ``beta_counts``.
    rank : str, optional
        Forwarded to ``beta_counts`` and ``prune_to_rank``.
    strict : bool, optional
        When true, every classification must share the job id of the first.

    Returns
    -------
    The result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    OneCodexException
        In strict mode, when a classification's job id differs from the
        first one's.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # Fold every classification into one shared taxonomy tree.
    tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        tree = generate_skbio_tree(classification, existing_tree=tree)

    # skbio appears to expect the root to have a single child, so the real
    # root is hung below a placeholder.
    fake_root = TreeNode(name='fake root')
    fake_root.rank = 'no rank'
    fake_root.append(tree)

    # Prune low-level nodes so that the tips are the taxa being compared.
    prune_to_rank(fake_root, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=fake_root, otu_ids=tax_ids)
コード例 #4
0
def _build_trees(clade_counts, edge_lengths, support_attr):
    """Assemble trees, annotated with support values, from a set of clades.

    Clades are processed smallest first. Each processed clade is collapsed
    into a single element inside every remaining clade that contains it, so
    larger clades end up referring to their sub-clades directly.

    Note that ``clade_counts`` and ``edge_lengths`` are re-keyed in place as
    clades are collapsed.

    Parameters
    ----------
    clade_counts : dict
        Keyed by the frozenset of the clade and valued by the support
    edge_lengths : dict
        Keyed by the frozenset of the clade and valued by the weighted length
    support_attr : str
        The name of the attribute to hold the support value

    Returns
    -------
    list of TreeNode
        The nodes that were never absorbed as a child of another clade.
    """
    built = {}
    pending = [(len(members), members) for members in clade_counts]
    while pending:
        # Sizes change every round as clades are collapsed, so the queue is
        # simply re-sorted each time.
        pending.sort()
        size, current = pending.pop(0)

        remaining = []
        for _, candidate in pending:
            if current.issubset(candidate):
                # Replace the members of ``current`` inside the ancestor by
                # ``current`` itself, e.g.
                #   ancestor {1, 2, 3, 4}, current {2, 3} -> {1, {2, 3}, 4}
                collapsed = (candidate - current) | frozenset([current])

                # Keep the lookup tables keyed by the collapsed form.
                clade_counts[collapsed] = clade_counts.pop(candidate)
                edge_lengths[collapsed] = edge_lengths.pop(candidate)

                candidate = collapsed

            remaining.append((len(candidate), candidate))

        # Only a tip (single-member clade) carries a name.
        label = list(current)[0] if size == 1 else None

        # Members that are tips have no entry in ``built``.
        subtrees = [built.pop(member) for member in current if member in built]

        node = TreeNode(children=subtrees, length=edge_lengths[current],
                        name=label)
        setattr(node, support_attr, clade_counts[current])
        built[current] = node

        pending = remaining

    return list(built.values())
コード例 #5
0
    def unifrac(self, weighted=True, rank="auto"):
        """Compute the UniFrac beta diversity between the samples.

        UniFrac accounts for how closely related community members are. The
        weighted form uses abundances; the unweighted form uses presence only.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If ``self._guess_normalized()`` reports normalized data.
        """
        import skbio.diversity

        # The metric is computed from read counts, not relative abundances.
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        df = self.to_df(rank=rank, normalize=False)

        # One row of counts per sample, in index order.
        counts = [df.loc[sample_id].tolist() for sample_id in df.index]
        tax_ids = df.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=df.ocx_rank)

        # skbio appears to expect the root to have a single child, so the
        # real root is hung below a placeholder.
        from skbio.tree import TreeNode

        fake_root = TreeNode(name="fake root")
        fake_root.rank = "no rank"
        fake_root.append(pruned)

        metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
        sample_ids = df.index.tolist()
        return skbio.diversity.beta_diversity(
            metric, counts, sample_ids, tree=fake_root, otu_ids=tax_ids
        )
コード例 #6
0
    def tree_build(self):
        """Create a tree from the taxonomy table held by this object.

        This is designed for use with `ClassificationsDataFrame` or `SampleCollection`.

        One node is created per entry of ``self.taxonomy.index``; each node is
        then attached to the node named by its ``parent_tax_id``. Entries whose
        parent is not present are left unattached, and a warning is issued for
        them unless the entry is ``"1"``.

        Returns
        -------
        `skbio.tree.TreeNode`, the node for tax ID ``"1"``, i.e. the root of the
        tree containing the taxa of the current analysis and their parents.
        """
        from skbio.tree import TreeNode

        taxonomy = self.taxonomy
        tax_names = taxonomy["name"]
        ranks = taxonomy["rank"]
        parent_ids = taxonomy["parent_tax_id"]

        # First pass: one unlinked node per taxon.
        nodes = {}
        for tax_id in taxonomy.index:
            node = TreeNode(name=tax_id, length=1)
            node.tax_name = tax_names[tax_id]
            node.rank = ranks[tax_id]
            node.parent_tax_id = parent_ids[tax_id]
            nodes[tax_id] = node

        # Second pass: hang every node below its parent.
        for tax_id in taxonomy.index:
            child = nodes[tax_id]
            parent = nodes.get(child.parent_tax_id)

            if parent is None:
                # The root legitimately has no parent in the table.
                if tax_id != "1":
                    warnings.warn(
                        "tax_id={} has parent_tax_id={} which is not in tree".format(
                            tax_id, child.parent_tax_id
                        )
                    )
                continue

            parent.append(child)

        return nodes["1"]
コード例 #7
0
def deserialize(st: str,
                words: Optional[List] = None,
                convert_underscores: bool = True) -> Tuple[TreeNode, List]:
    """Read a newick string into a TreeNode and collect the nested label order.

    Labels in the string are expected to be integers. Branch lengths
    (``:<number>``) are stored on the tree nodes only; they are never
    interpreted as labels and never added to the order.

    Parameters
    ----------
    st: str
        The string to recreate the tree from and extract an order of neighbors to be used as the guide tree. The
        guide tree is built as nested lists where an inner list represents an inner node, e.g. something of the
        shape:
            [['abc', ['aab','aac']],'xyz']
        for the tree:
                        --- 'xyz'
                    ---|     --- 'abc'
                        --- |   --- 'aab'
                            ---|
                                --- 'aac'
    words: list
        If the serialized tree was created using indices instead of labels, the original column can be passed in to
        return the exact values inside the order array. Each label ``i`` is then replaced by ``Sequence(words[i])``.
    convert_underscores: bool (default = True)
        flag to convert underscores as per the newick tokenizer

    Returns
    -------
    tuple
        ``(root, order_stack)``. ``root`` is the root ``TreeNode``.
        ``order_stack`` is the internal stack of ``(items, depth)`` pairs; on
        success it holds a single pair for depth 0 whose ``items`` list
        contains the nested order.

    Raises
    ------
    NewickFormatError
        If a branch length is not numeric, if parentheses are unbalanced, if
        a node would receive children twice, or if the string does not end
        with a well-formed ``;`` terminator.
    ValueError
        If a label cannot be converted with ``int``.
    """
    tree_stack = []
    current_depth = 0
    last_token = ''
    root = TreeNode()
    tree_stack.append((root, current_depth))
    next_is_distance = False

    # ``my_stack`` mirrors ``tree_stack`` push for push and pop for pop, but
    # collects label values instead of tree nodes.
    combo = []
    my_stack = []
    my_stack.append((combo, current_depth))

    for token in _tokenize_newick(st, convert_underscores=convert_underscores):
        # Check for a label. The empty initial ``last_token`` is a substring
        # of every string and is therefore skipped here as well.
        if last_token not in '(,):':
            if next_is_distance:
                # ``last_token`` was a branch length (already stored as
                # ``length``); it must not be parsed as a label index.
                next_is_distance = False
            else:
                val = Sequence(
                    words[int(last_token)]) if words else int(last_token)
                tree_stack[-1][0].name = val
                my_stack[-1][0].append(val)

        # Check for a distance
        if token == ':':
            next_is_distance = True
        elif last_token == ':':
            try:
                tree_stack[-1][0].length = float(token)
            except ValueError:
                raise NewickFormatError("Could not read length as numeric type"
                                        ": %s." % token)
        elif token == '(':
            current_depth += 1
            tree_stack.append((TreeNode(), current_depth))
            my_stack.append((list(), current_depth))
        elif token == ',':
            tree_stack.append((TreeNode(), current_depth))
            my_stack.append((list(), current_depth))
        elif token == ')':
            # A ')' at depth 0 has no matching '('. Without the depth check
            # the loop below would pop the root as well and fail with an
            # IndexError instead of a format error.
            if len(tree_stack) < 2 or current_depth < 1:
                raise NewickFormatError("Could not parse file as newick."
                                        " Parenthesis are unbalanced.")
            children = []
            my_children = []
            # Pop all nodes at this depth as they belong to the remaining
            # node on the top of the stack as children.
            while current_depth == tree_stack[-1][1]:
                node, _ = tree_stack.pop()
                children.append(node)
                popped_items, _ = my_stack.pop()
                # Each item is pushed to the front, so the items of one
                # popped entry end up in reverse order.
                for item in popped_items:
                    my_children.insert(0, item)
            # Nodes were popped last-to-first; restore the input order.
            children.reverse()
            parent = tree_stack[-1][0]
            my_parent = my_stack[-1][0]

            if parent.children:
                raise NewickFormatError("Could not parse file as newick."
                                        " Contains unnested children.")
            # This is much faster than TreeNode.extend
            for child in children:
                child.parent = parent
            parent.children = children

            my_parent.append(my_children)

            current_depth -= 1
        elif token == ';':
            if len(tree_stack) == 1:
                return root, my_stack
            break

        last_token = token

    raise NewickFormatError("Could not parse file as newick."
                            " `(Parenthesis)`, `'single-quotes'`,"
                            " `[comments]` may be unbalanced, or tree may be"
                            " missing its root.")
コード例 #8
0
ファイル: distance.py プロジェクト: fossabot/onecodex
    def unifrac(self, weighted=True, rank=Rank.Auto):
        """Compute the UniFrac beta diversity between the samples.

        UniFrac accounts for how closely related community members are. The
        weighted form uses abundances; the unweighted form uses presence only.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.
        """
        import skbio.diversity

        df = self.to_df(rank=rank, normalize=self._guess_normalized())

        # Read the rank before any arithmetic on the frame; the scaled frame
        # below is a new object.
        ocx_rank = df.ocx_rank

        # The scikit-bio phylogenetic metrics are said to need integer
        # counts, so normalized values are scaled up by 10e9 (i.e. 1e10).
        if self._guess_normalized():
            df = df * 10e9

        tax_ids = df.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=ocx_rank)

        # `scikit-bio` requires that the tree root has no more than 2
        # children, otherwise it considers it "unrooted".
        #
        # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
        #
        # The taxonomy root regularly has more than 2 children, so it is
        # hung below a placeholder parent.
        from skbio.tree import TreeNode

        fake_root = TreeNode(name="fake root")
        fake_root.rank = "no rank"
        fake_root.append(pruned)

        # Only the weighted metric is given the ``normalized`` flag.
        if weighted:
            metric = BetaDiversityMetric.WeightedUnifrac
            extra_kwargs = {"normalized": True}
        else:
            metric = BetaDiversityMetric.UnweightedUnifrac
            extra_kwargs = {}

        return skbio.diversity.beta_diversity(
            metric, df, df.index, tree=fake_root, otu_ids=tax_ids, **extra_kwargs
        )
コード例 #9
0
def cluster(feature_matrix, prob_features, names=None, result_constructor=None):
    """Greedily merge the closest rows of a feature matrix into a binary tree.

    The distance between two rows ``u`` and ``v`` is
    ``-((u + v - 2*u*v) * log(prob_features)).sum()``. On every round the
    closest pair is replaced by a new row holding their element-wise
    product, until two rows remain; those two become the children of the
    root.

    Parameters
    ----------
    feature_matrix : object with a ``.values`` attribute
        The ``.values`` array is used as a 2-D matrix with one row per item.
        A deep copy is taken before reading ``.values``.
    prob_features : array-like
        Per-feature probabilities; their logarithm weights the distance.
        Entries whose log is ``-inf`` are replaced by ``-10000``.
    names : sequence, optional
        One name per row. Defaults to ``0 .. n_rows - 1``. Names may be of
        any hashable type (e.g. strings).
    result_constructor : object, optional
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(root, info)`` where ``root`` is the root ``TreeNode`` and ``info``
        is a dict with keys ``'Ds'`` (the distance matrix of every round),
        ``'joins'`` (the pair of names merged in every round) and ``'lcas'``
        (new node id -> merged feature row).
    """
    fm = copy.deepcopy(feature_matrix)
    fm = fm.values

    Ds = []
    joins = []

    if names is None:
        names = np.arange(fm.shape[0])
    # Keep the names in a plain list. Merged nodes get integer ids, and
    # appending an int to a NumPy array of strings would coerce it to a
    # string, which then no longer matches its key in ``tree_nodes``.
    names = list(names)

    log_prob_features = np.log(prob_features)
    log_prob_features[-log_prob_features == np.inf] = -10000  # hacky

    # Compute the distance matrix
    D = squareform(pdist(fm, lambda u, v: (- (u + v - 2*u*v) * log_prob_features).sum()))

    tree_nodes = {}
    for name in names:
        tree_nodes[name] = TreeNode(name=str(name))

    print('Starting with {0} nodes'.format(len(D)))
    # Ids for merged nodes; incremented before each use, so the first merged
    # node gets ``len(D) + 1``.
    new_name = len(D)

    new_lcas = {}
    while len(D) > 2:
        # NOTE(review): the array stored here is the same object that is
        # masked in place on the next statement, so every entry of ``Ds``
        # carries the np.inf mask. Confirm whether an unmasked snapshot was
        # intended.
        Ds.append(D)

        # Mask the diagonal and lower triangle so a row is never merged with
        # itself and every pair is considered once.
        D[np.tril_indices(D.shape[0], 0)] = np.inf

        # The argmin (i, j) of D gives the two rows to be merged.
        min_i, min_j = np.unravel_index(np.argmin(D, axis=None), D.shape)

        joins.append((names[min_i], names[min_j]))

        # Create a new TreeNode from the merged children
        new_name += 1

        child_i = tree_nodes[names[min_i]]
        child_j = tree_nodes[names[min_j]]
        new_node = TreeNode(name=str(new_name), length=None, parent=None,
                            children=[child_i, child_j])

        child_i.parent = new_node
        child_j.parent = new_node

        tree_nodes[new_name] = new_node

        # Drop the two merged names and append the id of the new node, in
        # step with the row bookkeeping on ``fm`` and ``D`` below.
        merged = (min_i, min_j)
        names = [nm for pos, nm in enumerate(names) if pos not in merged]
        names.append(new_name)

        # Replace rows i and j of the feature matrix with their element-wise
        # product, appended as the last row.
        lca = fm[min_i]*fm[min_j]
        fm = np.delete(fm, [min_i, min_j], axis=0)
        fm = np.vstack([fm, lca])

        new_lcas[new_name] = lca

        # Replace the distances to i and j by the distances to the merged
        # row: drop both rows/columns and append one new row/column.
        D = np.delete(np.delete(D, [min_i, min_j], axis=0), [min_i, min_j], axis=1)

        new_D = np.zeros((fm.shape[0], fm.shape[0]))
        new_D[:-1, :-1] = D

        new_D_row = - ((fm + fm[-1] - 2 * fm * fm[-1])*log_prob_features).sum(1)

        new_D[-1, :] = new_D_row
        new_D[:, -1] = new_D_row
        D = new_D

    new_name += 1

    # Merge the last two remaining sites to complete the tree
    child1, child2 = tree_nodes[names[0]], tree_nodes[names[1]]
    root = TreeNode(name=str(new_name), children=[child1, child2])
    child1.parent = root
    child2.parent = root

    return root, {'Ds': Ds, 'joins': joins, 'lcas': new_lcas}
コード例 #10
0
ファイル: tree.py プロジェクト: kellyhuang21/deep-taxon
def _cn2tn(cn, names):
    """Convert a cluster-node tree into an equivalent ``TreeNode`` tree.

    The conversion is iterative (post-order with an explicit stack), so a
    deep, chain-like tree does not hit Python's recursion limit.

    Parameters
    ----------
    cn : object
        Root cluster node. Nodes must provide ``is_leaf()``, ``id``, ``dist``
        and, for non-leaves, ``left`` and ``right``.
    names : indexable
        Leaf labels, indexed by the leaf's ``id``.

    Returns
    -------
    TreeNode
        Leaves are named ``names[node.id]``; internal nodes are named with
        their ``id``. Every node's ``length`` is the cluster node's ``dist``.
    """
    # ``pending`` holds (cluster node, children_already_scheduled) pairs.
    # ``built`` is a stack of finished subtrees in post-order.
    pending = [(cn, False)]
    built = []
    while pending:
        node, children_scheduled = pending.pop()
        if node.is_leaf():
            built.append(TreeNode(name=names[node.id], length=node.dist))
        elif not children_scheduled:
            # Revisit this node after both subtrees are built. ``left`` is
            # pushed last so that it is converted first.
            pending.append((node, True))
            pending.append((node.right, False))
            pending.append((node.left, False))
        else:
            # The two most recent finished subtrees belong to this node.
            right = built.pop()
            left = built.pop()
            built.append(
                TreeNode(name=node.id, length=node.dist, children=[left, right])
            )
    return built.pop()