Exemple #1
0
def gradient_clustering(table: pd.DataFrame,
                        gradient: MetadataCategory,
                        weighted=True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.MetadataCategory
       Continuous vector of measurements corresponding to samples.
       It is converted to a float series before use.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.  When False, the table
       is replaced by the boolean table ``table > 0``.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    # `np.float` was only an alias of the builtin `float` and has been
    # removed from NumPy, so the builtin is used directly.
    c = gradient.to_series().astype(float)
    if not weighted:
        table = table > 0

    t = gradient_linkage(table, c, method='average')
    # Order the tree by the sorted per-feature output of
    # `mean_niche_estimator`.
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    t = rename_internal_nodes(t)
    return t
Exemple #2
0
def ilr_phylogenetic(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5) -> (pd.DataFrame, skbio.TreeNode):
    """ Applies `ilr_transform` to a table using a bifurcated copy of a tree.

    Parameters
    ----------
    table : pd.DataFrame
        Table passed through `add_pseudocount` before the transform.
    tree : skbio.TreeNode
        Tree that is copied; the copy is bifurcated and its internal
        nodes are renamed.
    pseudocount : float
        Value handed to `add_pseudocount`.  (default=0.5)

    Returns
    -------
    pd.DataFrame
        Output of `ilr_transform` on the pseudocounted table.
    skbio.TreeNode
        The bifurcated, renamed copy of `tree` used for the transform.
    """
    bifurcated = tree.copy()
    bifurcated.bifurcate()
    named_tree = rename_internal_nodes(bifurcated)
    shifted_table = add_pseudocount(table, pseudocount)
    balances = ilr_transform(shifted_table, named_tree)
    return balances, named_tree
Exemple #3
0
def assign_ids(input_table: pd.DataFrame,
               input_tree: skbio.TreeNode) -> (pd.DataFrame, skbio.TreeNode):
    """ Gives internal nodes unique identifiers and matches tree and table.

    Parameters
    ----------
    input_table : pd.DataFrame
        Table that is matched against the relabeled tree via `match_tips`.
    input_tree : skbio.TreeNode
        Tree that is copied; the copy is bifurcated and relabeled.

    Returns
    -------
    pd.DataFrame
        Table returned by `match_tips`.
    skbio.TreeNode
        Tree returned by `match_tips`.
    """
    working_tree = input_tree.copy()
    working_tree.bifurcate()

    # Each label is '<level-order position>L-<uuid4>'.  Positions count
    # every node, tips included, so they are not consecutive.
    labels = []
    for position, node in enumerate(
            working_tree.levelorder(include_self=True)):
        if node.is_tip():
            continue
        labels.append('%sL-%s' % (position, uuid.uuid4()))

    working_tree = rename_internal_nodes(working_tree, names=labels)
    matched_table, matched_tree = match_tips(input_table, working_tree)
    return matched_table, matched_tree
def assign_ids(input_tree: skbio.TreeNode) -> skbio.TreeNode:
    """ Gives the internal nodes of a bifurcated copy unique identifiers.

    Parameters
    ----------
    input_tree : skbio.TreeNode
        Tree that is copied; the copy is bifurcated and relabeled.

    Returns
    -------
    skbio.TreeNode
        Result of `rename_internal_nodes` applied to the bifurcated copy.
    """
    working_tree = input_tree.copy()
    working_tree.bifurcate()

    # Each label is '<level-order position>L-<uuid4>'.  Positions count
    # every node, tips included, so they are not consecutive.
    labels = []
    for position, node in enumerate(
            working_tree.levelorder(include_self=True)):
        if node.is_tip():
            continue
        labels.append('%sL-%s' % (position, uuid.uuid4()))

    return rename_internal_nodes(working_tree, names=labels)
Exemple #5
0
def ilr_phylogenetic_differential(
        differential: pd.DataFrame,
        tree: skbio.TreeNode) -> (pd.DataFrame, skbio.TreeNode):
    """ Projects differentials onto the balance basis of a phylogeny.

    Parameters
    ----------
    differential : pd.DataFrame
        Differentials; the table is transposed before being matched
        against the tips of the tree.
    tree : skbio.TreeNode
        Tree that is copied, bifurcated and matched to `differential`.

    Returns
    -------
    pd.DataFrame
        Differentials expressed in balance coordinates, with one row per
        internal node of the matched tree (index name 'featureid').
    skbio.TreeNode
        The matched tree with renamed internal nodes, i.e. the tree whose
        internal node names are the row labels of the returned table.
    """
    t = tree.copy()
    t.bifurcate()
    diff, _tree = match_tips(differential.T, t)
    _tree = rename_internal_nodes(_tree)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    basis = _balance_basis(_tree)[0]
    basis = pd.DataFrame(basis.T, index=diff.columns, columns=in_nodes)
    diff_balances = (diff @ basis).T
    diff_balances.index.name = 'featureid'
    # Return the matched and renamed tree rather than the raw bifurcated
    # copy, so the node names agree with the index of `diff_balances`.
    return diff_balances, _tree
Exemple #6
0
def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    A distance matrix is assembled by applying ``euclidean`` to every pair
    of entries of the rank vector `r`, so that features with similar ranks
    are close to each other.  Hierarchical clustering is then run on that
    distance matrix and the resulting linkage is converted into a tree
    whose internal nodes are renamed.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features.
        Its index provides the tip names of the tree.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    rank_distances = DistanceMatrix.from_iterable(r, euclidean)
    merges = linkage(rank_distances.condensed_form(), method)
    unnamed_tree = TreeNode.from_linkage_matrix(merges, r.index)
    return rename_internal_nodes(unnamed_tree)
Exemple #7
0
def _intersect_of_table_metadata_tree(table, metadata, tree):
    """ Matches tips, features and samples between the table, metadata
    and tree.  This function returns the features and samples that are
    contained in all 3 objects.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Subset of `table` with common row names as `metadata`
        and common columns as `tree.tips()`
    pd.DataFrame
        Subset of `metadata` with common row names as `table`
    skbio.TreeNode
        Subtree of `tree` with common tips as `table`.  If any internal
        node is unnamed, the internal nodes are renamed.

    Raises
    ------
    ValueError
        If `table` contains zeros or negative values, if the matched tree
        has no internal nodes, or if the matched table has no rows.
    """
    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in `table`. '
                         'Use pseudocounts or ``multiplicative_replacement``.')

    _table, _metadata = match(table, metadata)
    _table, _tree = match_tips(_table, tree)
    # One flag per internal node: True when that node has no name.
    non_tips_no_name = [(n.name is None) for n in _tree.levelorder()
                        if not n.is_tip()]
    if not non_tips_no_name:
        raise ValueError('There are no internal nodes in `tree` after '
                         'intersection with `table`.')

    if len(_table.index) == 0:
        raise ValueError('There are no samples in `table` after '
                         'intersection with `metadata`.')

    # Only relabel when at least one internal node lacks a name.
    if any(non_tips_no_name):
        _tree = rename_internal_nodes(_tree)
    return _table, _metadata, _tree
Exemple #8
0
def proportional_clustering(table: pd.DataFrame) -> skbio.TreeNode:
    """ Builds a feature tree from the proportionality between features.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
       In addition, the table must have strictly nonzero values.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to
       proportionality; its internal nodes have been renamed.
    """
    return rename_internal_nodes(proportional_linkage(table))
Exemple #9
0
def ilr_phylogenetic_ordination(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None
) -> (OrdinationResults, skbio.TreeNode, pd.DataFrame):
    """ Computes balances on a phylogeny and wraps them as an ordination.

    Parameters
    ----------
    table : pd.DataFrame
        Table whose columns are matched against the tips of `tree`.
    tree : skbio.TreeNode
        Tree that is copied; the copy is bifurcated, matched to `table`
        and its internal nodes are renamed.
    pseudocount : float
        Pseudocount used before taking logs, in both the top-variance
        and the explicit-clades code paths.  (default=0.5)
    top_k_var : int
        Number of highest-variance balances kept when `clades` is not
        given.  (default=10)
    clades : list, optional
        When given, its first element is split on commas and the
        resulting names are passed to `_fast_ilr` (presumably internal
        node names of the renamed tree -- confirm against callers).

    Returns
    -------
    OrdinationResults
        Ordination whose samples are the selected balances.
    skbio.TreeNode
        The matched tree with renamed internal nodes.
    pd.DataFrame
        Basis for the selected balances (index name 'featureid').
    """
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)
    if not clades:
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        # Keep only the `top_k_var` balances with the largest variance.
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        # Honor the caller's pseudocount instead of a hard-coded 0.5.
        balances, basis = _fast_ilr(_tree, _table, clades,
                                    pseudocount=pseudocount)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'
    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    balances = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)
    basis.index.name = 'featureid'
    return balances, _tree, basis
Exemple #10
0
 def test_rename_internal_nodes_mutable(self):
     """Renaming with inplace=True relabels the tree that was passed in."""
     mutable_tree = TreeNode.read(["(((a,b)y2, c),d)r;"])
     rename_internal_nodes(mutable_tree, inplace=True)
     expected_newick = "(((a,b)y2,c)y1,d)y0;\n"
     self.assertEqual(str(mutable_tree), expected_newick)
Exemple #11
0
 def test_rename_internal_nodes_names_mismatch(self):
     """Two names for a tree with three internal nodes is a ValueError."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     too_few_names = ['r', 'abc']
     with self.assertRaises(ValueError):
         rename_internal_nodes(source_tree, too_few_names)
Exemple #12
0
 def test_rename_internal_nodes_names(self):
     """Explicitly supplied names are assigned to the internal nodes."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     expected = TreeNode.read(["(((a,b)ab, c)abc,d)r;"])
     supplied_names = ['r', 'abc', 'ab']
     observed = rename_internal_nodes(source_tree, supplied_names)
     self.assertEqual(str(expected), str(observed))
Exemple #13
0
 def test_rename_internal_nodes(self):
     """Without explicit names the internal nodes become y0, y1, y2."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     expected = TreeNode.read(["(((a,b)y2, c)y1,d)y0;"])
     observed = rename_internal_nodes(source_tree)
     self.assertEqual(str(expected), str(observed))
Exemple #14
0
 def test_rename_internal_nodes_mutable(self):
     """Renaming with inplace=True relabels the tree that was passed in."""
     mutable_tree = TreeNode.read(["(((a,b)y2, c),d)r;"])
     rename_internal_nodes(mutable_tree, inplace=True)
     expected_newick = "(((a,b)y2,c)y1,d)y0;\n"
     self.assertEqual(str(mutable_tree), expected_newick)
Exemple #15
0
 def test_rename_internal_nodes_names_mismatch(self):
     """Two names for a tree with three internal nodes is a ValueError."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     too_few_names = ['r', 'abc']
     with self.assertRaises(ValueError):
         rename_internal_nodes(source_tree, too_few_names)
Exemple #16
0
 def test_rename_internal_nodes_names(self):
     """Explicitly supplied names are assigned to the internal nodes."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     expected = TreeNode.read(["(((a,b)ab, c)abc,d)r;"])
     supplied_names = ['r', 'abc', 'ab']
     observed = rename_internal_nodes(source_tree, supplied_names)
     self.assertEqual(str(expected), str(observed))
Exemple #17
0
 def test_rename_internal_nodes(self):
     """Without explicit names the internal nodes become y0, y1, y2."""
     source_tree = TreeNode.read(["(((a,b), c),d)r;"])
     expected = TreeNode.read(["(((a,b)y2, c)y1,d)y0;"])
     observed = rename_internal_nodes(source_tree)
     self.assertEqual(str(expected), str(observed))
Exemple #18
0
def correlation_linkage(X, method='ward'):
    r"""
    Hierarchical clustering of features based on their log-ratio variance.

    The hierarchy is built from the relationship between every pair of
    features.  Specifically, the relationship between two features
    :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly correlated.  A hierarchical clustering is
    then performed using this quantity as a distance metric.

    This can be useful for constructing principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    References
    ----------

    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    variation = variation_matrix(X)
    merges = linkage(variation.condensed_form(), method=method)
    unnamed_tree = TreeNode.from_linkage_matrix(merges, X.columns)
    return rename_internal_nodes(unnamed_tree)
Exemple #19
0
 def test_rename_internal_nodes_warning(self):
     """Renaming a tree that already has a named internal node warns."""
     prenamed_tree = TreeNode.read(["(((a,b)y2, c),d)r;"])
     with self.assertWarns(Warning):
         rename_internal_nodes(prenamed_tree)