Esempio n. 1
0
    def test_mean_niche_estimator_missing(self):
        """A NaN anywhere in the gradient must raise ValueError."""
        sample_ids = ['s1', 's2', 's3', 's4', 's5']
        gradient = pd.Series([1, 2, 3, 4, np.nan], index=sample_ids)
        values = pd.Series([1, 3, 0, 0, 0], index=sample_ids)

        with self.assertRaises(ValueError):
            mean_niche_estimator(values, gradient)
Esempio n. 2
0
    def test_mean_niche_estimator_bad_length(self):
        """values longer than the gradient must raise ValueError."""
        samples = ['s%d' % i for i in range(1, 6)]
        gradient = pd.Series([1, 2, 3, 4, 5], index=samples)
        # One extra sample relative to the gradient.
        values = pd.Series([1, 3, 0, 0, 0, 0], index=samples + ['s6'])

        with self.assertRaises(ValueError):
            mean_niche_estimator(values, gradient)
Esempio n. 3
0
    def test_mean_niche_estimator_missing(self):
        """Reject a gradient containing missing (NaN) measurements."""
        idx = ['s1', 's2', 's3', 's4', 's5']
        gradient = pd.Series([1, 2, 3, 4, np.nan], index=idx)
        values = pd.Series([1, 3, 0, 0, 0], index=idx)

        with self.assertRaises(ValueError):
            mean_niche_estimator(values, gradient)
Esempio n. 4
0
    def test_mean_niche_estimator_bad_length(self):
        """Length mismatch between values and gradient raises ValueError."""
        grad_index = ['s1', 's2', 's3', 's4', 's5']
        gradient = pd.Series([1, 2, 3, 4, 5], index=grad_index)
        # values carries one sample ('s6') the gradient lacks.
        values = pd.Series([1, 3, 0, 0, 0, 0], index=grad_index + ['s6'])

        with self.assertRaises(ValueError):
            mean_niche_estimator(values, gradient)
Esempio n. 5
0
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        # Binarize to presence/absence.  Use the builtin ``float``:
        # ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24,
        # so the previous ``astype(np.float)`` crashes on modern NumPy.
        table = (table > 0).astype(float)
    # Align samples shared between the table and the gradient.
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    # Order the tips by each feature's mean position along the gradient.
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
Esempio n. 6
0
 def test_mean_niche_estimator2(self):
     """Weighted mean of the gradient: (1*1 + 3*2) / (1+3) = 1.75."""
     idx = ['s1', 's2', 's3', 's4', 's5']
     gradient = pd.Series([1, 2, 3, 4, 5], index=idx)
     values = pd.Series([1, 3, 0, 0, 0], index=idx)
     self.assertEqual(mean_niche_estimator(values, gradient), 1.75)
Esempio n. 7
0
 def test_mean_niche_estimator2(self):
     """The niche estimate is the abundance-weighted gradient mean."""
     samples = ['s%d' % i for i in range(1, 6)]
     gradient = pd.Series([1, 2, 3, 4, 5], index=samples)
     values = pd.Series([1, 3, 0, 0, 0], index=samples)
     result = mean_niche_estimator(values, gradient)
     self.assertEqual(result, 1.75)
Esempio n. 8
0
 def test_mean_niche_estimator_frame(self):
     """A DataFrame input yields one niche mean per feature column."""
     idx = ['s1', 's2', 's3', 's4', 's5']
     gradient = pd.Series([1, 2, 3, 4, 5], index=idx)
     # Two identical feature columns -> two identical estimates.
     data = np.vstack([[1, 3, 0, 0, 0]] * 2).T
     values = pd.DataFrame(data, index=idx, columns=['o1', 'o2'])
     m = mean_niche_estimator(values, gradient)
     pdt.assert_series_equal(m, pd.Series([1.75, 1.75], index=['o1', 'o2']))
Esempio n. 9
0
 def test_mean_niche_estimator_frame(self):
     """Frame input: estimator maps each feature column to its mean niche."""
     samples = ['s1', 's2', 's3', 's4', 's5']
     gradient = pd.Series([1, 2, 3, 4, 5], index=samples)
     column = [1, 3, 0, 0, 0]
     values = pd.DataFrame(np.array([column, column]).T,
                           index=samples, columns=['o1', 'o2'])
     expected = pd.Series([1.75, 1.75], index=['o1', 'o2'])
     observed = mean_niche_estimator(values, gradient)
     pdt.assert_series_equal(observed, expected)
Esempio n. 10
0
def gradient_linkage(X, y, method='average'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering
    on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is  then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the features in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator
    """
    # NOTE: the docstring above must be a raw string (r""")  -- it contains
    # \sum and \frac, where \f is a form-feed escape and \s an invalid
    # escape sequence in a plain string literal.
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
Esempio n. 11
0
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        ignore_missing_samples: bool = False,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    ignore_missing_samples: bool
        Whether to except or ignore when there are samples present in the table
        that are not present in the gradient metadata.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    # Fail loudly on table samples missing from the metadata, unless the
    # caller explicitly asked to ignore them.
    missing = set(table.index) - set(c.index)
    if missing and not ignore_missing_samples:
        offenders = ', '.join(sorted(str(s) for s in missing))
        raise KeyError("There are samples present in the table not "
                       "present in the gradient metadata column. Override "
                       "this error by using the `ignore_missing_samples` "
                       "argument. Offending samples: %r" % offenders)
    if not weighted:
        # Collapse abundances to presence/absence.
        table = (table > 0).astype(float)
    table, c = match(table, c)
    tree = gradient_linkage(table, c, method='average')
    # Sort tips by each feature's mean position along the gradient.
    niche = pd.Series(mean_niche_estimator(table, c), index=table.columns)
    return gradient_sort(tree, niche.sort_values())
Esempio n. 12
0
def gradient_linkage(X, y, method='average'):
    r"""
    Hierarchical Clustering on known gradient.

    The hierarchy is built from the positions of the samples along a
    gradient.  Given a feature :math:`x`, the mean gradient value that
    :math:`x` was observed in is

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    where :math:`N` is the number of samples, :math:`x_i` is the proportion
    of feature :math:`x` in sample :math:`i`, and :math:`g_i` is the
    gradient value at sample :math:`i`.

    The distance between two features :math:`x` and :math:`y` is

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    so features with very similar mean gradient positions end up close
    together.  A hierarchical clustering is performed with :math:`d(x, y)`
    as the distance metric.  This can be useful for constructing principal
    balances.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Align samples, estimate each feature's mean gradient position, and
    # cluster features by the rank of that estimate.
    aligned_X, aligned_y = match(X, y)
    niche_means = mean_niche_estimator(aligned_X, gradient=aligned_y)
    return rank_linkage(niche_means)