Example #1
    def test_match_immutable(self):
        # tests to make sure that the original tables don't change.
        table = pd.DataFrame([[0, 0, 1, 1],
                              [2, 2, 4, 4],
                              [5, 5, 3, 3],
                              [0, 0, 0, 1]],
                             index=['s1', 's2', 's3', 's4'],
                             columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'],
                                 ['c', 'diseased'],
                                 ['b', 'control']],
                                index=['s1', 's3', 's2'],
                                columns=['Barcode', 'Treatment'])

        exp_table = pd.DataFrame([[0, 0, 1, 1],
                                  [2, 2, 4, 4],
                                  [5, 5, 3, 3],
                                  [0, 0, 0, 1]],
                                 index=['s1', 's2', 's3', 's4'],
                                 columns=['o1', 'o2', 'o3', 'o4'])
        exp_metadata = pd.DataFrame([['a', 'control'],
                                     ['c', 'diseased'],
                                     ['b', 'control']],
                                    index=['s1', 's3', 's2'],
                                    columns=['Barcode', 'Treatment'])
        match(table, metadata)
        pdt.assert_frame_equal(table, exp_table)
        pdt.assert_frame_equal(metadata, exp_metadata)
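The tests in this listing pin down the observable contract of `gneiss.util.match`: it intersects the sample IDs of the two inputs, returns re-aligned copies while leaving the originals untouched, and raises `ValueError` on duplicate IDs or an empty intersection. As a rough, hypothetical sketch of that contract only (not the library's actual implementation, which also accepts `biom.Table` inputs, as later examples show):

import pandas as pd


def match_sketch(table: pd.DataFrame, metadata: pd.DataFrame):
    # Illustrative re-implementation inferred from these tests;
    # gneiss.util.match may differ in detail.
    if table.index.has_duplicates or metadata.index.has_duplicates:
        raise ValueError("Duplicate sample IDs are not allowed.")
    common = table.index.intersection(metadata.index)
    if len(common) == 0:
        raise ValueError("No sample IDs are shared between the inputs.")
    # Returning freshly indexed copies keeps the inputs unchanged,
    # which is what test_match_immutable above asserts.
    return table.loc[common], metadata.loc[common]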
Example #2
    def test_match_duplicate(self):
        table1 = pd.DataFrame([[0, 0, 1, 1],
                               [2, 2, 4, 4],
                               [5, 5, 3, 3],
                               [0, 0, 0, 1]],
                              index=['s2', 's2', 's3', 's4'],
                              columns=['o1', 'o2', 'o3', 'o4'])
        metadata1 = pd.DataFrame([['a', 'control'],
                                  ['b', 'control'],
                                  ['c', 'diseased'],
                                  ['d', 'diseased']],
                                 index=['s1', 's2', 's3', 's4'],
                                 columns=['Barcode', 'Treatment'])

        table2 = pd.DataFrame([[0, 0, 1, 1],
                               [2, 2, 4, 4],
                               [5, 5, 3, 3],
                               [0, 0, 0, 1]],
                              index=['s1', 's2', 's3', 's4'],
                              columns=['o1', 'o2', 'o3', 'o4'])
        metadata2 = pd.DataFrame([['a', 'control'],
                                  ['b', 'control'],
                                  ['c', 'diseased'],
                                  ['d', 'diseased']],
                                 index=['s1', 's1', 's3', 's4'],
                                 columns=['Barcode', 'Treatment'])

        with self.assertRaises(ValueError):
            match(table1, metadata1)
        with self.assertRaises(ValueError):
            match(table2, metadata2)
Example #3
    def test_match_duplicate(self):
        table1 = pd.DataFrame([[0, 0, 1, 1],
                               [2, 2, 4, 4],
                               [5, 5, 3, 3],
                               [0, 0, 0, 1]],
                              index=['s2', 's2', 's3', 's4'],
                              columns=['o1', 'o2', 'o3', 'o4'])
        metadata1 = pd.DataFrame([['a', 'control'],
                                  ['b', 'control'],
                                  ['c', 'diseased'],
                                  ['d', 'diseased']],
                                 index=['s1', 's2', 's3', 's4'],
                                 columns=['Barcode', 'Treatment'])

        table2 = pd.DataFrame([[0, 0, 1, 1],
                               [2, 2, 4, 4],
                               [5, 5, 3, 3],
                               [0, 0, 0, 1]],
                              index=['s1', 's2', 's3', 's4'],
                              columns=['o1', 'o2', 'o3', 'o4'])
        metadata2 = pd.DataFrame([['a', 'control'],
                                  ['b', 'control'],
                                  ['c', 'diseased'],
                                  ['d', 'diseased']],
                                 index=['s1', 's1', 's3', 's4'],
                                 columns=['Barcode', 'Treatment'])

        with self.assertRaises(ValueError):
            match(table1, metadata1)
        with self.assertRaises(ValueError):
            match(table2, metadata2)
Example #4
    def test_match_immutable(self):
        # tests to make sure that the original tables don't change.
        table = pd.DataFrame([[0, 0, 1, 1],
                              [2, 2, 4, 4],
                              [5, 5, 3, 3],
                              [0, 0, 0, 1]],
                             index=['s1', 's2', 's3', 's4'],
                             columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'],
                                 ['c', 'diseased'],
                                 ['b', 'control']],
                                index=['s1', 's3', 's2'],
                                columns=['Barcode', 'Treatment'])

        exp_table = pd.DataFrame([[0, 0, 1, 1],
                                  [2, 2, 4, 4],
                                  [5, 5, 3, 3],
                                  [0, 0, 0, 1]],
                                 index=['s1', 's2', 's3', 's4'],
                                 columns=['o1', 'o2', 'o3', 'o4'])
        exp_metadata = pd.DataFrame([['a', 'control'],
                                     ['c', 'diseased'],
                                     ['b', 'control']],
                                    index=['s1', 's3', 's2'],
                                    columns=['Barcode', 'Treatment'])
        match(table, metadata)
        pdt.assert_frame_equal(table, exp_table)
        pdt.assert_frame_equal(metadata, exp_metadata)
Example #5
 def test_biom_match_no_common_ids(self):
     table = Table(
         np.array([[0, 0, 1, 1], [2, 3, 4, 4], [5, 5, 3, 3]]).T,
         ['a', 'b', 'c', 'd'], ['y2', 'y3', 'y4'])
     md = pd.DataFrame({
         'x1': [1, 3, 2],
         'x2': [1, 1, 0]
     },
                       columns=['s2', 's2', 's3']).T
     with self.assertRaises(ValueError):
         match(table, md)
Example #6
    def test_match_empty(self):
        table = pd.DataFrame(
            [[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]],
            index=['s1', 's2', 's3', 's4'],
            columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'], ['b', 'control'],
                                 ['c', 'diseased'], ['d', 'diseased']],
                                index=['a1', 'a2', 'a3', 'a4'],
                                columns=['Barcode', 'Treatment'])

        with self.assertRaises(ValueError):
            match(table, metadata)
Example #7
    def test_match_intersect(self):
        table = pd.DataFrame(
            [[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]],
            index=['s1', 's2', 's3', 's4'],
            columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame(
            [['a', 'control'], ['c', 'diseased'], ['b', 'control']],
            index=['s1', 's3', 's2'],
            columns=['Barcode', 'Treatment'])

        exp_table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3]],
                                 index=['s1', 's2', 's3'],
                                 columns=['o1', 'o2', 'o3', 'o4'])

        exp_metadata = pd.DataFrame(
            [['a', 'control'], ['b', 'control'], ['c', 'diseased']],
            index=['s1', 's2', 's3'],
            columns=['Barcode', 'Treatment'])

        res_table, res_metadata = match(table, metadata)
        # sort for comparison, since the match function
        # scrambles the names due to hashing.
        res_table = res_table.sort_index()
        res_metadata = res_metadata.sort_index()
        pdt.assert_frame_equal(exp_table, res_table)
        pdt.assert_frame_equal(exp_metadata, res_metadata)
Example #8
    def test_match_scrambled(self):
        table = pd.DataFrame(
            [[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]],
            index=['s1', 's2', 's3', 's4'],
            columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'], ['c', 'diseased'],
                                 ['b', 'control'], ['d', 'diseased']],
                                index=['s1', 's3', 's2', 's4'],
                                columns=['Barcode', 'Treatment'])
        exp_table = table
        exp_metadata = pd.DataFrame([['a', 'control'], ['b', 'control'],
                                     ['c', 'diseased'], ['d', 'diseased']],
                                    index=['s1', 's2', 's3', 's4'],
                                    columns=['Barcode', 'Treatment'])

        res_table, res_metadata = match(table, metadata)
        # make sure that the metadata and table indices match
        pdt.assert_index_equal(res_table.index, res_metadata.index)

        res_table = res_table.sort_index()
        exp_table = exp_table.sort_index()

        res_metadata = res_metadata.sort_index()
        exp_metadata = exp_metadata.sort_index()

        pdt.assert_frame_equal(exp_table, res_table)
        pdt.assert_frame_equal(exp_metadata, res_metadata)
Example #9
    def test_match_intersect(self):
        table = pd.DataFrame([[0, 0, 1, 1],
                              [2, 2, 4, 4],
                              [5, 5, 3, 3],
                              [0, 0, 0, 1]],
                             index=['s1', 's2', 's3', 's4'],
                             columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'],
                                 ['c', 'diseased'],
                                 ['b', 'control']],
                                index=['s1', 's3', 's2'],
                                columns=['Barcode', 'Treatment'])

        exp_table = pd.DataFrame([[0, 0, 1, 1],
                                  [2, 2, 4, 4],
                                  [5, 5, 3, 3]],
                                 index=['s1', 's2', 's3'],
                                 columns=['o1', 'o2', 'o3', 'o4'])

        exp_metadata = pd.DataFrame([['a', 'control'],
                                     ['b', 'control'],
                                     ['c', 'diseased']],
                                    index=['s1', 's2', 's3'],
                                    columns=['Barcode', 'Treatment'])

        res_table, res_metadata = match(table, metadata)
        # sort for comparison, since the match function
        # scrambles the names due to hashing.
        res_table = res_table.sort_index()
        res_metadata = res_metadata.sort_index()
        pdt.assert_frame_equal(exp_table, res_table)
        pdt.assert_frame_equal(exp_metadata, res_metadata)
Example #10
    def test_match(self):
        table = pd.DataFrame([[0, 0, 1, 1],
                              [2, 2, 4, 4],
                              [5, 5, 3, 3],
                              [0, 0, 0, 1]],
                             index=['s1', 's2', 's3', 's4'],
                             columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased'],
                                 ['d', 'diseased']],
                                index=['s1', 's2', 's3', 's4'],
                                columns=['Barcode', 'Treatment'])
        exp_table, exp_metadata = table, metadata
        res_table, res_metadata = match(table, metadata)

        # make sure that the metadata and table indices match
        pdt.assert_index_equal(res_table.index, res_metadata.index)

        res_table = res_table.sort_index()
        exp_table = exp_table.sort_index()

        res_metadata = res_metadata.sort_index()
        exp_metadata = exp_metadata.sort_index()

        pdt.assert_frame_equal(exp_table, res_table)
        pdt.assert_frame_equal(exp_metadata, res_metadata)
Example #11
 def test_biom_match_no_common_ids(self):
     table = Table(
         np.array([[0, 0, 1, 1],
                   [2, 3, 4, 4],
                   [5, 5, 3, 3]]).T,
         ['a', 'b', 'c', 'd'],
         ['y2', 'y3', 'y4'])
     md = pd.DataFrame(
         {
             'x1': [1, 3, 2],
             'x2': [1, 1, 0]
         },
         columns=['s2', 's2', 's3']
     ).T
     with self.assertRaises(ValueError):
         match(table, md)
Example #12
    def test_match_empty(self):
        table = pd.DataFrame([[0, 0, 1, 1],
                              [2, 2, 4, 4],
                              [5, 5, 3, 3],
                              [0, 0, 0, 1]],
                             index=['s1', 's2', 's3', 's4'],
                             columns=['o1', 'o2', 'o3', 'o4'])
        metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased'],
                                 ['d', 'diseased']],
                                index=['a1', 'a2', 'a3', 'a4'],
                                columns=['Barcode', 'Treatment'])

        with self.assertRaises(ValueError):
            match(table, metadata)
Example #13
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
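A minimal usage sketch for gradient_clustering, assuming the definition above (and a QIIME 2 environment providing qiime2.Metadata) is available; the sample IDs, feature IDs, and the 'depth' column are invented for illustration:

import pandas as pd
import qiime2

counts = pd.DataFrame([[4, 1, 0],
                       [2, 2, 1],
                       [0, 1, 5]],
                      index=['s1', 's2', 's3'],
                      columns=['o1', 'o2', 'o3'])
metadata = qiime2.Metadata(
    pd.DataFrame({'depth': [10.0, 20.0, 30.0]},
                 index=pd.Index(['s1', 's2', 's3'], name='id')))
tree = gradient_clustering(counts, metadata.get_column('depth'))
print(tree.ascii_art())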
Example #14
    def test_biom_match(self):
        table = Table(
            np.array([[0, 0, 1, 1], [2, 3, 4, 4], [5, 5, 3, 3]]).T,
            ['a', 'b', 'c', 'd'], ['s2', 's3', 's4'])
        md = pd.DataFrame({
            'x1': [1, 3, 2],
            'x2': [1, 1, 0]
        },
                          columns=['s1', 's2', 's3']).T

        exp_table = Table(
            np.array([[0, 0, 1, 1], [2, 3, 4, 4]]).T, ['a', 'b', 'c', 'd'],
            ['s2', 's3'])
        exp_md = pd.DataFrame({
            'x1': [3, 2],
            'x2': [1, 0]
        },
                              columns=['s2', 's3']).T

        res_table, res_md = match(table, md)
        exp_df = pd.DataFrame(exp_table.to_dataframe())
        res_df = pd.DataFrame(res_table.to_dataframe())

        exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
        res_df = res_df.reindex(sorted(res_df.columns), axis=1)

        pdt.assert_frame_equal(exp_df, res_df)

        exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
        res_md = res_md.reindex(sorted(res_md.index), axis=0)

        pdt.assert_frame_equal(res_md, exp_md)
Example #15
def gradient_linkage(X, y, method='average'):
    """
    Principal Balance Analysis using Hierarchical Clustering
    on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is  then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator
    """
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
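To make the docstring's formula concrete, here is a small hand computation with invented numbers: for a gradient g = (1, 2, 3) and features observed as x = (1, 1, 0) and y = (0, 1, 1), the mean gradient values are f(g, x) = 1.5 and f(g, y) = 2.5, so the clustering distance is d(x, y) = (1.5 - 2.5)^2 = 1.0:

import numpy as np

g = np.array([1.0, 2.0, 3.0])   # gradient value of each sample
x = np.array([1.0, 1.0, 0.0])   # counts of feature x across the samples
y = np.array([0.0, 1.0, 1.0])   # counts of feature y across the samples

f_x = np.sum(g * x / x.sum())   # 1*0.5 + 2*0.5 + 3*0.0 = 1.5
f_y = np.sum(g * y / y.sum())   # 1*0.0 + 2*0.5 + 3*0.5 = 2.5
d_xy = (f_x - f_y) ** 2         # squared difference used as the distance: 1.0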
Example #16
def niche_sort(table, gradient, niche_estimator=mean_niche_estimator):
    """ Sort the table according to estimated niches.

    Sorts the table by samples along the gradient
    and otus by their estimated niche along the gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples are rows and features (i.e. OTUs)
        are columns.
    gradient : pd.Series
        Vector of numerical gradient values.
    niche_estimator : function, optional
        A function that takes in two pandas series and returns an ordered
        object. The ability for the object to be ordered is critical, since
        this will allow the table to be sorted according to this ordering.
        By default, `mean_niche_estimator` will be used.

    Returns
    -------
    pd.DataFrame :
        Sorted table according to the gradient of the samples, and the niches
        of the organisms along that gradient.

    Raises
    ------
    ValueError :
        Raised if `niche_estimator` is not a function.
    """
    if not callable(niche_estimator):
        raise ValueError("`niche_estimator` is not a function.")

    table, gradient = match(table, gradient)
    niche_estimator = partial(niche_estimator, gradient=gradient)

    # normalizes feature abundances to sum to 1, for each sample.
    # (i.e. scales values in each row to sum to 1).
    normtable = table.apply(lambda x: x/x.sum(), axis=1)

    # calculates estimated niche for each feature
    est_niche = normtable.apply(niche_estimator, axis=0)
    gradient = gradient.sort_values()
    est_niche = est_niche.sort_values()

    table = table.reindex(index=gradient.index,
                          columns=est_niche.index)
    return table
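A minimal usage sketch for niche_sort, assuming gneiss is installed so that the definition above and its helpers (match, mean_niche_estimator) resolve; the table and gradient values are made up:

import pandas as pd

table = pd.DataFrame([[4, 1, 0],
                      [2, 2, 1],
                      [0, 1, 5]],
                     index=['s1', 's2', 's3'],
                     columns=['o1', 'o2', 'o3'])
gradient = pd.Series([30.0, 20.0, 10.0], index=['s1', 's2', 's3'])
sorted_table = niche_sort(table, gradient)
# Rows come back ordered by increasing gradient value and columns by the
# estimated niche of each feature along that gradient.
print(sorted_table)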
Example #17
def niche_sort(table, gradient, niche_estimator=mean_niche_estimator):
    """ Sort the table according to estimated niches.

    Sorts the table by samples along the gradient
    and otus by their estimated niche along the gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples are rows and features (i.e. OTUs)
        are columns.
    gradient : pd.Series
        Vector of numerical gradient values.
    niche_estimator : function, optional
        A function that takes in two pandas series and returns an ordered
        object. The ability for the object to be ordered is critical, since
        this will allow the table to be sorted according to this ordering.
        By default, `mean_niche_estimator` will be used.

    Returns
    -------
    pd.DataFrame :
        Sorted table according to the gradient of the samples, and the niches
        of the organisms along that gradient.

    Raises
    ------
    ValueError :
        Raised if `niche_estimator` is not a function.
    """
    if not callable(niche_estimator):
        raise ValueError("`niche_estimator` is not a function.")

    table, gradient = match(table, gradient)
    niche_estimator = partial(niche_estimator, gradient=gradient)

    # normalizes feature abundances to sum to 1, for each sample.
    # (i.e. scales values in each row to sum to 1).
    normtable = table.apply(lambda x: x/x.sum(), axis=1)

    # calculates estimated niche for each feature
    est_niche = normtable.apply(niche_estimator, axis=0)
    gradient = gradient.sort_values()
    est_niche = est_niche.sort_values()

    table = table.reindex(index=gradient.index,
                          columns=est_niche.index)
    return table
Example #18
def _intersect_of_table_metadata_tree(table, metadata, tree):
    """ Matches tips, features and samples between the table, metadata
    and tree.  This function returns the features and samples that are
    contained in all 3 objects.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Subset of `table` with common row names as `metadata`
        and common columns as `tree.tips()`
    pd.DataFrame
        Subset of `metadata` with common row names as `table`
    skbio.TreeNode
        Subtree of `tree` with common tips as `table`
    """
    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in `table`. '
                         'Use pseudocounts or ``multiplicative_replacement``.')

    _table, _metadata = match(table, metadata)
    _table, _tree = match_tips(_table, tree)
    non_tips_no_name = [(n.name is None) for n in _tree.levelorder()
                        if not n.is_tip()]
    if len(non_tips_no_name) == 0:
        raise ValueError('There are no internal nodes in `tree` after '
                         'intersection with `table`.')

    if len(_table.index) == 0:
        raise ValueError('There are no samples in `table` after '
                         'intersection with `metadata`.')

    if any(non_tips_no_name):
        _tree = rename_internal_nodes(_tree)
    return _table, _metadata, _tree
Example #19
 def test_biom_match_intersect(self):
     table = Table(
         np.array([[0, 0, 1, 1], [2, 3, 4, 4], [5, 5, 3, 3]]).T,
         ['a', 'b', 'c', 'd'], ['s1', 's2', 'y4'])
     md = pd.DataFrame([[0, 1], [1, 0], [1, 1]],
                       index=['s2', 's1', 's3'],
                       columns=['x1', 'x2'])
     exp_table = Table(
         np.array([[0, 0, 1, 1], [2, 3, 4, 4]]).T, ['a', 'b', 'c', 'd'],
         ['s1', 's2'])
     exp_md = pd.DataFrame([[1, 0], [0, 1]],
                           columns=['x1', 'x2'],
                           index=['s1', 's2'])
     res_table, res_md = match(table, md)
     pdt.assert_frame_equal(res_md, exp_md)
     exp_df = pd.DataFrame(exp_table.to_dataframe())
     res_df = pd.DataFrame(res_table.to_dataframe())
     pdt.assert_frame_equal(res_df, exp_df)
Example #20
    def test_biom_match(self):
        table = Table(
            np.array([[0, 0, 1, 1],
                      [2, 3, 4, 4],
                      [5, 5, 3, 3]]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3', 's4'])
        md = pd.DataFrame(
            {
                'x1': [1, 3, 2],
                'x2': [1, 1, 0]
            },
            columns=['s1', 's2', 's3']
        ).T

        exp_table = Table(
            np.array(
                [
                    [0, 0, 1, 1],
                    [2, 3, 4, 4]
                ]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3'])
        exp_md = pd.DataFrame(
            {
                'x1': [3, 2],
                'x2': [1, 0]
            },
            columns=['s2', 's3']
        ).T

        res_table, res_md = match(table, md)
        exp_df = pd.DataFrame(exp_table.to_dataframe())
        res_df = pd.DataFrame(res_table.to_dataframe())

        exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
        res_df = res_df.reindex(sorted(res_df.columns), axis=1)

        pdt.assert_frame_equal(exp_df, res_df)

        exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
        res_md = res_md.reindex(sorted(res_md.index), axis=0)

        pdt.assert_frame_equal(res_md, exp_md)
Example #21
def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        ignore_missing_samples: bool = False,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
       Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
       Continuous vector of measurements corresponding to samples.
    ignore_missing_samples : bool
       Whether to raise an error or ignore samples that are present in the
       table but absent from the gradient metadata.
    weighted : bool
       Specifies if abundance or presence/absence information
       should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
       Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not ignore_missing_samples:
        difference = set(table.index) - set(c.index)
        if difference:
            raise KeyError("There are samples present in the table not "
                           "present in the gradient metadata column. Override "
                           "this error by using the `ignore_missing_samples` "
                           "argument. Offending samples: %r" %
                           ', '.join(sorted([str(i) for i in difference])))
    if not weighted:
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t
Example #22
# Get OTU to taxa match
taxonomy=table.metadata_to_dataframe('observation')
taxonomy.columns=['kingdom', 'phylum', 'class', 'order', 
                             'family', 'genus', 'species']
taxonomy['taxonomy'] = taxonomy[taxonomy.columns].apply(lambda x: ';'.join(x), axis=1)

#mapping import 
map_file='cluster_models/keyboard.txt' #import metadata
mappingdf= pd.read_table('%s'%map_file, index_col=0,low_memory=False)
mappingdf=mappingdf.replace(np.nan,'Unknown', regex=True)
mappingdf.index=list(map(str,mappingdf.index))
mappingdf=mappingdf.astype(str)
mappingdf=mappingdf[~mappingdf.index.duplicated(keep='first')]

#match the tables
otutabledf,mappingdf=match(otutabledf,mappingdf[mappingdf['host_subject_id'].isin(['M2','M3','M9'])])

otutabledf=otutabledf.T[otutabledf.sum()>0].T
otutabledf=otutabledf[otutabledf.T.sum()>0]
otutabledf.columns=[str(x) for x in otutabledf.columns]

sorting_map={'M9':2,'M2':3,'M3':1}

mappingdf['host_num']=[int(sorting_map[x]) for x in mappingdf['host_subject_id']]
mappingdf=mappingdf.apply(pd.to_numeric, errors='ignore')

#sort by niche 
observed_table = niche_sort(otutabledf, mappingdf['host_num'])
mappingdf=mappingdf.T[observed_table.index].T
otutabledf=observed_table.copy()
Example #23
def convert_biom_to_pandas(table):
    otu_table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                             index=table.ids(axis='sample'),
                             columns=table.ids(axis='observation'))
    return otu_table


table = load_table('../data/dibd.biom')
otu_table = convert_biom_to_pandas(table)

mapping = pd.read_table("../data/dibd.map.txt",
                        sep='\t',
                        header=0,
                        index_col=0)
mapping = mapping.loc[mapping['disease_stat'].isin(['IBD', 'healthy'])]
mapping, otu_table = match(mapping, otu_table)

labels = np.array((mapping['disease_stat'] == 'IBD').astype(int))
dat = np.transpose(np.array(otu_table))

# normalization
sample_reads = np.sum(dat, axis=0)  # colSum: total reads in each sample
norm_length = 10000
dat_norm = dat / sample_reads * norm_length

# one group of the data
same = dat_norm[:, labels == 0]


def filtering(data, filterLev):
    otu_sum = np.sum(data, axis=1)
Example #24
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame,
                       tree: TreeNode,
                       metadata: qiime2.CategoricalMetadataColumn,
                       pseudocount: float = 0.5,
                       ndim: int = 10, method: str = 'clr',
                       color_map: str = 'viridis'):

    table, tree = match_tips(add_pseudocount(table, pseudocount), tree)
    nodes = [n.name for n in tree.levelorder() if not n.is_tip()]

    nlen = min(ndim, len(nodes))
    numerator_color, denominator_color = '#fb9a99', '#e31a1c'
    highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen,
                              index=nodes[:nlen])
    if method == 'clr':
        mat = pd.DataFrame(clr(centralize(table)),
                           index=table.index,
                           columns=table.columns)
    elif method == 'log':
        mat = pd.DataFrame(np.log(table),
                           index=table.index,
                           columns=table.columns)
    c = metadata.to_series()
    table, c = match(table, c)
    # TODO: There are a few hard-coded constants here
    # will need to have some adaptive defaults set in the future
    fig = heatmap(mat, tree, c, highlights, cmap=color_map,
                  highlight_width=0.01, figsize=(12, 8))
    fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    fig.savefig(os.path.join(output_dir, 'heatmap.pdf'))

    css = r"""
        .square {
          float: left;
          width: 100px;
          height: 20px;
          margin: 5px;
          border: 1px solid rgba(0, 0, 0, .2);
        }

        .numerator {
          background: %s;
        }

        .denominator {
          background: %s;
        }
    """ % (numerator_color, denominator_color)

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>Dendrogram heatmap</h1>\n')
        index_f.write('<img src="heatmap.svg" alt="heatmap">')
        index_f.write('<a href="heatmap.pdf">')
        index_f.write('Download as PDF</a><br>\n')
        index_f.write('<style>%s</style>' % css)
        index_f.write('<div class="square numerator">'
                      'Numerator<br/></div>')
        index_f.write('<div class="square denominator">'
                      'Denominator<br/></div>')
        index_f.write('</body></html>\n')
Example #25
def balance_taxonomy(output_dir: str, table: pd.DataFrame, tree: TreeNode,
                     taxonomy: pd.DataFrame,
                     balance_name: str,
                     pseudocount: float = 0.5,
                     taxa_level: int = 0,
                     n_features: int = 10,
                     threshold: float = None,
                     metadata: qiime2.MetadataColumn = None) -> None:
    if threshold is not None and isinstance(metadata,
                                            qiime2.CategoricalMetadataColumn):
        raise ValueError('Categorical metadata column detected. Only specify '
                         'a threshold when using a numerical metadata column.')

    # make sure that the table and tree match up
    table, tree = match_tips(add_pseudocount(table, pseudocount), tree)

    # parse out headers for taxonomy
    taxa_data = list(taxonomy['Taxon'].apply(lambda x: x.split(';')).values)
    taxa_df = pd.DataFrame(taxa_data, index=taxonomy.index)

    # fill in NAs
    def f(x):
        y = np.array(list(map(lambda k: k is not None, x)))
        i = max(0, np.where(y)[0][-1])
        x[np.logical_not(y)] = [x[i]] * np.sum(np.logical_not(y))
        return x
    taxa_df = taxa_df.apply(f, axis=1)

    num_clade = tree.find(balance_name).children[NUMERATOR]
    denom_clade = tree.find(balance_name).children[DENOMINATOR]

    if num_clade.is_tip():
        num_features = pd.DataFrame(
            {num_clade.name: taxa_df.loc[num_clade.name]}
            ).T
        r = 1
    else:
        num_features = taxa_df.loc[num_clade.subset()]
        r = len(list(num_clade.tips()))

    if denom_clade.is_tip():
        denom_features = pd.DataFrame(
            {denom_clade.name: taxa_df.loc[denom_clade.name]}
            ).T
        s = 1
    else:
        denom_features = taxa_df.loc[denom_clade.subset()]
        s = len(list(denom_clade.tips()))

    b = (np.log(table.loc[:, num_features.index]).mean(axis=1) -
         np.log(table.loc[:, denom_features.index]).mean(axis=1))

    b = b * np.sqrt(r * s / (r + s))
    balances = pd.DataFrame(b, index=table.index,
                            columns=[balance_name])

    # the actual colors for the numerator and denominator
    num_color = sns.color_palette("Paired")[0]
    denom_color = sns.color_palette("Paired")[1]

    fig, (ax_num, ax_denom) = plt.subplots(2)
    balance_barplots(tree, balance_name, taxa_level, taxa_df,
                     denom_color=denom_color, num_color=num_color,
                     axes=(ax_num, ax_denom))

    ax_num.set_title(
        r'$%s_{numerator} \; taxa \; (%d \; taxa)$' % (
            balance_name, len(num_features)))
    ax_denom.set_title(
        r'$%s_{denominator} \; taxa \; (%d \; taxa)$' % (
            balance_name, len(denom_features)))
    ax_denom.set_xlabel('Number of unique taxa')
    plt.tight_layout()
    fig.savefig(os.path.join(output_dir, 'barplots.svg'))
    fig.savefig(os.path.join(output_dir, 'barplots.pdf'))

    dcat = None
    multiple_cats = False
    if metadata is not None:
        fig2, ax = plt.subplots()
        c = metadata.to_series()
        data, c = match(balances, c)
        data[c.name] = c
        y = data[balance_name]

        # check if continuous
        if isinstance(metadata, qiime2.NumericMetadataColumn):
            ax.scatter(c.values, y)
            ax.set_xlabel(c.name)
            if threshold is None:
                threshold = c.mean()
            dcat = c.apply(
                lambda x: '%s < %f' % (c.name, threshold)
                if x < threshold
                else '%s > %f' % (c.name, threshold)
            )
            sample_palette = pd.Series(sns.color_palette("Set2", 2),
                                       index=dcat.value_counts().index)

        elif isinstance(metadata, qiime2.CategoricalMetadataColumn):

            sample_palette = pd.Series(
                sns.color_palette("Set2", len(c.value_counts())),
                index=c.value_counts().index)

            try:
                pd.to_numeric(metadata.to_series())
            except ValueError:
                pass
            else:
                raise ValueError('Categorical metadata column '
                                 f'{metadata.name!r} contains only numerical '
                                 'values. At least one value must be '
                                 'non-numerical.')

            balance_boxplot(balance_name, data, y=c.name, ax=ax,
                            palette=sample_palette)
            if len(c.value_counts()) > 2:
                warnings.warn(
                    'More than 2 categories detected in categorical metadata '
                    'column. Proportion plots will not be displayed',
                    stacklevel=2)
                multiple_cats = True
            else:
                dcat = c

        else:
            # Some other type of MetadataColumn
            raise NotImplementedError()

        ylabel = (r"$%s = \ln \frac{%s_{numerator}}"
                  "{%s_{denominator}}$") % (balance_name,
                                            balance_name,
                                            balance_name)
        ax.set_title(ylabel, rotation=0)
        ax.set_ylabel('log ratio')
        fig2.savefig(os.path.join(output_dir, 'balance_metadata.svg'))
        fig2.savefig(os.path.join(output_dir, 'balance_metadata.pdf'))

        if not multiple_cats:
            # Proportion plots
            # first sort by clr values and calculate average fold change
            ctable = pd.DataFrame(clr(centralize(table)),
                                  index=table.index, columns=table.columns)

            left_group = dcat.value_counts().index[0]
            right_group = dcat.value_counts().index[1]

            lidx, ridx = (dcat == left_group), (dcat == right_group)
            if b.loc[lidx].mean() > b.loc[ridx].mean():
                # double check ordering and switch if necessary
                # careful - the left group is also commonly associated with
                # the denominator.
                left_group = dcat.value_counts().index[1]
                right_group = dcat.value_counts().index[0]
                lidx, ridx = (dcat == left_group), (dcat == right_group)
            # we are not performing a statistical test here
            # we're just trying to figure out a way to sort the data.
            num_fold_change = ctable.loc[:, num_features.index].apply(
                lambda x: ttest_ind(x[ridx], x[lidx])[0])
            num_fold_change = num_fold_change.sort_values(
                ascending=False
            )

            denom_fold_change = ctable.loc[:, denom_features.index].apply(
                lambda x: ttest_ind(x[ridx], x[lidx])[0])
            denom_fold_change = denom_fold_change.sort_values(
                ascending=True
            )

            metadata = pd.DataFrame({dcat.name: dcat})
            top_num_features = num_fold_change.index[:n_features]
            top_denom_features = denom_fold_change.index[:n_features]

            fig3, (ax_denom, ax_num) = plt.subplots(1, 2)
            proportion_plot(
                table, metadata,
                category=metadata.columns[0],
                left_group=left_group,
                right_group=right_group,
                feature_metadata=taxa_df,
                label_col=taxa_level,
                num_features=top_num_features,
                denom_features=top_denom_features,
                # Note that the syntax is funky and counter
                # intuitive. This will need to be properly
                # fixed here
                # https://github.com/biocore/gneiss/issues/244
                num_color=sample_palette.loc[right_group],
                denom_color=sample_palette.loc[left_group],
                axes=(ax_num, ax_denom))
            # The below is overriding the default colors in the
            # numerator / denominator this will also need to be fixed in
            # https://github.com/biocore/gneiss/issues/244
            max_ylim, min_ylim = ax_denom.get_ylim()
            num_h, denom_h = n_features, n_features

            space = (max_ylim - min_ylim) / (num_h + denom_h)
            ymid = (max_ylim - min_ylim) * num_h
            ymid = ymid / (num_h + denom_h) - 0.5 * space

            ax_denom.axhspan(min_ylim, ymid,
                             facecolor=num_color,
                             zorder=0)
            ax_denom.axhspan(ymid, max_ylim,
                             facecolor=denom_color,
                             zorder=0)

            ax_num.axhspan(min_ylim, ymid,
                           facecolor=num_color,
                           zorder=0)
            ax_num.axhspan(ymid, max_ylim,
                           facecolor=denom_color,
                           zorder=0)

            fig3.subplots_adjust(
                # the left side of the subplots of the figure
                left=0.3,
                # the right side of the subplots of the figure
                right=0.9,
                # the bottom of the subplots of the figure
                bottom=0.1,
                # the top of the subplots of the figure
                top=0.9,
                # the amount of width reserved for blank space
                # between subplots
                wspace=0,
                # the amount of height reserved for white space
                # between subplots
                hspace=0.2,
            )

            fig3.savefig(os.path.join(output_dir, 'proportion_plot.svg'))
            fig3.savefig(os.path.join(output_dir, 'proportion_plot.pdf'))

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        if metadata is not None:
            index_f.write('<h1>Balance vs %s </h1>\n' % c.name)
            index_f.write(('<img src="balance_metadata.svg" '
                           'alt="barplots">\n\n'
                           '<a href="balance_metadata.pdf">'
                           'Download as PDF</a><br>\n'))

        if not multiple_cats:
            index_f.write('<h1>Proportion Plot </h1>\n')
            index_f.write(('<img src="proportion_plot.svg" '
                           'alt="proportions">\n\n'
                           '<a href="proportion_plot.pdf">'
                           'Download as PDF</a><br>\n'))

        index_f.write(('<h1>Balance Taxonomy</h1>\n'
                       '<img src="barplots.svg" alt="barplots">\n\n'
                       '<a href="barplots.pdf">'
                       'Download as PDF</a><br>\n'
                       '<h3>Numerator taxa</h3>\n'
                       '<a href="numerator.csv">\n'
                       'Download as CSV</a><br>\n'
                       '<h3>Denominator taxa</h3>\n'
                       '<a href="denominator.csv">\n'
                       'Download as CSV</a><br>\n'))

        num_features.to_csv(os.path.join(output_dir, 'numerator.csv'),
                            header=True, index=True)
        denom_features.to_csv(os.path.join(output_dir, 'denominator.csv'),
                              header=True, index=True)
        index_f.write('</body></html>\n')
Example #26
def main(_):

    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)
    # preprocessing
    train_table, train_metadata = opts.train_table, opts.train_metadata

    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = dmatrix(opts.formula,
                             train_metadata,
                             return_type='dataframe')
    train_table, train_metadata = match(train_table, train_metadata)

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup
    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = dmatrix(opts.formula,
                            test_metadata,
                            return_type='dataframe')
    test_table, test_metadata = match(test_table, test_metadata)

    # pad extra columns with zeros, so that we can still make predictions
    extra_columns = list(
        set(train_metadata.columns) - set(test_metadata.columns))
    df = pd.DataFrame(
        {C: np.zeros(test_metadata.shape[0])
         for C in extra_columns},
        index=test_metadata.index)

    test_metadata = pd.concat((test_metadata, df), axis=1)

    p = train_metadata.shape[1]  # number of covariates
    G_data = train_metadata.values
    y_data = train_table.matrix_data.tocoo().T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape
    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
    num_neg = opts.num_neg_samples
    clipping_size = opts.clipping_size

    epoch = y_data.nnz // batch_size
    num_iter = int(opts.epochs_to_train * epoch)
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:

        Gpos_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_pos')
        Gneg_ph = tf.placeholder(tf.float32, [num_neg, p], name='G_neg')
        G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                   name='G_holdout')
        Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                   name='Y_holdout')

        Y_ph = tf.placeholder(tf.float32, [batch_size], name='Y_ph')

        pos_row = tf.placeholder(tf.int32, shape=[batch_size], name='pos_row')
        pos_col = tf.placeholder(tf.int32, shape=[batch_size], name='pos_col')
        neg_row = tf.placeholder(tf.int32, shape=[num_neg], name='neg_row')
        neg_col = tf.placeholder(tf.int32, shape=[num_neg], name='neg_col')

        neg_data = tf.zeros(shape=[num_neg], name='neg_data', dtype=tf.float32)
        total_zero = tf.constant(y_data.shape[0] * y_data.shape[1] -
                                 y_data.nnz,
                                 dtype=tf.float32)
        total_nonzero = tf.constant(y_data.nnz, dtype=tf.float32)

        qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
        # sample bias (for overdispersion)
        # theta = tf.Variable(tf.random_normal([N, 1]), name='theta')
        theta = tf.constant(np.log(train_table.sum(axis='sample')),
                            dtype=tf.float32)

        qbeta = tf.Variable(tf.random_normal([p, D]), name='qB')

        # species bias
        gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                       scale=tf.ones([1, D]) * gamma_scale,
                       name='gamma')
        # regression coefficents distribution
        beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                      scale=tf.ones([p, D]) * beta_scale,
                      name='B')

        V = tf.concat([qgamma, qbeta], axis=0)

        # add bias terms for samples
        Gpos = tf.concat([tf.ones([batch_size, 1]), Gpos_ph], axis=1)
        Gneg = tf.concat([tf.ones([num_neg, 1]), Gneg_ph], axis=1)

        # sparse matrix multiplication for positive samples
        pos_prime = tf.reduce_sum(tf.multiply(
            Gpos, tf.transpose(tf.gather(V, pos_col, axis=1))),
                                  axis=1)
        pos_phi = tf.reshape(tf.gather(theta, pos_row), shape=[batch_size
                                                               ]) + pos_prime

        Y = Poisson(log_rate=pos_phi, name='Y')

        # sparse matrix multiplication for negative samples
        neg_prime = tf.reduce_sum(tf.multiply(
            Gneg, tf.transpose(tf.gather(V, neg_col, axis=1))),
                                  axis=1)
        neg_phi = tf.reshape(tf.gather(theta, neg_row), shape=[num_neg
                                                               ]) + neg_prime
        neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

        loss = -(
            tf.reduce_sum(gamma.log_prob(qgamma)) + \
            tf.reduce_sum(beta.log_prob(qbeta)) + \
            tf.reduce_sum(Y.log_prob(Y_ph)) * (total_nonzero / batch_size) + \
            tf.reduce_sum(neg_poisson.log_prob(neg_data)) * (total_zero / num_neg)
        )

        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, clipping_size)
        train = optimizer.apply_gradients(zip(gradients, variables))

        with tf.name_scope('accuracy'):
            holdout_count = tf.reduce_sum(Y_holdout, axis=1)
            pred = tf.reshape(
                holdout_count,
                [-1, 1]) * tf.nn.softmax(tf.matmul(G_holdout, qbeta) + qgamma)
            mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
            tf.summary.scalar('mean_absolute_error', mse)

        tf.summary.scalar('loss', loss)
        tf.summary.histogram('qbeta', qbeta)
        tf.summary.histogram('qgamma', qgamma)
        tf.summary.histogram('theta', theta)
        merged = tf.summary.merge_all()

        tf.global_variables_initializer().run()

        writer = tf.summary.FileWriter(save_path, session.graph)
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        losses = np.array([0.] * num_iter)
        idx = np.arange(train_metadata.shape[0])
        log_handle = open(os.path.join(save_path, 'run.log'), 'w')
        gen = subsampler(y_data, num_pos=batch_size, num_neg=num_neg)
        start_time = time.time()
        last_checkpoint_time = 0
        saver = tf.train.Saver()
        for i in range(num_iter):
            batch_idx = np.random.choice(idx, size=batch_size)
            batch = next(gen)
            (positive_row, positive_col, positive_data, negative_row,
             negative_col) = batch
            feed_dict = {
                Y_ph: positive_data,
                Y_holdout: y_test.astype(np.float32),
                G_holdout: test_metadata.values.astype(np.float32),
                Gpos_ph: G_data[positive_row, :],
                Gneg_ph: G_data[negative_row, :],
                pos_row: positive_row,
                pos_col: positive_col,
                neg_row: negative_row,
                neg_col: negative_col
            }
            if i % 1000 == 0:
                _, summary, train_loss, grads = session.run(
                    [train, merged, loss, gradients],
                    feed_dict=feed_dict,
                    options=run_options,
                    run_metadata=run_metadata)
            elif i % 5000 == 0:
                _, summary, err, train_loss, grads = session.run(
                    [train, mse, merged, loss, gradients], feed_dict=feed_dict)
                writer.add_summary(summary, i)
            else:
                _, summary, train_loss, grads = session.run(
                    [train, merged, loss, gradients], feed_dict=feed_dict)
                writer.add_summary(summary, i)

            now = time.time()
            if now - last_checkpoint_time > checkpoint_interval:
                saver.save(session,
                           os.path.join(opts.save_path, "model.ckpt"),
                           global_step=i)
                last_checkpoint_time = now

            losses[i] = train_loss

        elapsed_time = time.time() - start_time
        print('Elapsed Time: %f seconds' % elapsed_time)

        # Cross validation
        pred_beta = qbeta.eval()
        pred_gamma = qgamma.eval()
        mse, mrc = cross_validation(test_metadata.values, pred_beta,
                                    pred_gamma, y_test)
        print("MSE: %f, MRC: %f" % (mse, mrc))
Example #27
def preprocess(formula,
               train_table,
               train_metadata,
               test_table,
               test_metadata,
               min_sample_count=10,
               min_feature_count=10):
    """ Performs data preprocessing.

  Parameters
  ----------
  formula : str
     Statistical formula specifying the design matrix of covariates
     in the study design.
  train_table : biom.Table
     Biom table containing the feature counts within the training dataset.
  train_metadata : pd.DataFrame
     Sample metadata table containing all of the measured covariates in
     the training dataset.
  test_table : biom.Table
     Biom table containing the feature counts within the holdout dataset.
  test_metadata : pd.DataFrame
     Sample metadata table containing all of the measured covariates in
     the holdout test dataset.
  min_sample_count : int
     Minimum number of total counts within a sample to be kept.
  min_feature_count : int
     Minimum number of total counts within a feature to be kept.

  Returns
  -------
  train_table : biom.Table
     Biom table containing the feature counts within the training dataset.
  train_metadata : pd.DataFrame
     Sample metadata table containing all of the measured covariates in
     the training dataset.
  test_table : biom.Table
     Biom table containing the feature counts within the holdout dataset.
  test_metadata : pd.DataFrame
     Sample metadata table containing all of the measured covariates in
     the holdout test dataset.

  Notes
  -----
  This assumes that the biom tables can fit into memory - will
  require some extra consideration when this is no longer the case.
  """
    # preprocessing
    train_table, train_metadata = train_table, train_metadata
    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > min_feature_count
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = dmatrix(formula, train_metadata, return_type='dataframe')
    train_table, train_metadata = match(train_table, train_metadata)

    # hold out data preprocessing
    test_table, test_metadata = test_table, test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup
    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = dmatrix(formula, test_metadata, return_type='dataframe')
    test_table, test_metadata = match(test_table, test_metadata)

    # pad extra columns with zeros, so that we can still make predictions
    extra_columns = list(
        set(train_metadata.columns) - set(test_metadata.columns))
    df = pd.DataFrame(
        {C: np.zeros(test_metadata.shape[0])
         for C in extra_columns},
        index=test_metadata.index)
    test_metadata = pd.concat((test_metadata, df), axis=1)

    return train_table, test_table, train_metadata, test_metadata
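A hedged usage sketch for preprocess; the file paths and the formula are placeholders, and the call assumes biom, pandas, and patsy are installed alongside the function above:

import biom
import pandas as pd

# Hypothetical inputs; swap in real files and a formula matching your metadata.
train_table = biom.load_table('train.biom')
test_table = biom.load_table('test.biom')
train_metadata = pd.read_table('train_metadata.txt', index_col=0)
test_metadata = pd.read_table('test_metadata.txt', index_col=0)

train_table, test_table, train_metadata, test_metadata = preprocess(
    'C(treatment)',   # patsy-style formula; 'treatment' is an invented column
    train_table, train_metadata,
    test_table, test_metadata,
    min_sample_count=10, min_feature_count=10)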
Example #28
from skbio import TreeNode
from gneiss.util import match
from sklearn.cross_decomposition import PLSSVD
from skbio.stats.composition import clr, centralize, multiplicative_replacement
from biplot import make_biplot

plt.rcParams['svg.fonttype'] = 'none'

args = sys.argv[1:]

mapping = pd.read_table(args[0], index_col=0, sep='\t')
microbes = qiime2.Artifact.load(args[1]).view(pd.DataFrame)
metabolites = qiime2.Artifact.load(args[2]).view(pd.DataFrame)

# run the pairwise matches twice so that all three tables end up on the same set of samples
mapping, microbes = match(mapping, microbes)
mapping, metabolites = match(mapping, metabolites)
microbes, metabolites = match(microbes, metabolites)

mapping, microbes = match(mapping, microbes)
mapping, metabolites = match(mapping, metabolites)
microbes, metabolites = match(microbes, metabolites)

catdict = {
    i + 1: val
    for i, val in enumerate(sorted(mapping['category'].unique().tolist()))
}

n = mapping.shape[0]
print('Number of samples: %d' % n)
print('Number of microbes: %d' % microbes.shape[1])
Example #29
def balance_taxonomy(output_dir: str, balances: pd.DataFrame, tree: TreeNode,
                     taxonomy: pd.DataFrame,
                     balance_name: Str,
                     taxa_level: Int = 0,
                     metadata: MetadataCategory = None) -> None:

    # parse out headers for taxonomy
    taxa_data = list(taxonomy['Taxon'].apply(lambda x: x.split(';')).values)
    taxa_df = pd.DataFrame(taxa_data,
                           index=taxonomy.index)

    # fill in NAs
    def f(x):
        y = np.array(list(map(lambda k: k is not None, x)))
        i = max(0, np.where(y)[0][-1])
        x[np.logical_not(y)] = [x[i]] * np.sum(np.logical_not(y))
        return x
    taxa_df = taxa_df.apply(f, axis=1)

    num_clade = tree.find(balance_name).children[NUMERATOR]
    denom_clade = tree.find(balance_name).children[DENOMINATOR]

    if num_clade.is_tip():
        num_features = pd.DataFrame(
            {num_clade.name: taxa_df.loc[num_clade.name]}
            ).T
    else:
        num_features = taxa_df.loc[num_clade.subset()]

    if denom_clade.is_tip():
        denom_features = pd.DataFrame(
            {denom_clade.name: taxa_df.loc[denom_clade.name]}
            ).T
    else:
        denom_features = taxa_df.loc[denom_clade.subset()]

    num_color, denom_color = '#4c72b0', '#4c72b0'

    fig, (ax_num, ax_denom) = plt.subplots(2)
    balance_barplots(tree, balance_name, taxa_level, taxa_df,
                     denom_color=denom_color, num_color=num_color,
                     axes=(ax_num, ax_denom))

    ax_num.set_title(
        r'$%s_{numerator} \; taxa \; (%d \; taxa)$' % (balance_name,
                                                       len(num_features)))
    ax_denom.set_title(
        r'$%s_{denominator} \; taxa \; (%d \; taxa)$' % (balance_name,
                                                         len(denom_features)))
    ax_denom.set_xlabel('Number of unique taxa')
    plt.tight_layout()
    fig.savefig(os.path.join(output_dir, 'barplots.svg'))
    fig.savefig(os.path.join(output_dir, 'barplots.pdf'))

    if metadata is not None:
        fig2, ax = plt.subplots()
        c = metadata.to_series()
        data, c = match(balances, c)
        data[c.name] = c
        y = data[balance_name]
        # check if continuous
        try:
            c = c.astype(np.float64)
            ax.scatter(c.values, y)
            ax.set_xlabel(c.name)
        except:
            balance_boxplot(balance_name, data, y=c.name, ax=ax)

        ylabel = (r"$%s = \ln \frac{%s_{numerator}}"
                  "{%s_{denominator}}$") % (balance_name,
                                            balance_name,
                                            balance_name)
        ax.set_title(ylabel, rotation=0)
        ax.set_ylabel('log ratio')
        fig2.savefig(os.path.join(output_dir, 'balance_metadata.svg'))
        fig2.savefig(os.path.join(output_dir, 'balance_metadata.pdf'))

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        if metadata is not None:
            index_f.write('<h1>Balance vs %s </h1>\n' % c.name)
            index_f.write(('<img src="balance_metadata.svg" '
                           'alt="barplots">\n\n'
                           '<a href="balance_metadata.pdf">'
                           'Download as PDF</a><br>\n'))

        index_f.write(('<h1>Balance Taxonomy</h1>\n'
                       '<img src="barplots.svg" alt="barplots">\n\n'
                       '<a href="barplots.pdf">'
                       'Download as PDF</a><br>\n'
                       '<h3>Numerator taxa</h3>\n'
                       '<a href="numerator.csv">\n'
                       'Download as CSV</a><br>\n'
                       '<h3>Denominator taxa</h3>\n'
                       '<a href="denominator.csv">\n'
                       'Download as CSV</a><br>\n'))

        num_features.to_csv(os.path.join(output_dir, 'numerator.csv'),
                            header=True, index=True)
        denom_features.to_csv(os.path.join(output_dir, 'denominator.csv'),
                              header=True, index=True)
        index_f.write('</body></html>\n')
Example #30
def gradient_linkage(X, y, method='average'):
    r"""
    Hierarchical Clustering on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    t = rank_linkage(mean_X)
    return t