コード例 #1
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
 def test_regression_results_coefficient(self):
     """Compare `coefficients()` with the expected Intercept/X table."""
     model = RegressionResults(self.results)
     expected_columns = {
         'Intercept': [1.447368, -0.052632],
         'X': [0.539474, 1.289474]
     }
     expected = pd.DataFrame(expected_columns, index=['Y1', 'Y2'])

     # The expected values are rounded to six decimals, hence the
     # approximate comparison.
     # NOTE(review): `check_less_precise` is deprecated in pandas >= 1.1 in
     # favour of `rtol`/`atol` -- confirm against the pinned pandas version.
     observed = model.coefficients()
     pdt.assert_frame_equal(observed, expected,
                            check_exact=False,
                            check_less_precise=True)
コード例 #2
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_regression_results_predict_extrapolate(self):
        """Predict for new samples k1-k3 with X = 8, 9 and 10."""
        new_samples = ['k1', 'k2', 'k3']
        covariates = pd.DataFrame({'X': [8, 9, 10]}, index=new_samples)
        observed = RegressionResults(self.results).predict(covariates)

        expected_by_sample = {
            'k1': [5.76315789, 10.26315789],
            'k2': [6.30263158, 11.55263158],
            'k3': [6.84210526, 12.84210526]
        }
        # Written with one column per sample, then transposed so that the
        # samples end up as rows.
        expected = pd.DataFrame(expected_by_sample, index=['Y1', 'Y2']).T

        pdt.assert_frame_equal(observed, expected)
コード例 #3
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_regression_results_coefficient_projection(self):
        """Projected coefficients match `ilr_inv` of the expected values."""
        intercept = ilr_inv(np.array([[1.447368, -0.052632]]))
        slope = ilr_inv(np.array([[0.539474, 1.289474]]))
        expected = pd.DataFrame({'Intercept': intercept, 'X': slope},
                                index=['Z1', 'Z2', 'Z3'])

        feature_names = ['Z1', 'Z2', 'Z3']
        model = RegressionResults(self.results,
                                  basis=_gram_schmidt_basis(3),
                                  feature_names=feature_names)

        # NOTE(review): `check_less_precise` is deprecated in pandas >= 1.1
        # in favour of `rtol`/`atol` -- confirm against the pinned version.
        observed = model.coefficients(project=True)
        pdt.assert_frame_equal(observed, expected,
                               check_exact=False,
                               check_less_precise=True)
コード例 #4
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_regression_results_predict_none(self):
        """Call `predict()` without arguments and compare with known fits."""
        observed = RegressionResults(self.results).predict()

        expected_by_sample = {
            's1': [1.986842, 1.236842],
            's2': [3.065789, 3.815789],
            's3': [2.526316, 2.526316],
            's4': [3.605263, 5.105263],
            's5': [3.065789, 3.815789],
            's6': [4.144737, 6.394737],
            's7': [3.605263, 5.105263]
        }
        # One column per sample, transposed so that samples become rows.
        expected = pd.DataFrame(expected_by_sample, index=['Y1', 'Y2']).T

        pdt.assert_frame_equal(observed, expected)
コード例 #5
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
 def test_regression_results_residuals(self):
     """Compare `residuals()` with the expected per-sample residuals."""
     model = RegressionResults(self.results)

     expected_by_sample = {
         's1': [-0.986842, -0.236842],
         's2': [-0.065789, -1.815789],
         's3': [1.473684, 0.473684],
         's4': [1.394737, -1.105263],
         's5': [-1.065789, 1.184211],
         's6': [-1.144737, -0.394737],
         's7': [0.394737, 1.894737]
     }
     # One column per sample, transposed so that samples become rows.
     expected = pd.DataFrame(expected_by_sample, index=['Y1', 'Y2']).T

     # NOTE(review): `check_less_precise` is deprecated in pandas >= 1.1 in
     # favour of `rtol`/`atol` -- confirm against the pinned pandas version.
     observed = model.residuals()
     pdt.assert_frame_equal(observed, expected,
                            check_exact=False,
                            check_less_precise=True)
コード例 #6
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_check_projection(self):
        """Projection requires both `basis` and `feature_names`.

        `_check_projection(True)` is expected to raise `ValueError` when
        either of the two arguments was omitted at construction time.
        """
        feature_names = ['Z1', 'Z2', 'Z3']
        basis = _gram_schmidt_basis(3)

        # Supplying both arguments must construct without raising.
        RegressionResults(self.results,
                          basis=basis,
                          feature_names=feature_names)

        # Test if feature_names is checked for
        res = RegressionResults(self.results, basis=basis)
        with self.assertRaises(ValueError):
            res._check_projection(True)

        # Test if basis is checked for
        res = RegressionResults(self.results, feature_names=feature_names)
        with self.assertRaises(ValueError):
            res._check_projection(True)
コード例 #7
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
 def test_regression_results_pvalues(self):
     """Compare the `pvalues` attribute with the expected table."""
     model = RegressionResults(self.results)
     expected_columns = {
         'Intercept': [0.307081, 0.972395],
         'X': [0.211391, 0.029677]
     }
     expected = pd.DataFrame(expected_columns, index=['Y1', 'Y2'])

     # NOTE(review): `check_less_precise` is deprecated in pandas >= 1.1 in
     # favour of `rtol`/`atol` -- confirm against the pinned pandas version.
     observed = model.pvalues
     pdt.assert_frame_equal(observed, expected,
                            check_exact=False,
                            check_less_precise=True)
コード例 #8
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_regression_results_predict_projection(self):
        """Projected predictions match `ilr_inv` of the expected values."""
        feature_names = ['Z1', 'Z2', 'Z3']
        model = RegressionResults(self.results,
                                  basis=_gram_schmidt_basis(3),
                                  feature_names=feature_names)
        observed = model.predict(self.data[['X']], project=True)

        # Expected (Y1, Y2) values per sample; each pair is mapped through
        # `ilr_inv` before the comparison.
        expected_values = [
            ('s1', [1.986842, 1.236842]),
            ('s2', [3.065789, 3.815789]),
            ('s3', [2.526316, 2.526316]),
            ('s4', [3.605263, 5.105263]),
            ('s5', [3.065789, 3.815789]),
            ('s6', [4.144737, 6.394737]),
            ('s7', [3.605263, 5.105263]),
        ]
        expected = pd.DataFrame(
            {sample: ilr_inv(np.array(values))
             for sample, values in expected_values},
            index=feature_names).T

        pdt.assert_frame_equal(observed, expected)
コード例 #9
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
 def test_regression_results_residuals_projection(self):
     """Projected residuals match `ilr_inv` of the expected residuals."""
     # Expected (Y1, Y2) residuals per sample; each pair is mapped through
     # `ilr_inv` before the comparison.
     expected_values = [
         ('s1', [-0.986842, -0.236842]),
         ('s2', [-0.065789, -1.815789]),
         ('s3', [1.473684, 0.473684]),
         ('s4', [1.394737, -1.105263]),
         ('s5', [-1.065789, 1.184211]),
         ('s6', [-1.144737, -0.394737]),
         ('s7', [0.394737, 1.894737]),
     ]
     expected = pd.DataFrame(
         {sample: ilr_inv(np.array(values))
          for sample, values in expected_values},
         index=['Z1', 'Z2', 'Z3']).T

     feature_names = ['Z1', 'Z2', 'Z3']
     model = RegressionResults(self.results,
                               basis=_gram_schmidt_basis(3),
                               feature_names=feature_names)

     # NOTE(review): `check_less_precise` is deprecated in pandas >= 1.1 in
     # favour of `rtol`/`atol` -- confirm against the pinned pandas version.
     observed = model.residuals(project=True)
     pdt.assert_frame_equal(observed, expected,
                            check_exact=False,
                            check_less_precise=True)
コード例 #10
0
def glm(formula, table, metadata, tree, groups, **kwargs):
    """ General Linear Models applied to balances.

    The table is converted to balances and one model is fitted per balance
    with `smf.GLM`, all sharing a design matrix that is built once from
    `formula`.  A balance whose fit raises `PerfectSeparationError` is left
    out of the returned fits; the skipped balances are reported through a
    single `RuntimeWarning`.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels. Note that the dependent variable in this string should
        not be specified, since this method will be run on each of the
        individual balances. See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.
    groups : object
        Not used by this function; it is only part of the signature.
    **kwargs : dict
        Other arguments passed on to the `GLM` constructor
        (see `statsmodels.genmod.generalized_linear_model.GLM`).

    Returns
    -------
    RegressionResults
        Container object that holds information about the overall fit.

    Warns
    -----
    RuntimeWarning
        If at least one balance was skipped because of perfect separation.
    """
    # Imported locally so the block does not depend on a file-level import.
    import warnings

    table, metadata, tree = _intersect_of_table_metadata_tree(
        table, metadata, tree)

    ilr_table, basis = _to_balances(table, tree)
    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

    # One-time creation of the exogenous data matrix allows for faster
    # run-time, since it is identical for every balance.
    exog_data = dmatrix(formula, data, return_type='dataframe')

    fits = []
    skipped = []
    for b in ilr_table.columns:
        endog_data = data[b]
        try:
            mdf = smf.GLM(endog=endog_data, exog=exog_data, **kwargs).fit()
        except PerfectSeparationError:
            # Best effort: keep fitting the remaining balances, but remember
            # which ones were dropped so the caller can be told.
            skipped.append(b)
            continue
        fits.append(mdf)

    if skipped:
        warnings.warn(
            'Perfect separation detected; no model was fitted for '
            'balance(s): %s' % ', '.join(str(b) for b in skipped),
            RuntimeWarning,
            stacklevel=2)

    return RegressionResults(fits,
                             basis=basis,
                             feature_names=table.columns,
                             balances=ilr_table,
                             tree=tree)
コード例 #11
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
    def test_r2(self):
        """`r2` equals 1 - SSE / SST computed from the known fitted values."""
        fitted_by_sample = {
            's1': [1.986842, 1.236842],
            's2': [3.065789, 3.815789],
            's3': [2.526316, 2.526316],
            's4': [3.605263, 5.105263],
            's5': [3.065789, 3.815789],
            's6': [4.144737, 6.394737],
            's7': [3.605263, 5.105263]
        }
        fittedvalues = pd.DataFrame(fitted_by_sample, index=['Y1', 'Y2']).T

        column_means = self.data.mean(axis=0)
        # Only the first two columns of the fixture are compared with the
        # fitted values.
        observed = self.data.iloc[:, :2]

        residual_ss = ((fittedvalues - observed)**2).sum().sum()
        total_ss = ((column_means - observed)**2).sum().sum()
        expected_r2 = 1 - (residual_ss / total_ss)

        model = RegressionResults(self.results)
        self.assertAlmostEqual(expected_r2, model.r2)
コード例 #12
0
ファイル: test_summary.py プロジェクト: gitter-badger/gneiss
 def test_regression_results_coefficient_project_error(self):
     """Projecting coefficients raises when built from results alone."""
     model = RegressionResults(self.results)
     # Callable form of assertRaises: invokes model.coefficients(project=True).
     self.assertRaises(ValueError, model.coefficients, project=True)
コード例 #13
0
def ols(formula, table, metadata, tree, **kwargs):
    """ Ordinary Least Squares applied to balances.

    An ordinary least squares regression is performed on nonzero relative
    abundance data given a list of covariates, or explanatory variables
    such as ph, treatment, etc to test for specific effects. The relative
    abundance data is transformed into balances using the ILR transformation,
    with a tree specifying the groupings of the features. The regression
    is then performed on each balance separately. Only positive data will
    be accepted, so if there are zeros present, consider using a zero
    imputation method such as ``multiplicative_replacement`` or add a
    pseudocount.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels. Note that the dependent variable in this string should
        not be specified, since this method will be run on each of the
        individual balances. See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.
    **kwargs : dict
        Other arguments accepted into `statsmodels.regression.linear_model.OLS`

    Returns
    -------
    RegressionResults
        Container object that holds information about the overall fit.

    Examples
    --------
    >>> from gneiss import ols
    >>> from skbio import TreeNode
    >>> import pandas as pd

    Here, we will define a table of proportions with 3 features
    `a`, `b`, and `c` across 5 samples.

    >>> proportions = pd.DataFrame(
    ...     [[0.720463, 0.175157, 0.104380],
    ...      [0.777794, 0.189095, 0.033111],
    ...      [0.796416, 0.193622, 0.009962],
    ...      [0.802058, 0.194994, 0.002948],
    ...      [0.803731, 0.195401, 0.000868]],
    ...     columns=['a', 'b', 'c'],
    ...     index=['s1', 's2', 's3', 's4', 's5'])

    Now we will define the environment variables that we want to
    regress against the proportions.

    >>> env_vars = pd.DataFrame({
    ...     'temp': [20, 20, 20, 20, 21],
    ...     'ph': [1, 2, 3, 4, 5]},
    ...     index=['s1', 's2', 's3', 's4', 's5'])

    Finally, we need to define a bifurcating tree used to convert the
    proportions to balances.  If the internal nodes aren't labeled,
    a default labeling will be applied (i.e. `y1`, `y2`, ...)

    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])

    Once these 3 variables are defined, a regression can be performed.
    These proportions will be converted to balances according to the
    tree specified.  And the regression formula is specified to run
    `temp` and `ph` against the proportions in a single model.

    >>> res = ols('temp + ph', proportions, env_vars, tree)

    From the summary results of the `ols` function, we can view the
    pvalues according to how well each individual balance fitted in the
    regression model.

    >>> res.pvalues
           Intercept            ph      temp
    Y1  2.479592e-01  1.990984e-11  0.243161
    Y2  6.089193e-10  5.052733e-01  0.279805

    We can also view the balance coefficients estimated in the regression
    model. These coefficients can also be viewed as proportions by passing
    `project=True` as an argument in `res.coefficients()`.

    >>> res.coefficients()
        Intercept            ph      temp
    Y1  -0.000499  9.999911e-01  0.000026
    Y2   1.000035  2.865312e-07 -0.000002

    The balance residuals from the model can be viewed as follows.  Again,
    these residuals can be viewed as proportions by passing `project=True`
    into `res.residuals()`

    >>> res.residuals()
                  Y1            Y2
    s1 -4.121647e-06 -2.998793e-07
    s2  6.226749e-07 -1.602904e-09
    s3  1.111959e-05  9.028437e-07
    s4 -7.620619e-06 -6.013615e-07
    s5 -1.332268e-14 -2.375877e-14

    The predicted balances can be obtained as follows.  Note that the predicted
    proportions can also be obtained by passing `project=True` into
    `res.predict()`

    >>> res.predict()
              Y1        Y2
    s1  1.000009  0.999999
    s2  2.000000  0.999999
    s3  2.999991  0.999999
    s4  3.999982  1.000000
    s5  4.999999  0.999998

    The overall model fit can be obtained as follows

    >>> res.r2
    0.99999999997996369

    See Also
    --------
    statsmodels.regression.linear_model.OLS
    skbio.stats.composition.multiplicative_replacement
    """
    table, metadata, tree = _intersect_of_table_metadata_tree(
        table, metadata, tree)
    ilr_table, basis = _to_balances(table, tree)

    # Balances and covariates are joined on the sample index so that each
    # per-balance formula can refer to both.
    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

    # One fit per balance: the balance name becomes the dependent variable
    # of the user-supplied right-hand side.
    fits = [
        smf.ols('%s ~ %s' % (balance, formula), data=data, **kwargs).fit()
        for balance in ilr_table.columns
    ]

    return RegressionResults(fits,
                             basis=basis,
                             feature_names=table.columns,
                             balances=ilr_table,
                             tree=tree)
コード例 #14
0
def mixedlm(formula, table, metadata, tree, groups, **kwargs):
    r""" Linear Mixed Effects Models applied to balances.

    A linear mixed effects model is performed on nonzero relative abundance
    data given a list of covariates, or explanatory variables such as ph,
    treatment, etc to test for specific effects. The relative abundance data
    is transformed into balances using the ILR transformation, using a tree to
    specify the groupings of the features. The linear mixed effects model is
    applied to each balance separately. Only positive data will be accepted,
    so if there are zeros present, consider using a zero imputation method
    such as ``skbio.stats.composition.multiplicative_replacement`` or
    add a pseudocount.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R.
        Note that the dependent variable in this string should not be
        specified, since this method will be run on each of the individual
        balances. See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.
    groups : str
        Column name in `metadata` that specifies the groups.  These groups are
        often associated with individuals repeatedly sampled, typically
        longitudinally.
    **kwargs : dict
        Other arguments accepted into
        `statsmodels.regression.mixed_linear_model.MixedLM`

    Returns
    -------
    RegressionResults
        Container object that holds information about the overall fit.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from skbio.stats.composition import ilr_inv
    >>> from skbio import TreeNode
    >>> from gneiss import mixedlm

    Here, we will define a table of proportions with 3 features
    `a`, `b`, and `c` across 12 samples.

    >>> table = pd.DataFrame({
    ...         'x1': ilr_inv(np.array([1.1, 1.1])),
    ...         'x2': ilr_inv(np.array([1., 2.])),
    ...         'x3': ilr_inv(np.array([1.1, 3.])),
    ...         'y1': ilr_inv(np.array([1., 2.1])),
    ...         'y2': ilr_inv(np.array([1., 3.1])),
    ...         'y3': ilr_inv(np.array([1., 4.])),
    ...         'z1': ilr_inv(np.array([1.1, 5.])),
    ...         'z2': ilr_inv(np.array([1., 6.1])),
    ...         'z3': ilr_inv(np.array([1.1, 7.])),
    ...         'u1': ilr_inv(np.array([1., 6.1])),
    ...         'u2': ilr_inv(np.array([1., 7.])),
    ...         'u3': ilr_inv(np.array([1.1, 8.1]))},
    ...         index=['a', 'b', 'c']).T

    Now we are going to define some of the external variables to
    test for in the model.  Here we will be testing a hypothetical
    longitudinal study across 3 time points, with 4 patients
    `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
    and `z` and `u` were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...         'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...         'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...         'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ...     }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...               'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Finally, we need to define a bifurcating tree used to convert the
    proportions to balances.  If the internal nodes aren't labeled,
    a default labeling will be applied (i.e. `y1`, `y2`, ...)

    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    >>> print(tree.ascii_art())
              /-c
    -Y1------|
             |          /-b
              \Y2------|
                        \-a

    Now we can run the linear mixed effects model on the proportions.
    Underneath the hood, the proportions will be transformed into balances,
    so that the linear mixed effects models can be run directly on balances.
    Since each patient was sampled repeatedly, we'll specify them separately
    in the groups.  In the linear mixed effects model `time` and `treatment`
    will be simultaneously tested for with respect to the balances.

    >>> res = mixedlm('time + treatment', table, metadata, tree,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.mixed_linear_model.MixedLM
    skbio.stats.composition.multiplicative_replacement
    ols

    """
    # NOTE: the docstring above is a raw string because the ASCII tree
    # contains backslashes that would otherwise be invalid escape sequences.
    table, metadata, tree = _intersect_of_table_metadata_tree(
        table, metadata, tree)
    ilr_table, basis = _to_balances(table, tree)

    # Balances and covariates are joined on the sample index so that each
    # per-balance formula can refer to both.
    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

    fits = []
    for b in ilr_table.columns:
        # mixed effects code is obtained here:
        # http://stackoverflow.com/a/22439820/1167475
        stats_formula = '%s ~ %s' % (b, formula)
        mdf = smf.mixedlm(stats_formula,
                          data=data,
                          groups=data[groups],
                          **kwargs).fit()
        fits.append(mdf)

    return RegressionResults(fits,
                             basis=basis,
                             feature_names=table.columns,
                             balances=ilr_table,
                             tree=tree)