Beispiel #1
0
    def test_summary_head(self):
        # Fits an OLS model on a small composition table and compares the
        # text of its one-dimension summary with a stored reference file.
        sample_balances = [
            ('s1', [1., 3.]),
            ('s2', [2., 2.]),
            ('s3', [1., 3.]),
            ('s4', [3., 4.]),
            ('s5', [1., 5.]),
        ]
        # Each pair of coordinates is mapped through ilr_inv and labelled
        # with the three feature names a, b, c.
        table = pd.DataFrame(
            {sample: ilr_inv(np.array(coords))
             for sample, coords in sample_balances},
            index=['a', 'b', 'c']).T
        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        metadata = pd.DataFrame(
            {'lame': [1, 2, 1, 4, 1], 'real': [1, 2, 3, 4, 5]},
            index=['s1', 's2', 's3', 's4', 's5'])

        np.random.seed(0)
        # Show the complete diff if the summary text does not match.
        self.maxDiff = None
        model = ols('real', table, metadata, tree)
        model.fit()

        expected_path = get_data_path('exp_ols_results2.txt')
        observed = str(model.summary(ndim=1))
        with open(expected_path, 'r') as handle:
            expected = handle.read()
        self.assertEqual(observed, expected)
Beispiel #2
0
 def test_ilr_inv_basis_one_dimension_error(self):
     # ilr_inv is expected to reject this clr-transformed single-row basis
     # with a ValueError.
     basis = clr(np.array([[0.80442968, 0.19557032]]))
     scale = np.sqrt(1/2)
     ratios = [1/10,
               1.14141414 / 9.90909091,
               1.28282828 / 9.81818182,
               1.42424242 / 9.72727273,
               1.56565657 / 9.63636364]
     # Five scaled log-ratios arranged as a single column.
     table = np.array([[np.log(ratio)*scale for ratio in ratios]]).T
     with self.assertRaises(ValueError):
         ilr_inv(table, basis=basis)
Beispiel #3
0
 def test_ilr_inv_basis_one_dimension_error(self):
     # A basis that went through clr must make ilr_inv raise ValueError.
     basis = clr(np.array([[0.80442968, 0.19557032]]))
     # (numerator, denominator) pairs whose scaled log-ratios form the
     # single column handed to ilr_inv.
     pairs = [(1, 10),
              (1.14141414, 9.90909091),
              (1.28282828, 9.81818182),
              (1.42424242, 9.72727273),
              (1.56565657, 9.63636364)]
     column = [np.log(num / den)*np.sqrt(1/2) for num, den in pairs]
     table = np.array([column]).T
     with self.assertRaises(ValueError):
         ilr_inv(table, basis=basis)
Beispiel #4
0
def _regression(y, X, basis=None):
    """
    Performs a simplicial ordinary least squares on a set of
    compositions and a response variable

    Parameters
    ----------
    y : numpy.ndarray, float
       a matrix of proportions where
       rows correspond to samples and
       columns correspond to features.
    X : numpy.ndarray, float
       independent variable.  It is used exactly as given; no intercept
       column is added by this function.
    basis : optional
       Basis forwarded to `ilr` for the forward transform and to
       `ilr_inv` for every back-transform, so both directions use the
       same coordinates.  If None, those functions use their own default.

    Returns
    -------
    predict:
       predicted values, mapped back with `ilr_inv`
    b:
       estimated coefficients, mapped back with `ilr_inv`
    resid:
       residuals, mapped back with `ilr_inv`
    r2: float
       coefficient of determination, computed in ilr coordinates

    Raises
    ------
    ValueError
       If `X` has more than two dimensions.
    """
    y = np.atleast_2d(y)
    X = np.atleast_2d(X)
    if X.ndim != 2:
        raise ValueError('X must have at most 2 dimensions, found %d'
                         % X.ndim)

    def _to_simplex(coords):
        # A 1-D vector is turned into a single column before ilr_inv.
        if len(coords.shape) == 1:
            coords = np.atleast_2d(coords).T
        return ilr_inv(coords, basis=basis)

    y_ = ilr(y, basis=basis)

    # Least squares through the pseudo-inverse of the normal equations.
    inv = np.linalg.pinv(np.dot(X.T, X))
    cross = np.dot(inv, X.T)
    b_ = np.dot(cross, y_)
    predict_ = np.dot(X, b_)
    resid_ = y_ - predict_
    sst = y_ - y_.mean(axis=0)
    r2 = 1 - ((resid_**2).sum() / (sst**2).sum())

    b = _to_simplex(b_)
    predict = _to_simplex(predict_)
    resid = _to_simplex(resid_)
    return predict, b, resid, r2
Beispiel #5
0
 def setUp(self):
     # Shared fixtures: a five-sample composition table, an annotated and
     # an unannotated tree over the same tips, and per-sample metadata.
     sample_ids = ['s1', 's2', 's3', 's4', 's5']
     second_coords = [1., 2., 3., 4., 5.]
     self.table = pd.DataFrame(
         {sample: ilr_inv(np.array([1., coord]))
          for sample, coord in zip(sample_ids, second_coords)},
         index=['a', 'b', 'c']).T
     self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
     self.unannotated_tree = TreeNode.read(['(c, (b,a));'])
     self.metadata = pd.DataFrame(
         {'lame': [1, 1, 1, 1, 1], 'real': [1, 2, 3, 4, 5]},
         index=sample_ids)
Beispiel #6
0
def compositional_noise(cov, nsamp, rng=None):
    """
    Draws multivariate normal samples and maps them through `ilr_inv`.

    Parameters
    ----------
    cov: array_like
       Covariance matrix of the normal distribution that is sampled.
       The original documentation states it is expressed in ilr space
       using the default gram-schmidt orthonormal basis.
    nsamp: int
       Number of samples to draw.
    rng: np.random.RandomState
       Random state used for the draw.  When omitted, a new
       `RandomState(0)` is created on every call.

    Returns
    -------
    np.array:
       The result of `ilr_inv` applied to the `nsamp` draws.  Per the
       original documentation this is a matrix of probabilities with
       one row per sample and one column per species.
    """
    random_state = RandomState(0) if rng is None else rng
    ilr_draws = multivariate_normal.rvs(cov=cov, size=nsamp,
                                        random_state=random_state)
    return ilr_inv(ilr_draws)
Beispiel #7
0
    def test_regression_results_residuals_projection(self):
        # Residuals requested with a tree must equal the expected balance
        # residuals pushed through ilr_inv with that tree's basis.
        tree = TreeNode.read([r'(c, (a, b)Y2)Y1;'])
        basis, _ = balance_basis(tree)
        sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
        balance_values = [
            [-0.986842, -0.236842],
            [-0.065789, -1.815789],
            [1.473684, 0.473684],
            [1.394737, -1.105263],
            [-1.065789, 1.184211],
            [-1.144737, -0.394737],
            [0.394737, 1.894737],
        ]
        balance_resid = pd.DataFrame(
            dict(zip(sample_ids, balance_values)),
            index=['Y1', 'Y2']).T
        exp_resid = pd.DataFrame(
            ilr_inv(balance_resid, basis),
            index=sample_ids,
            columns=['c', 'a', 'b'])

        fitted_models = [self.model1, self.model2]
        res = submock(Y=self.balances, Xs=None)
        # The submodels are attached to the submock class itself.
        submock.submodels = fitted_models
        res.fit()
        observed = res.residuals(tree).sort_index()
        pdt.assert_frame_equal(observed,
                               exp_resid,
                               check_exact=False,
                               check_less_precise=True)
Beispiel #8
0
    def coefficients(self, tree=None):
        """ Returns coefficients from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the coefficients will be represented as
            proportions. Otherwise, if this is not specified, the
            coefficients will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of coefficients where rows are covariates,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions labelled by the tree's tip names.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # The exception has to be raised; merely constructing it lets
            # an unfitted model fall through to the code below.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        coef = self._beta
        if tree is None:
            return coef

        # Map the balance coefficients back through the tree's basis.
        basis, _ = balance_basis(tree)
        c = ilr_inv(coef.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(c, columns=ids, index=coef.index)
Beispiel #9
0
    def coefficients(self, tree=None):
        """ Collects the fitted parameters of every result into one table.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            Tree whose basis (from `balance_basis`) is used to map the
            parameters through `ilr_inv`.  If omitted, the parameters are
            returned without that transformation. (default: None).

        Returns
        -------
        pd.DataFrame
            Without `tree`: rows are the parameter labels and there is one
            column per result, named after its `endog_names`.
            With `tree`: rows are the tree's tip names and the columns are
            the parameter labels.
        """
        # NOTE(review): DataFrame.append is not available in pandas >= 2.0.
        stacked = pd.DataFrame()
        for result in self.results:
            params = result.params
            params.name = result.model.endog_names
            stacked = stacked.append(params)

        if tree is None:
            return stacked.T

        basis, _ = balance_basis(tree)
        projected = ilr_inv(stacked.values.T, basis=basis).T
        tip_names = [n.name for n in tree.tips()]
        return pd.DataFrame(projected, index=tip_names,
                            columns=stacked.columns)
Beispiel #10
0
    def coefficients(self, tree=None):
        """ Returns coefficients from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the coefficients will be represented as
            proportions. Otherwise, if this is not specified, the
            coefficients will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of coefficients where rows are covariates,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions labelled by the tree's tip names.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # Without `raise` the exception object was created and then
            # discarded, so the guard never stopped an unfitted model.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        coef = self._beta
        if tree is None:
            return coef

        # Project the balance coefficients with the basis of `tree`.
        basis, _ = balance_basis(tree)
        c = ilr_inv(coef.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(c, columns=ids, index=coef.index)
Beispiel #11
0
def partition_microbes(num_microbes, sigmaQ, microbe_in, state):
    """ Split up a single microbe abundances into multiple strains.

    Parameters
    ----------
    num_microbes : int
        Number of strains to be represented.  The normal distribution
        that is sampled has `num_microbes - 1` dimensions.
    sigmaQ : float
        Value placed on every diagonal entry of the covariance matrix.
    microbe_in : np.array
        The input abundances for a single species; one draw is made per
        entry.
    state : numpy random state
        Random number generator providing `multivariate_normal`.

    Returns
    -------
    microbes_out : np.array
        `ilr_inv` of the draws, multiplied element-wise by `microbe_in`
        reshaped to a single column.
    """
    ilr_dim = num_microbes - 1
    num_samples = len(microbe_in)

    draws = state.multivariate_normal(
        mean=np.zeros(ilr_dim),
        cov=np.diag([sigmaQ] * ilr_dim),
        size=num_samples)
    strain_fractions = ilr_inv(draws)

    # Scale each row of strain fractions by the matching input abundance.
    abundance_column = microbe_in.reshape(-1, 1)
    return np.multiply(strain_fractions, abundance_column)
Beispiel #12
0
    def test_regression_results_coefficient_projection(self):
        # Projected coefficients must equal the balance coefficients
        # mapped through ilr_inv and labelled with the feature names.
        balance_coefs = [
            ('Intercept', [[1.447368, -0.052632]]),
            ('X', [[0.539474, 1.289474]]),
        ]
        exp_coef = pd.DataFrame(
            {label: ilr_inv(np.array(coords))
             for label, coords in balance_coefs},
            index=['Z1', 'Z2', 'Z3'])
        feature_names = ['Z1', 'Z2', 'Z3']
        basis = _gram_schmidt_basis(3)
        res = RegressionResults(self.results,
                                basis=basis,
                                feature_names=feature_names)

        observed = res.coefficients(project=True)
        pdt.assert_frame_equal(observed,
                               exp_coef,
                               check_exact=False,
                               check_less_precise=True)
Beispiel #13
0
 def test_regression_results_residuals_projection(self):
     # Projected residuals must equal the balance residuals mapped
     # through ilr_inv.
     balance_resid = [
         ('s1', [-0.986842, -0.236842]),
         ('s2', [-0.065789, -1.815789]),
         ('s3', [1.473684, 0.473684]),
         ('s4', [1.394737, -1.105263]),
         ('s5', [-1.065789, 1.184211]),
         ('s6', [-1.144737, -0.394737]),
         ('s7', [0.394737, 1.894737]),
     ]
     exp_resid = pd.DataFrame(
         {sample: ilr_inv(np.array(coords))
          for sample, coords in balance_resid},
         index=['a', 'b', 'c']).T
     # note that in the example, the basis is not strictly
     # equivalent to the tree
     basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                          index=['Y1', 'Y2'],
                          columns=['a', 'b', 'c'])
     fitted_models = [self.model1, self.model2]
     res = submock(submodels=fitted_models,
                   basis=basis,
                   tree=self.tree,
                   balances=self.balances)
     res.fit()
     observed = res.residuals(project=True)
     pdt.assert_frame_equal(observed,
                            exp_resid,
                            check_exact=False,
                            check_less_precise=True)
Beispiel #14
0
    def test_regression_results_predict_projection(self):
        # Projected predictions for column X must equal the expected
        # balance predictions mapped through ilr_inv.
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        fitted_models = [self.model1, self.model2]
        res = submock(submodels=fitted_models,
                      basis=basis,
                      tree=self.tree,
                      balances=self.balances)
        res.fit()

        res_predict = res.predict(self.data[['X']], project=True)
        balance_predictions = [
            ('s1', [1.986842, 1.236842]),
            ('s2', [3.065789, 3.815789]),
            ('s3', [2.526316, 2.526316]),
            ('s4', [3.605263, 5.105263]),
            ('s5', [3.065789, 3.815789]),
            ('s6', [4.144737, 6.394737]),
            ('s7', [3.605263, 5.105263]),
        ]
        exp_predict = pd.DataFrame(
            {sample: ilr_inv(np.array(coords))
             for sample, coords in balance_predictions},
            index=['a', 'b', 'c']).T

        pdt.assert_frame_equal(res_predict, exp_predict)
Beispiel #15
0
    def test_regression_results_coefficient_projection(self):
        # Projected coefficients must equal the balance coefficients
        # mapped through ilr_inv and labelled a, b, c.
        balance_coefs = [
            ('Intercept', [[1.447368, -0.052632]]),
            ('X', [[0.539474, 1.289474]]),
        ]
        exp_coef = pd.DataFrame(
            {label: ilr_inv(np.array(coords))
             for label, coords in balance_coefs},
            index=['a', 'b', 'c'])
        # note that in the example, the basis is not strictly
        # equivalent to the tree
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])

        fitted_models = [self.model1, self.model2]
        res = submock(submodels=fitted_models, basis=basis,
                      tree=self.tree, balances=self.balances)
        res.fit()
        observed = res.coefficients(project=True)
        pdt.assert_frame_equal(observed, exp_coef,
                               check_exact=False,
                               check_less_precise=True)
Beispiel #16
0
    def test_mixedlm_balances(self):
        # Simulates grouped data with random slopes and two variance
        # components, maps the two responses through ilr_inv, and checks
        # the p-values and coefficients reported by mixedlm.
        np.random.seed(6241)
        n = 1600
        # Integer division: array dimensions must be ints, and `n / 16`
        # is a float under true division.
        n_groups = n // 16
        exog = np.random.normal(size=(n, 2))
        groups = np.kron(np.arange(n_groups), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n_groups, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y1"] = endog
        df["y2"] = endog + 2 * 2
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        iv = ilr_inv(df[["y1", "y2"]].values)
        table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
        metadata = df[['x1', 'x2', 'groups']]

        res = mixedlm("x1 + x2", table, metadata, tree, groups="groups")
        exp_pvalues = pd.DataFrame(
            [[4.923122e-236,  3.180390e-40,  3.972325e-35,  3.568599e-30],
             [9.953418e-02,  3.180390e-40,  3.972325e-35,  3.568599e-30]],
            index=['Y1', 'Y2'],
            columns=['Intercept', 'Intercept RE', 'x1', 'x2'])

        pdt.assert_frame_equal(res.pvalues, exp_pvalues,
                               check_less_precise=True)

        exp_coefficients = pd.DataFrame(
            [[4.211451,  -0.305906, 1.022008, 0.924873],
             [0.211451,  -0.305906, 1.022008, 0.924873]],
            columns=['Intercept', 'Intercept RE', 'x1', 'x2'],
            index=['Y1', 'Y2'])

        pdt.assert_frame_equal(res.coefficients(), exp_coefficients,
                               check_less_precise=True)
Beispiel #17
0
    def test_ols_ilr_inv_test(self):
        # Compares the ols model's p-values, and its tree-projected
        # coefficients, residuals and predictions, with values derived
        # from the two reference fits r1_ and r2_.
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)

        # p-values stay in balance coordinates
        exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                    'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp_pvalues)

        # coefficients
        balance_coef = pd.DataFrame({'y1': self.r1_.params,
                                     'y2': self.r2_.params})
        exp_coef = pd.DataFrame(ilr_inv(balance_coef, basis),
                                columns=['c', 'b', 'a'],
                                index=self.X.columns)
        pdt.assert_frame_equal(model.coefficients(tree=self.tree), exp_coef)

        # residuals
        balance_resid = pd.DataFrame({'y1': self.r1_.resid,
                                      'y2': self.r2_.resid},
                                     index=self.Y.index)
        exp_resid = pd.DataFrame(ilr_inv(balance_resid, basis),
                                 index=self.Y.index,
                                 columns=['c', 'b', 'a'])
        pdt.assert_frame_equal(model.residuals(tree=self.tree), exp_resid)

        # predictions
        balance_pred = pd.DataFrame({'y1': self.r1_.predict(),
                                     'y2': self.r2_.predict()},
                                    index=self.Y.index)
        exp_pred = pd.DataFrame(ilr_inv(balance_pred, basis),
                                index=self.Y.index,
                                columns=['c', 'b', 'a'])
        pdt.assert_frame_equal(model.predict(tree=self.tree), exp_pred)
    def test_ilr_basis_isomorphism(self):
        # tests to make sure that the isomorphism holds
        # with the introduction of the basis.
        basis = np.array([[0.80442968, 0.19557032]])
        scale = np.sqrt(1 / 2)
        ratios = [
            1 / 10,
            1.14141414 / 9.90909091,
            1.28282828 / 9.81818182,
            1.42424242 / 9.72727273,
            1.56565657 / 9.63636364,
        ]
        # Round trip 1: balances -> ilr_inv -> ilr gives the balances back.
        balances = np.array([[np.log(ratio) * scale for ratio in ratios]]).T
        simplex = ilr_inv(balances, basis=basis)
        npt.assert_allclose(ilr(simplex, basis=basis), balances.squeeze())

        # Round trip 2: counts -> ilr -> ilr_inv gives the closed counts.
        counts = np.array([[1., 10.], [1.14141414, 9.90909091],
                           [1.28282828, 9.81818182], [1.42424242, 9.72727273],
                           [1.56565657, 9.63636364]])
        coords = np.atleast_2d(ilr(counts, basis=basis)).T
        npt.assert_allclose(ilr_inv(coords, basis=basis),
                            closure(counts.squeeze()))
Beispiel #19
0
    def test_ilr_basis_isomorphism(self):
        # tests to make sure that the isomorphism holds
        # with the introduction of the basis.
        basis = np.array([[0.80442968, 0.19557032]])
        pairs = [(1, 10),
                 (1.14141414, 9.90909091),
                 (1.28282828, 9.81818182),
                 (1.42424242, 9.72727273),
                 (1.56565657, 9.63636364)]
        column = [np.log(num / den)*np.sqrt(1/2) for num, den in pairs]
        table = np.array([column]).T
        # ilr undoes ilr_inv when both are given the same basis.
        recovered = ilr(ilr_inv(table, basis=basis), basis=basis)
        npt.assert_allclose(recovered, table.squeeze())

        table = np.array([[1., 10.],
                          [1.14141414, 9.90909091],
                          [1.28282828, 9.81818182],
                          [1.42424242, 9.72727273],
                          [1.56565657, 9.63636364]])

        # ilr_inv undoes ilr up to closure of the input rows.
        as_column = np.atleast_2d(ilr(table, basis=basis)).T
        recovered = ilr_inv(as_column, basis=basis)
        npt.assert_allclose(recovered, closure(table.squeeze()))
Beispiel #20
0
    def predict(self, X=None, project=False, **kwargs):
        """ Predicts values with every fitted result and combines them.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, each result predicts
            without new data and the index of its `fittedvalues` is used
            to label the samples.
        project : bool, optional
            If True, the predictions are mapped through `ilr_inv` with
            `self.basis`, i.e. projected back into the Aitchison
            simplex [1]_.  If False they are returned as balances
            (default: False).
        **kwargs : dict
            Other arguments to be passed into the model prediction.

        Returns
        -------
        pd.DataFrame
            A table where rows are samples and the columns are either
            balances (one per result) or `self.feature_names`, depending
            on the value of `project`.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        self._check_projection(project)

        # NOTE(review): DataFrame.append is not available in pandas >= 2.0.
        stacked = pd.DataFrame()
        for result in self.results:
            fitted = pd.Series(result.predict(X, **kwargs))
            fitted.name = result.model.endog_names
            # Label the samples from X when given, else from the fit.
            fitted.index = result.fittedvalues.index if X is None else X.index
            stacked = stacked.append(fitted)

        if not project:
            return stacked.T

        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        projected = ilr_inv(stacked.values.T,
                            basis=self.basis,
                            check=False)
        return pd.DataFrame(projected,
                            columns=self.feature_names,
                            index=stacked.columns)
Beispiel #21
0
    def setUp(self):
        # Fixtures: a five-sample composition table with two trees and
        # metadata, plus a seeded synthetic regression data set (x, y, t2).
        sample_ids = ['s1', 's2', 's3', 's4', 's5']
        self.table = pd.DataFrame(
            {sample: ilr_inv(np.array([1., float(position)]))
             for position, sample in enumerate(sample_ids, start=1)},
            index=['a', 'b', 'c']).T
        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        self.unannotated_tree = TreeNode.read(['(c, (b,a));'])
        self.metadata = pd.DataFrame(
            {'lame': [1, 1, 1, 1, 1], 'real': [1, 2, 3, 4, 5]},
            index=sample_ids)

        np.random.seed(0)
        n = 15
        weights = np.array([1, 4.2, 5.3, -2.2, 8])
        x1 = np.linspace(.01, 0.1, n)
        x2 = np.logspace(0, 0.01, n)
        x3 = np.exp(np.linspace(0, 0.01, n))
        x4 = x1 ** 2
        self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})
        # Linear response in the four covariates plus standard normal noise.
        noise = np.random.normal(size=n)
        y = (weights[0] + weights[1]*x1 + weights[2]*x2 + weights[3]*x3 +
             weights[4]*x4 + noise)
        # Two balance columns: the response and a tenth of it.
        sy = np.vstack((y, y/10)).T
        self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c'])
        self.t2 = TreeNode.read([r"((a,b)n,c);"])
Beispiel #22
0
    def test_ols_ilr_inv_test(self):
        # Compares the ols model's p-values, and its tree-projected
        # coefficients, residuals and predictions, with values derived
        # from the two reference fits r1_ and r2_.
        tip_order = ['c', 'b', 'a']
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)

        def per_balance(first, second, **frame_kwargs):
            # One column per reference fit, labelled y1 and y2.
            return pd.DataFrame({'y1': first, 'y2': second}, **frame_kwargs)

        def as_proportions(balances, row_labels):
            # Map balances through ilr_inv and label the tip columns.
            return pd.DataFrame(ilr_inv(balances, basis),
                                index=row_labels,
                                columns=tip_order)

        # test pvalues
        exp = per_balance(self.r1_.pvalues, self.r2_.pvalues)
        pdt.assert_frame_equal(model.pvalues, exp)

        # test coefficients
        exp = per_balance(self.r1_.params, self.r2_.params)
        exp = as_proportions(exp, self.X.columns)
        res = model.coefficients(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test residuals
        exp = per_balance(self.r1_.resid, self.r2_.resid,
                          index=self.Y.index)
        exp = as_proportions(exp, self.Y.index)
        res = model.residuals(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test prediction
        exp = per_balance(self.r1_.predict(), self.r2_.predict(),
                          index=self.Y.index)
        exp = as_proportions(exp, self.Y.index)
        res = model.predict(tree=self.tree)
        pdt.assert_frame_equal(res, exp)
Beispiel #23
0
    def coefficients(self, project=False):
        """ Collects the fitted parameters of every result into one table.

        Parameters
        ----------
        project : bool, optional
            If True, the parameters are mapped through `ilr_inv` with
            `self.basis`, i.e. projected back into the Aitchison
            simplex [1]_.  If False they are returned as balances
            (default: False).

        Returns
        -------
        pd.DataFrame
            A table of values where columns are coefficients, and the index
            is either balances (one row per result) or
            `self.feature_names`, depending on the value of `project`.

        Raises
        ------
        ValueError:
            Per the original documentation, raised when projection is
            requested but `basis` or `feature_names` is not specified.
            The check itself lives in `_check_projection`.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        self._check_projection(project)

        # NOTE(review): DataFrame.append is not available in pandas >= 2.0.
        stacked = pd.DataFrame()
        for result in self.results:
            params = result.params
            params.name = result.model.endog_names
            stacked = stacked.append(params)

        if not project:
            return stacked

        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        projected = ilr_inv(stacked.values.T, basis=self.basis,
                            check=False).T
        return pd.DataFrame(projected,
                            index=self.feature_names,
                            columns=stacked.columns)
Beispiel #24
0
 def test_ilr_inv_basis(self):
     # ilr_inv with an explicit basis must return the closed count table
     # from which the scaled log-ratios below were taken.
     counts = np.array([[1., 10.],
                        [1.14141414, 9.90909091],
                        [1.28282828, 9.81818182],
                        [1.42424242, 9.72727273],
                        [1.56565657, 9.63636364]])
     exp = closure(counts)
     basis = np.array([[0.80442968, 0.19557032]])
     scale = np.sqrt(1/2)
     ratios = [1/10,
               1.14141414 / 9.90909091,
               1.28282828 / 9.81818182,
               1.42424242 / 9.72727273,
               1.56565657 / 9.63636364]
     table = np.array([[np.log(ratio)*scale for ratio in ratios]]).T
     npt.assert_allclose(ilr_inv(table, basis=basis), exp)
Beispiel #25
0
 def test_ilr_inv_basis(self):
     # The scaled log-ratio column, mapped through ilr_inv with the given
     # basis, must reproduce the closure of the two-column table.
     exp = closure(np.array([[1., 10.],
                             [1.14141414, 9.90909091],
                             [1.28282828, 9.81818182],
                             [1.42424242, 9.72727273],
                             [1.56565657, 9.63636364]]))
     basis = np.array([[0.80442968, 0.19557032]])
     pairs = [(1, 10),
              (1.14141414, 9.90909091),
              (1.28282828, 9.81818182),
              (1.42424242, 9.72727273),
              (1.56565657, 9.63636364)]
     column = [np.log(num / den)*np.sqrt(1/2) for num, den in pairs]
     table = np.array([column]).T
     observed = ilr_inv(table, basis=basis)
     npt.assert_allclose(observed, exp)
Beispiel #26
0
    def setUp(self):
        # Fixtures: an output directory, a five-tip tree with taxonomy,
        # a balance table and its ilr_inv projection, and several
        # metadata columns indexed by the same seven sample ids.
        self.results = "results"
        # Create the directory without a separate existence check, so a
        # directory created concurrently does not make mkdir fail.
        os.makedirs(self.results, exist_ok=True)

        self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])

        self.taxonomy = pd.DataFrame(
            [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1],
             ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9],
             ['nom;tu;k;l;m;t;o', 0.9]],
            columns=['Taxon', 'Confidence'],
            index=['x', 'y', 'z', 'k', 'q'])

        # An earlier five-row `self.balances` assignment was removed: it
        # was overwritten here before anything read it.
        self.balances = pd.DataFrame(
            [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
             [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]],
            index=['d', 'a', 'b', 'c'],
            columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T
        basis, _ = balance_basis(self.tree)
        self.table = pd.DataFrame(
            ilr_inv(self.balances, basis),
            columns=['x', 'y', 'z', 'k', 'q'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'])

        index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id')
        self.categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                      index=index,
                      name='categorical'))
        self.multi_categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                      index=index,
                      name='multi_categorical'))
        # NOTE(review): this series reuses the name 'multi_categorical';
        # confirm whether that is intended.
        self.partial_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                      index=index,
                      name='multi_categorical'))
        self.full_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                      index=index,
                      name='numerical_categorical'))
        self.continuous = NumericMetadataColumn(
            pd.Series(np.arange(7), index=index, name='continuous'))
Beispiel #27
0
    def residuals(self, project=False):
        """ Collects the residuals of every result into one table.

        Parameters
        ----------
        project : bool, optional
            If True, the residuals are mapped through `ilr_inv` with
            `self.basis`, i.e. projected back into the Aitchison
            simplex [1]_.  If False they are returned as balances
            (default: False).

        Returns
        -------
        pd.DataFrame
            A table of values where rows are samples, and the columns
            are either balances (one per result) or `self.feature_names`,
            depending on the value of `project`.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        self._check_projection(project)

        # NOTE(review): DataFrame.append is not available in pandas >= 2.0.
        stacked = pd.DataFrame()
        for result in self.results:
            errors = result.resid
            errors.name = result.model.endog_names
            stacked = stacked.append(errors)

        if not project:
            return stacked.T

        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        projected = ilr_inv(stacked.values.T, basis=self.basis,
                            check=False).T
        by_feature = pd.DataFrame(projected,
                                  index=self.feature_names,
                                  columns=stacked.columns)
        return by_feature.T
Beispiel #28
0
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Other arguments to be passed into the model prediction.

        Returns
        -------
        pd.DataFrame
            A table of predicted values where rows are samples,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).
        """
        per_balance = []
        for m in self.results:
            p = pd.Series(m.predict(X, **kwargs))
            p.name = m.model.endog_names
            # Label predictions by sample: use the rows of `X` when given,
            # otherwise the samples the model was trained on.
            p.index = X.index if X is not None else m.fittedvalues.index
            per_balance.append(p)

        # Join all balances at once; `DataFrame.append` was removed in
        # pandas 2.0 and re-copied the frame on every iteration.
        if per_balance:
            prediction = pd.concat(per_balance, axis=1)
        else:
            prediction = pd.DataFrame()

        if tree is None:
            return prediction

        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values, basis=basis)
        return pd.DataFrame(proj_prediction,
                            columns=[n.name for n in tree.tips()],
                            index=prediction.index)
Beispiel #29
0
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of residuals where rows are samples,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        # Gather one residual series per fitted model and join them in a
        # single call.  `DataFrame.append` was removed in pandas 2.0 and
        # copied the whole frame on every iteration.
        per_balance = [r.resid for r in self.results]
        if per_balance:
            balance_names = [r.model.endog_names for r in self.results]
            # rows are samples, columns are balances
            resid = pd.concat(per_balance, axis=1, keys=balance_names)
        else:
            resid = pd.DataFrame()

        if tree is None:
            return resid

        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values, basis=basis)
        return pd.DataFrame(proj_resid,
                            index=resid.index,
                            columns=[n.name for n in tree.tips()])
Beispiel #30
0
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Accepted for interface compatibility; not used by this
            implementation.

        Returns
        -------
        pd.DataFrame
            A table of predicted values where rows are samples (the rows
            of `X`), and the columns are balances. If `tree` is specified,
            then the columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        """
        if not self._fitted:
            # The exception used to be constructed without `raise`, so the
            # guard silently fell through to the code below.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        if X is None:
            X = self.design_matrices

        prediction = X.dot(self._beta)
        if tree is None:
            return prediction

        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_prediction,
                            columns=ids,
                            index=prediction.index)
Beispiel #31
0
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Accepted for interface compatibility; not used by this
            implementation.

        Returns
        -------
        pd.DataFrame
            A table of predicted values where rows are samples (the rows
            of `X`), and the columns are balances. If `tree` is specified,
            then the columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        """
        if not self._fitted:
            # Without `raise` the exception object was simply discarded and
            # an unfitted model carried on into the matrix product below.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')

        covariates = self.design_matrices if X is None else X
        prediction = covariates.dot(self._beta)
        if tree is None:
            return prediction

        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_prediction,
                            columns=ids,
                            index=prediction.index)
    def test_ilr_inv(self):
        """Exercise ``ilr_inv``: round trip, default basis, bad basis and
        absence of in-place modification of its input."""
        composition = closure(self.cdata7)
        round_trip = ilr_inv(ilr(composition))
        npt.assert_array_almost_equal(round_trip, composition)

        # With no explicit basis, the identity matrix should map onto the
        # reference fixture `ortho1`.
        from_identity = ilr_inv(np.identity(3))
        npt.assert_allclose(from_identity, self.ortho1,
                            rtol=1e-04, atol=1e-06)

        # Passing the data itself as the basis is expected to be rejected.
        self.assertRaises(ValueError, ilr_inv, self.cdata1,
                          basis=self.cdata1)

        # make sure that inplace modification is not occurring
        untouched = np.array([[2, 2, 6], [4, 4, 2]])
        ilr_inv(self.cdata1)
        npt.assert_allclose(self.cdata1, untouched)
    def test_ilr_inv(self):
        """Check ``ilr_inv`` inverts ``ilr``, matches the reference basis,
        rejects an invalid basis and leaves its input untouched."""
        closed = closure(self.cdata7)
        npt.assert_array_almost_equal(ilr_inv(ilr(closed)), closed)

        identity_image = ilr_inv(np.identity(3))
        npt.assert_allclose(identity_image,
                            self.ortho1,
                            rtol=1e-04,
                            atol=1e-06)

        # Using the data as its own basis is expected to raise.
        with self.assertRaises(ValueError):
            ilr_inv(self.cdata1, basis=self.cdata1)

        # make sure that inplace modification is not occurring
        expected_cdata1 = np.array([[2, 2, 6], [4, 4, 2]])
        ilr_inv(self.cdata1)
        npt.assert_allclose(self.cdata1, expected_cdata1)
Beispiel #34
0
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            The stored residual table. Without `tree` it is returned as-is
            (balances); with `tree` the columns are proportions (one per
            tip of `tree`) and the index is that of the stored residuals.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        if not self._fitted:
            # The exception used to be constructed without `raise`, so the
            # guard never fired.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        resid = self._resid
        if tree is None:
            return resid

        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_resid,
                            columns=ids,
                            index=resid.index)
Beispiel #35
0
    def setUp(self):
        """Build a seeded synthetic dataset and create the output directory.

        Populates ``self.tree``, ``self.table``, ``self.metadata`` and
        ``self.results``.  The statements below draw from the seeded global
        NumPy RNG, so their order determines the generated values.
        """
        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # Group labels 0 .. n/16 - 1, each repeated for 16 consecutive rows.
        groups = np.kron(np.arange(n // 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        # One pair of slopes per group, expanded to every row of the group
        # and multiplied element-wise by the random-effects covariates.
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        # (one draw shared by each run of 4 consecutive rows)
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        # (one draw shared by each run of 2 consecutive rows)
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y1"] = endog
        # y2 is y1 shifted by a constant 4.
        df["y2"] = endog + 2 * 2
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        # The two response columns are passed through ilr_inv and the result
        # is labelled with three feature columns a, b, c.
        iv = ilr_inv(df[["y1", "y2"]].values)
        self.table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
        self.metadata = df[['x1', 'x2', 'groups']]

        # Relative path, so the directory is created in the current working
        # directory; os.mkdir raises if it already exists.
        # NOTE(review): presumably removed again in tearDown - confirm.
        self.results = "results"
        os.mkdir(self.results)
Beispiel #36
0
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            The stored residual table. Without `tree` it is returned as-is
            (balances); with `tree` the columns are proportions (one per
            tip of `tree`) and the index is that of the stored residuals.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        if not self._fitted:
            # Without `raise` the exception object was discarded and the
            # method continued with an unfitted model.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')

        resid = self._resid
        if tree is None:
            return resid

        basis, _ = balance_basis(tree)
        tip_names = [n.name for n in tree.tips()]
        proj_resid = ilr_inv(resid.values, basis=basis)
        return pd.DataFrame(proj_resid, columns=tip_names, index=resid.index)
Beispiel #37
0
    def test_ols_empty_metadata_error(self):
        """``ols`` must raise ValueError when the table samples (k1..k6)
        and the metadata samples (s1..s5) do not share any identifiers."""
        # Second balance coordinate for samples k1 .. k6; the first one is
        # always 1.
        second_coords = [1., 2., 3., 4., 5., 5.]
        columns = {}
        for position, coord in enumerate(second_coords, start=1):
            columns['k%d' % position] = ilr_inv(np.array([1., coord]))
        table = pd.DataFrame(columns, index=['a', 'b', 'c']).T

        tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
        metadata = pd.DataFrame(
            {'lame': [1, 1, 1, 1, 1],
             'real': [1, 2, 3, 4, 5]},
            index=['s1', 's2', 's3', 's4', 's5'])

        self.assertRaises(ValueError, ols,
                          'real + lame', table, metadata, tree)
Beispiel #38
0
 def test_regression_results_residuals_projection(self):
     """Projected residuals must equal the ilr-inverse of the expected
     per-sample balance residuals."""
     feature_names = ['Z1', 'Z2', 'Z3']
     # Expected residuals in balance coordinates, one pair per sample.
     balance_resid = [
         ('s1', [-0.986842, -0.236842]),
         ('s2', [-0.065789, -1.815789]),
         ('s3', [1.473684, 0.473684]),
         ('s4', [1.394737, -1.105263]),
         ('s5', [-1.065789, 1.184211]),
         ('s6', [-1.144737, -0.394737]),
         ('s7', [0.394737, 1.894737]),
     ]
     projected = {}
     for sample, coords in balance_resid:
         projected[sample] = ilr_inv(np.array(coords))
     exp_resid = pd.DataFrame(projected, index=['Z1', 'Z2', 'Z3']).T

     res = RegressionResults(self.results,
                             basis=_gram_schmidt_basis(3),
                             feature_names=feature_names)
     observed = res.residuals(project=True)
     # NOTE(review): `check_less_precise` was deprecated in pandas 1.1 in
     # favour of `rtol`/`atol`; kept here to preserve the tolerance.
     pdt.assert_frame_equal(observed, exp_resid,
                            check_exact=False,
                            check_less_precise=True)
Beispiel #39
0
    def test_regression_results_predict_projection(self):
        """Projected predictions must equal the ilr-inverse of the expected
        per-sample balance predictions."""
        feature_names = ['Z1', 'Z2', 'Z3']
        model = RegressionResults(self.results,
                                  basis=_gram_schmidt_basis(3),
                                  feature_names=feature_names)
        res_predict = model.predict(self.data[['X']], project=True)

        # Expected predictions in balance coordinates, one pair per sample.
        balance_predictions = [
            ('s1', [1.986842, 1.236842]),
            ('s2', [3.065789, 3.815789]),
            ('s3', [2.526316, 2.526316]),
            ('s4', [3.605263, 5.105263]),
            ('s5', [3.065789, 3.815789]),
            ('s6', [4.144737, 6.394737]),
            ('s7', [3.605263, 5.105263]),
        ]
        projected = {}
        for sample, coords in balance_predictions:
            projected[sample] = ilr_inv(np.array(coords))
        exp_predict = pd.DataFrame(projected, index=feature_names).T

        pdt.assert_frame_equal(res_predict, exp_predict)