def test_type_cast_to_float(self):
    # Columns: 'a' int, 'b' numeric strings, 'c' non-numeric strings,
    # 'd' already float.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                       'b': ['1', '2', '3', '4', '5'],
                       'c': ['a', 'b', 'c', 'd', 'e'],
                       'd': [1., 2., 3., 4., 5.]})
    observed = _type_cast_to_float(df)
    # Ints and numeric strings become float; 'c' is left untouched.
    expected = pd.DataFrame({'a': [1., 2., 3., 4., 5.],
                             'b': [1., 2., 3., 4., 5.],
                             'c': ['a', 'b', 'c', 'd', 'e'],
                             'd': [1., 2., 3., 4., 5.]})
    pdt.assert_frame_equal(observed, expected)
def test_type_cast_to_float(self):
    input_df = pd.DataFrame({
        'a': [1, 2, 3, 4, 5],
        'b': ['1', '2', '3', '4', '5'],
        'c': ['a', 'b', 'c', 'd', 'e'],
        'd': [1., 2., 3., 4., 5.],
    })
    expected = pd.DataFrame({
        'a': [1., 2., 3., 4., 5.],
        'b': [1., 2., 3., 4., 5.],
        'c': ['a', 'b', 'c', 'd', 'e'],
        'd': [1., 2., 3., 4., 5.],
    })
    # Numeric-like columns are cast to float; the non-numeric string
    # column 'c' must pass through unchanged.
    pdt.assert_frame_equal(_type_cast_to_float(input_df), expected)
def mixedlm(formula, table, metadata, groups, **kwargs):
    """ Linear Mixed Effects Models applied to balances.

    Linear mixed effects (LME) models is a method for estimating
    parameters in a linear regression model with mixed effects.
    LME models are commonly used for repeated measures, where multiple
    samples are collected from a single source.  This implementation is
    focused on performing a multivariate response regression with mixed
    effects where the response is a matrix of balances (`table`), the
    covariates (`metadata`) are made up of external variables and the
    samples sources are specified by `groups`.

    T-statistics (`tvalues`) and p-values (`pvalues`) can be obtained to
    investigate to evaluate statistical significance for a covariate for a
    given balance.  Predictions on the resulting model can be made using
    (`predict`), and these results can be interpreted as either balances
    or proportions.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R.
        Note that the dependent variable in this string should not be
        specified, since this method will be run on each of the individual
        balances.  See `patsy` [1]_ for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples
        contained in the `table` object.  Samples correspond to rows
        and covariates correspond to columns.
    groups : str
        Column name in `metadata` that specifies the groups.  These groups
        are often associated with individuals repeatedly sampled, typically
        longitudinally.
    **kwargs : dict
        Other arguments accepted into
        `statsmodels.regression.linear_model.MixedLM`

    Returns
    -------
    LMEModel
        Container object that holds information about the overall fit.
        This includes information about coefficients, pvalues and
        residuals from the resulting regression.

    References
    ----------
    .. [1] https://patsy.readthedocs.io/en/latest/

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from gneiss.regression import mixedlm

    Here, we will define a table of balances with features `Y1`, `Y2`
    across 12 samples.

    >>> table = pd.DataFrame({
    ...     'u1': [ 1.00000053,  6.09924644],
    ...     'u2': [ 0.99999843,  7.0000045 ],
    ...     'u3': [ 1.09999884,  8.08474053],
    ...     'x1': [ 1.09999758,  1.10000349],
    ...     'x2': [ 0.99999902,  2.00000027],
    ...     'x3': [ 1.09999862,  2.99998318],
    ...     'y1': [ 1.00000084,  2.10001257],
    ...     'y2': [ 0.9999991 ,  3.09998418],
    ...     'y3': [ 0.99999899,  3.9999742 ],
    ...     'z1': [ 1.10000124,  5.0001796 ],
    ...     'z2': [ 1.00000053,  6.09924644],
    ...     'z3': [ 1.10000173,  6.99693644]},
    ...     index=['Y1', 'Y2']).T

    Now we are going to define some of the external variables to
    test for in the model.  Here we will be testing a hypothetical
    longitudinal study across 3 time points, with 4 patients
    `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
    and `z` and `u` were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...     'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...     'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...     'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...           'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Now we can run the linear mixed effects model on the balances.
    Underneath the hood, the proportions will be transformed into balances,
    so that the linear mixed effects models can be run directly on balances.
    Since each patient was sampled repeatedly, we'll specify them separately
    in the groups.  In the linear mixed effects model `time` and `treatment`
    will be simultaneously tested for with respect to the balances.

    >>> res = mixedlm('time + treatment', table, metadata,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.linear_model.MixedLM
    ols
    """
    # Coerce numeric-looking metadata columns to float so patsy/statsmodels
    # treat them as continuous covariates.
    metadata = _type_cast_to_float(metadata.copy())
    data = pd.merge(table, metadata, left_index=True, right_index=True)
    if len(data) == 0:
        raise ValueError("No more samples left. Check to make sure that "
                         "the sample names between `metadata` and `table` "
                         "are consistent")

    # Fit one univariate mixed-effects model per balance; a multivariate
    # response is not supported directly by statsmodels MixedLM.
    submodels = []
    for b in table.columns:
        # mixed effects code is obtained here:
        # http://stackoverflow.com/a/22439820/1167475
        stats_formula = '%s ~ %s' % (b, formula)
        mdf = smf.mixedlm(stats_formula, data=data,
                          groups=data[groups], **kwargs)
        submodels.append(mdf)

    # ugly hack to get around the statsmodels object
    model = LMEModel(Y=table, Xs=None)
    model.submodels = submodels
    model.balances = table
    return model
def ols(formula, table, metadata, tree, **kwargs):
    """ Ordinary Least Squares applied to balances.

    An ordinary least square regression is performed on nonzero relative
    abundance data given a list of covariates, or explanatory variables
    such as ph, treatment, etc to test for specific effects.  The relative
    abundance data is transformed into balances using the ILR
    transformation, using a tree to specify the groupings of the features.
    The regression is then performed on each balance separately.  Only
    positive data will be accepted, so if there are zeros present, consider
    using a zero imputation method such as ``multiplicative_replacement``
    or add a pseudocount.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels.  Note that the dependent variable in this string
        should not be specified, since this method will be run on each of
        the individual balances.  See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.  The features could either
        correspond proportions or balances.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples
        contained in the `table` object.  Samples correspond to rows
        and covariates correspond to columns.
    tree : skbio.TreeNode
        Tree object that defines the partitions of the features.  Each of
        the leaves correspond to the columns contained in the table.
    **kwargs : dict
        Other arguments accepted into
        `statsmodels.regression.linear_model.OLS`

    Returns
    -------
    OLSModel
        Container object that holds information about the overall fit.
        This includes information about coefficients, pvalues, residuals
        and coefficient of determination from the resulting regression.

    Examples
    --------
    >>> from gneiss.regression import ols
    >>> from skbio import TreeNode
    >>> import pandas as pd

    Here, we will define a table of proportions with 3 features
    `a`, `b`, and `c` across 5 samples.

    >>> proportions = pd.DataFrame(
    ...     [[0.720463, 0.175157, 0.104380],
    ...      [0.777794, 0.189095, 0.033111],
    ...      [0.796416, 0.193622, 0.009962],
    ...      [0.802058, 0.194994, 0.002948],
    ...      [0.803731, 0.195401, 0.000868]],
    ...     columns=['a', 'b', 'c'],
    ...     index=['s1', 's2', 's3', 's4', 's5'])

    Now we will define the environment variables that we want to
    regress against the proportions.

    >>> env_vars = pd.DataFrame({
    ...     'temp': [20, 20, 20, 20, 21],
    ...     'ph': [1, 2, 3, 4, 5]},
    ...     index=['s1', 's2', 's3', 's4', 's5'])

    Finally, we need to define a bifurcating tree used to convert the
    proportions to balances.  If the internal nodes aren't labels, a
    default labeling will be applied (i.e. `y1`, `y2`, ...)

    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])

    Once these 3 variables are defined, a regression can be performed.
    These proportions will be converted to balances according to the
    tree specified.  And the regression formula is specified to run
    `temp` and `ph` against the proportions in a single model.

    >>> res = ols('temp + ph', proportions, env_vars, tree)

    From the summary results of the `ols` function, we can view the
    pvalues according to how well each individual balance fitted in the
    regression model.

    >>> res.pvalues
           Intercept            ph      temp
    Y1  2.479592e-01  1.990984e-11  0.243161
    Y2  6.089193e-10  5.052733e-01  0.279805

    We can also view the balance coefficients estimated in the regression
    model.  These coefficients can also be viewed as proportions by passing
    `project=True` as an argument in `res.coefficients()`.

    >>> res.coefficients()
        Intercept            ph      temp
    Y1  -0.000499  9.999911e-01  0.000026
    Y2   1.000035  2.865312e-07 -0.000002

    The balance residuals from the model can be viewed as follows.  Again,
    these residuals can be viewed as proportions by passing `project=True`
    into `res.residuals()`

    >>> res.residuals()
                  Y1            Y2
    s1 -4.121647e-06 -2.998793e-07
    s2  6.226749e-07 -1.602904e-09
    s3  1.111959e-05  9.028437e-07
    s4 -7.620619e-06 -6.013615e-07
    s5 -1.332268e-14 -2.375877e-14

    The predicted balances can be obtained as follows.  Note that the
    predicted proportions can also be obtained by passing `project=True`
    into `res.predict()`

    >>> res.predict()
              Y1        Y2
    s1  1.000009  0.999999
    s2  2.000000  0.999999
    s3  2.999991  0.999999
    s4  3.999982  1.000000
    s5  4.999999  0.999998

    The overall model fit can be obtained as follows

    >>> res.r2
    0.99999999997996369

    See Also
    --------
    statsmodels.regression.linear_model.OLS
    skbio.stats.composition.multiplicative_replacement
    """
    # TODO: clean up
    # Drop any samples/features/tips not shared by all three inputs.
    table, metadata, tree = _intersect_of_table_metadata_tree(table,
                                                              metadata,
                                                              tree)
    ilr_table, basis = _to_balances(table, tree)
    ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0)
    # one-time creation of exogenous data matrix allows for faster run-time
    metadata = _type_cast_to_float(metadata)
    x = dmatrix(formula, metadata, return_type='dataframe')
    submodels = _fit_ols(ilr_table, x)

    basis = pd.DataFrame(basis, index=ilr_table.columns,
                         columns=table.columns)
    return OLSModel(submodels, basis=basis,
                    balances=ilr_table,
                    tree=tree)
def ols(formula, table, metadata):
    """ Ordinary Least Squares applied to balances.

    An ordinary least squares (OLS) regression is a method for estimating
    parameters in a linear regression model.  OLS is a common statistical
    technique for fitting and testing the effects of covariates on a
    response.  This implementation is focused on performing a multivariate
    response regression where the response is a matrix of balances (`table`)
    and the covariates (`metadata`) are made up of external variables.

    Global statistical tests indicating goodness of fit and contributions
    from covariates can be accessed from a coefficient of determination
    (`r2`), leave-one-variable-out cross validation (`lovo`), leave-one-out
    cross validation (`loo`) and k-fold cross validation (`kfold`).
    In addition residuals (`residuals`) can be accessed for diagnostic
    purposes.

    T-statistics (`tvalues`) and p-values (`pvalues`) can be obtained to
    investigate to evaluate statistical significance for a covariate for a
    given balance.  Predictions on the resulting model can be made using
    (`predict`), and these results can be interpreted as either balances
    or proportions.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels.  Note that the dependent variable in this string
        should not be specified, since this method will be run on each of
        the individual balances.  See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    metadata: pd.DataFrame
        Metadata table that contains information about the samples
        contained in the `table` object.  Samples correspond to rows
        and covariates correspond to columns.

    Returns
    -------
    OLSModel
        Container object that holds information about the overall fit.
        This includes information about coefficients, pvalues, residuals
        and coefficient of determination from the resulting regression.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from gneiss.regression import ols

    Here, we will define a table of balances as follows

    >>> np.random.seed(0)
    >>> n = 100
    >>> g1 = np.linspace(0, 15, n)
    >>> y1 = g1 + 5
    >>> y2 = -g1 - 2
    >>> Y = pd.DataFrame({'y1': y1, 'y2': y2})

    Once we have the balances defined, we will add some errors

    >>> e = np.random.normal(loc=1, scale=0.1, size=(n, 2))
    >>> Y = Y + e

    Now we will define the environment variables that we want to
    regress against the balances.

    >>> X = pd.DataFrame({'g1': g1})

    Once these variables are defined, a regression can be performed.
    Here the regression formula is specified to run `g1` against the
    balances in a single model.

    >>> res = ols('g1', Y, X)
    >>> res.fit()

    From the summary results of the `ols` function, we can view the
    pvalues according to how well each individual balance fitted in the
    regression model.

    >>> res.pvalues
                          y1            y2
    Intercept  8.826379e-148  7.842085e-71
    g1         1.923597e-163  1.277152e-163

    We can also view the balance coefficients estimated in the regression
    model.  These coefficients can also be viewed as proportions by passing
    `project=True` as an argument in `res.coefficients()`.

    >>> res.coefficients()
                     y1        y2
    Intercept  6.016459 -0.983476
    g1         0.997793 -1.000299

    The overall model fit can be obtained as follows

    >>> res.r2
    0.99945903186495066
    """
    # one-time creation of exogenous data matrix allows for faster run-time
    metadata = _type_cast_to_float(metadata.copy())
    x = dmatrix(formula, metadata, return_type='dataframe')
    # Keep only samples present in both the balances and the design matrix.
    ilr_table, x = table.align(x, join='inner', axis=0)
    return OLSModel(Y=ilr_table, Xs=x)