def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Perform an isometric log-ratio (ilr) transformation on a feature table.

    The table and tree are first reconciled with `match_tips`; a balance
    basis is then built from the matched tree and used to convert the
    matched table into balances (log-contrasts between groups of features).
    Zeros must be removed from the table beforehand (e.g. by adding a
    pseudocount).

    Parameters
    ----------
    table : pd.DataFrame
        Feature table where rows correspond to samples and columns are
        features. The values within the table must be positive and nonzero.
    tree : skbio.TreeNode
        A tree relating all of the features to balances or log-contrasts
        (hierarchy). This tree must be bifurcating (i.e. every internal node
        has exactly 2 children).
        NOTE(review): earlier documentation states that the internal nodes
        of the tree are renamed; confirm against `match_tips` and
        `balance_basis`.

    Returns
    -------
    pd.DataFrame
        Balances calculated from the feature table. Columns are the names
        of the internal nodes of the matched tree in level order, and the
        index is `table.index`.
    """
    _table, _tree = match_tips(table, tree)
    # Only the basis is needed here; the second return value is discarded.
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    # Node names are read only after the basis has been built, so any
    # naming performed by the helpers above is reflected in the columns.
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances, columns=in_nodes, index=table.index)
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Perform an isometric log-ratio (ilr) transformation on a feature table.

    The table and tree are first reconciled with `match_tips`; a balance
    basis is then built from the matched tree and used to convert the
    matched table into balances (log-contrasts between groups of features).
    Zeros must be removed from the table beforehand (e.g. by adding a
    pseudocount).

    Parameters
    ----------
    table : pd.DataFrame
        Feature table where rows correspond to samples and columns are
        features. The values within the table must be positive and nonzero.
    tree : skbio.TreeNode
        A tree relating all of the features to balances or log-contrasts
        (hierarchy). This tree must be bifurcating (i.e. every internal node
        has exactly 2 children).
        NOTE(review): earlier documentation states that the internal nodes
        of the tree are renamed; confirm against `match_tips` and
        `balance_basis`.

    Returns
    -------
    pd.DataFrame
        Balances calculated from the feature table. Columns are the names
        of the internal nodes of the matched tree in level order, and the
        index is `table.index`.
    """
    _table, _tree = match_tips(table, tree)
    # Only the basis is needed here; the second return value is discarded.
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    # Node names are read only after the basis has been built, so any
    # naming performed by the helpers above is reflected in the columns.
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances, columns=in_nodes, index=table.index)
def test_regression_results_residuals_projection(self):
    """Residuals projected through a tree should match the inverse-ilr
    image of the expected balance residuals."""
    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
    tree = TreeNode.read([r'(c, (a, b)Y2)Y1;'])
    basis, _ = balance_basis(tree)

    # Expected residuals in balance coordinates: one row per sample,
    # one column per balance.
    balance_resid = pd.DataFrame(
        [[-0.986842, -0.236842],
         [-0.065789, -1.815789],
         [1.473684, 0.473684],
         [1.394737, -1.105263],
         [-1.065789, 1.184211],
         [-1.144737, -0.394737],
         [0.394737, 1.894737]],
        index=sample_ids,
        columns=['Y1', 'Y2'])
    # Map the expected balances back onto the tips of the tree.
    exp_resid = pd.DataFrame(ilr_inv(balance_resid, basis),
                             index=sample_ids,
                             columns=['c', 'a', 'b'])

    res = submock(Y=self.balances, Xs=None)
    submock.submodels = [self.model1, self.model2]
    res.fit()
    res_resid = res.residuals(tree).sort_index()

    pdt.assert_frame_equal(res_resid, exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def coefficients(self, tree=None):
    """Return the coefficients from the fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the coefficients are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the stored coefficients are returned
        unchanged. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of coefficients where rows are covariates, and the columns
        are balances. If `tree` is specified, then the columns are
        proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    coef = self._beta
    if tree is None:
        return coef

    basis, _ = balance_basis(tree)
    c = ilr_inv(coef.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(c, columns=ids, index=coef.index)
def coefficients(self, tree=None):
    """Return the coefficients from the fit.

    The parameters of every entry in `self.results` are gathered into a
    single table.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the coefficients are passed through `ilr_inv` with the
        basis built from this tree. Otherwise the coefficients are returned
        as balances. (default: None).

    Returns
    -------
    pd.DataFrame
        Without `tree`: rows are covariates and columns are balances
        (named after each result's `model.endog_names`).
        With `tree`: rows are the tip names of `tree` and columns are
        covariates.
    """
    # Collect one named row per fitted result and build the frame once.
    # Growing a DataFrame with repeated `append` calls is quadratic and the
    # method no longer exists in pandas >= 2.0. Copies are used so that the
    # stored fit results are not renamed as a side effect.
    rows = []
    for r in self.results:
        c = r.params.copy()
        c.name = r.model.endog_names
        rows.append(c)
    coef = pd.DataFrame(rows)

    if tree is None:
        return coef.T

    basis, _ = balance_basis(tree)
    c = ilr_inv(coef.values.T, basis=basis).T
    return pd.DataFrame(c, index=[n.name for n in tree.tips()],
                        columns=coef.columns)
def coefficients(self, tree=None):
    """Return the coefficients from the fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the coefficients are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the stored coefficients are returned
        unchanged. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of coefficients where rows are covariates, and the columns
        are balances. If `tree` is specified, then the columns are
        proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    coef = self._beta
    if tree is None:
        return coef

    basis, _ = balance_basis(tree)
    c = ilr_inv(coef.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(c, columns=ids, index=coef.index)
def _to_balances(table, tree):
    """Convert a table of abundances into balances using a tree.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns
        contained in the table.

    Returns
    -------
    pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    np.array
        Orthonormal basis in the Aitchison simplex generated from `tree`.
    """
    # Internal node names are captured before the basis is built, in
    # level order; they become the balance column labels.
    internal_names = [node.name
                      for node in tree.levelorder()
                      if not node.is_tip()]
    basis, _ = balance_basis(tree)
    coords = ilr(table.values, basis=basis)
    balances = pd.DataFrame(coords,
                            columns=internal_names,
                            index=table.index)
    return balances, basis
def test_balance_basis_large1(self):
    """The basis computed for the large reference tree should match the
    stored basis file once its columns are reversed."""
    tree_path = get_data_path('large_tree.nwk', subfolder='data')
    tree = TreeNode.read(tree_path)
    basis_path = get_data_path('large_tree_basis.txt', subfolder='data')
    expected = np.loadtxt(basis_path)
    observed, _ = balance_basis(tree)
    # note that the stored basis is in reverse level order
    npt.assert_allclose(expected[:, ::-1], observed)
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Convert a feature table into balances defined by a tree.

    The table and tree are reconciled with `match_tips`, a basis is built
    from the matched tree with `balance_basis`, and the matched table
    values are transformed with `ilr`.

    Returns
    -------
    pd.DataFrame
        Balances labelled by the matched tree's internal node names
        (level order) as columns and by `table.index` as rows.
    """
    matched_table, matched_tree = match_tips(table, tree)
    basis, _ = balance_basis(matched_tree)
    coords = ilr(matched_table.values, basis)
    # Names are read after the basis has been built.
    balance_names = [
        node.name
        for node in matched_tree.levelorder()
        if not node.is_tip()
    ]
    return pd.DataFrame(coords, columns=balance_names, index=table.index)
def test_balance_basis_base_case(self):
    """A two-tip tree should yield a single balance with the expected
    basis values and key."""
    root = TreeNode.read([u"(a,b);"])
    # The expected key is captured before the basis is computed.
    expected_keys = [root.name]
    expected_basis = np.array([0.19557032, 0.80442968])

    observed_basis, observed_keys = balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, observed_keys)
def test_balance_basis_base_case(self):
    """A two-tip tree should yield a single balance with the expected
    basis values and key."""
    root = TreeNode.read([u"(a,b);"])
    # The expected key is captured before the basis is computed.
    expected_keys = [root.name]
    expected_basis = np.array([0.19557032, 0.80442968])

    observed_basis, observed_keys = balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, observed_keys)
def test_balance_basis_large1(self):
    """The basis computed for the large reference tree should match the
    stored basis file once its columns are reversed."""
    tree_path = get_data_path('large_tree.nwk', subfolder='data')
    tree = TreeNode.read(tree_path)
    basis_path = get_data_path('large_tree_basis.txt', subfolder='data')
    expected = np.loadtxt(basis_path)
    observed, _ = balance_basis(tree)
    # note that the stored basis is in reverse level order
    npt.assert_allclose(expected[:, ::-1], observed)
def test_balance_basis_unbalanced(self):
    """An unbalanced three-tip tree should yield the expected two-row
    basis and the root/first-child keys."""
    root = TreeNode.read([u"((a,b)c, d);"])
    # Expected keys are captured before the basis is computed.
    expected_keys = [root.name, root[0].name]
    expected_basis = np.array([
        [0.18507216, 0.18507216, 0.62985567],
        [0.14002925, 0.57597535, 0.28399541],
    ])

    observed_basis, observed_keys = balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, list(observed_keys))
def test_balance_basis_unbalanced(self):
    """An unbalanced three-tip tree should yield the expected two-row
    basis and the root/first-child keys."""
    root = TreeNode.read([u"((a,b)c, d);"])
    # Expected keys are captured before the basis is computed.
    expected_keys = [root.name, root[0].name]
    expected_basis = np.array([
        [0.18507216, 0.18507216, 0.62985567],
        [0.14002925, 0.57597535, 0.28399541],
    ])

    observed_basis, observed_keys = balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, list(observed_keys))
def setUp(self):
    """Build the fixtures shared by the tests in this class.

    Creates the ``results`` output directory if it does not exist, then
    sets up a five-tip tree, a taxonomy table, a balance table with its
    inverse-ilr feature table, and several metadata columns indexed by
    the seven sample ids.
    """
    self.results = "results"
    if not os.path.exists(self.results):
        os.mkdir(self.results)

    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']

    self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])
    self.taxonomy = pd.DataFrame(
        [['foo;barf;a;b;c;d;e', 1],
         ['foo;bark;f;g;h;i;j', 1],
         ['foo;bark;f;g;h;w;j', 1],
         ['nom;tu;k;l;m;n;o', 0.9],
         ['nom;tu;k;l;m;t;o', 0.9]],
        columns=['Taxon', 'Confidence'],
        index=['x', 'y', 'z', 'k', 'q'])

    # Balances are written one row per balance and then transposed so
    # that samples become rows. An earlier, smaller `self.balances` table
    # used to be assigned before this one and was overwritten without
    # ever being read; that dead assignment has been dropped.
    self.balances = pd.DataFrame(
        [[1, 2, 3, 4, 5, 6, 7],
         [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
         [1, 1, 1, 1, 1, 1, 1],
         [3, 2, 1, 0, -1, -2, -3]],
        index=['d', 'a', 'b', 'c'],
        columns=sample_ids).T

    # Feature table obtained by inverting the balances through the tree.
    basis, _ = balance_basis(self.tree)
    self.table = pd.DataFrame(
        ilr_inv(self.balances, basis),
        columns=['x', 'y', 'z', 'k', 'q'],
        index=sample_ids)

    index = pd.Index(sample_ids, name='id')
    self.categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                  index=index, name='categorical'))
    self.multi_categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                  index=index, name='multi_categorical'))
    self.partial_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                  index=index, name='multi_categorical'))
    self.full_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                  index=index, name='numerical_categorical'))
    self.continuous = NumericMetadataColumn(
        pd.Series(np.arange(7), index=index, name='continuous'))
def setUp(self):
    """Build the regression fixtures: a small data table, two OLS models
    (one per balance), a three-tip tree with its basis, and the balance
    table."""
    self.pickle_fname = "test.pickle"

    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
    observations = [[1, 1, 1],
                    [3, 2, 3],
                    [4, 3, 2],
                    [5, 4, 4],
                    [2, 5, 3],
                    [3, 6, 5],
                    [4, 7, 4]]
    self.data = pd.DataFrame(observations,
                             index=sample_ids,
                             columns=['Y1', 'Y2', 'X'])

    # One model per response balance, both regressed on X.
    self.model1 = smf.ols(formula="Y1 ~ X", data=self.data)
    self.model2 = smf.ols(formula="Y2 ~ X", data=self.data)

    self.tree = TreeNode.read(['((a,b)Y1, c)Y2;'])
    raw_basis = balance_basis(self.tree)[0]
    self.basis = pd.DataFrame(clr_inv(raw_basis),
                              columns=['a', 'b', 'c'],
                              index=['Y1', 'Y2'])

    response_columns = self.data[['Y1', 'Y2']]
    self.balances = pd.DataFrame(response_columns,
                                 index=self.data.index,
                                 columns=['Y1', 'Y2'])
def predict(self, X=None, tree=None, **kwargs):
    """Perform a prediction based on the model.

    Every entry in `self.results` is asked for a prediction and the
    predictions are gathered into a single table.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples. If not specified, the predictions are labelled
        with the index of each result's fitted values.
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the predictions are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the predictions are returned as
        balances. (default: None).
    **kwargs : dict
        Other arguments to be passed into each result's `predict`.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are balances (named after each result's
        `model.endog_names`). If `tree` is specified, then the columns
        are proportions.
    """
    # Collect one named row per fitted result and build the frame once.
    # Growing a DataFrame with repeated `append` calls is quadratic and the
    # method no longer exists in pandas >= 2.0.
    rows = []
    for m in self.results:
        p = pd.Series(m.predict(X, **kwargs))
        p.name = m.model.endog_names
        # Label predictions by sample: use X's index when covariates were
        # supplied, otherwise fall back to the training samples.
        if X is not None:
            p.index = X.index
        else:
            p.index = m.fittedvalues.index
        rows.append(p)
    prediction = pd.DataFrame(rows)

    if tree is None:
        return prediction.T

    basis, _ = balance_basis(tree)
    proj_prediction = ilr_inv(prediction.values.T, basis=basis)
    return pd.DataFrame(proj_prediction,
                        columns=[n.name for n in tree.tips()],
                        index=prediction.columns)
def residuals(self, tree=None):
    """Return the residuals calculated from the fit.

    The residuals of every entry in `self.results` are gathered into a
    single table.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the residuals are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the residuals are returned as
        balances. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of residuals where rows are samples, and the columns are
        balances (named after each result's `model.endog_names`). If
        `tree` is specified, then the columns are proportions.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    # Collect one named row per fitted result and build the frame once.
    # Growing a DataFrame with repeated `append` calls is quadratic and the
    # method no longer exists in pandas >= 2.0. Copies are used so that the
    # stored fit results are not renamed as a side effect.
    rows = []
    for r in self.results:
        err = r.resid.copy()
        err.name = r.model.endog_names
        rows.append(err)
    resid = pd.DataFrame(rows)

    if tree is None:
        return resid.T

    basis, _ = balance_basis(tree)
    # Samples as rows going into ilr_inv, so the result is already
    # samples x tips and needs no further transposition.
    proj_resid = ilr_inv(resid.values.T, basis=basis)
    return pd.DataFrame(proj_resid,
                        index=resid.columns,
                        columns=[n.name for n in tree.tips()])
def test_ols_ilr_inv_test(self):
    """Coefficients, residuals and predictions projected through the tree
    should equal the inverse-ilr image of the per-balance reference
    results; p-values should match directly."""
    tip_names = ['c', 'b', 'a']
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    basis, _ = balance_basis(self.tree)

    def as_proportions(frame, row_labels):
        # Map balance-space values onto the tips of the tree.
        return pd.DataFrame(ilr_inv(frame, basis),
                            index=row_labels,
                            columns=tip_names)

    # test pvalues
    expected = pd.DataFrame({'y1': self.r1_.pvalues,
                             'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, expected)

    # test coefficients
    params = pd.DataFrame({'y1': self.r1_.params,
                           'y2': self.r2_.params})
    expected = as_proportions(params, self.X.columns)
    observed = model.coefficients(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)

    # test residuals
    resid = pd.DataFrame({'y1': self.r1_.resid,
                          'y2': self.r2_.resid},
                         index=self.Y.index)
    expected = as_proportions(resid, self.Y.index)
    observed = model.residuals(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)

    # test prediction
    fitted = pd.DataFrame({'y1': self.r1_.predict(),
                           'y2': self.r2_.predict()},
                          index=self.Y.index)
    expected = as_proportions(fitted, self.Y.index)
    observed = model.predict(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)
def predict(self, X=None, tree=None, **kwargs):
    """Perform a prediction based on the fitted model.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples. If not specified, `self.design_matrices` is
        used, so the fitted values for the training data are returned.
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the prediction is passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the prediction is returned as
        balances. (default: None).
    **kwargs : dict
        Accepted for interface compatibility; not used by this
        implementation.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are balances. If `tree` is specified, then the columns
        are proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    if X is None:
        X = self.design_matrices

    prediction = X.dot(self._beta)
    if tree is None:
        return prediction

    basis, _ = balance_basis(tree)
    proj_prediction = ilr_inv(prediction.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(proj_prediction, columns=ids,
                        index=prediction.index)
def predict(self, X=None, tree=None, **kwargs):
    """Perform a prediction based on the fitted model.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples. If not specified, `self.design_matrices` is
        used, so the fitted values for the training data are returned.
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the prediction is passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the prediction is returned as
        balances. (default: None).
    **kwargs : dict
        Accepted for interface compatibility; not used by this
        implementation.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are balances. If `tree` is specified, then the columns
        are proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    if X is None:
        X = self.design_matrices

    prediction = X.dot(self._beta)
    if tree is None:
        return prediction

    basis, _ = balance_basis(tree)
    proj_prediction = ilr_inv(prediction.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(proj_prediction, columns=ids,
                        index=prediction.index)
def residuals(self, tree=None):
    """Return the residuals calculated from the fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the residuals are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the stored residuals are returned
        unchanged. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of residuals where rows are samples, and the columns are
        balances. If `tree` is specified, then the columns are
        proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    resid = self._resid
    if tree is None:
        return resid

    basis, _ = balance_basis(tree)
    proj_resid = ilr_inv(resid.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(proj_resid, columns=ids, index=resid.index)
def test_ols_ilr_inv_test(self):
    """Coefficients, residuals and predictions projected through the tree
    should equal the inverse-ilr image of the per-balance reference
    results; p-values should match directly."""
    tip_names = ['c', 'b', 'a']
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    basis, _ = balance_basis(self.tree)

    def as_proportions(frame, row_labels):
        # Map balance-space values onto the tips of the tree.
        return pd.DataFrame(ilr_inv(frame, basis),
                            index=row_labels,
                            columns=tip_names)

    # test pvalues
    expected = pd.DataFrame({'y1': self.r1_.pvalues,
                             'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, expected)

    # test coefficients
    params = pd.DataFrame({'y1': self.r1_.params,
                           'y2': self.r2_.params})
    expected = as_proportions(params, self.X.columns)
    observed = model.coefficients(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)

    # test residuals
    resid = pd.DataFrame({'y1': self.r1_.resid,
                          'y2': self.r2_.resid},
                         index=self.Y.index)
    expected = as_proportions(resid, self.Y.index)
    observed = model.residuals(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)

    # test prediction
    fitted = pd.DataFrame({'y1': self.r1_.predict(),
                           'y2': self.r2_.predict()},
                          index=self.Y.index)
    expected = as_proportions(fitted, self.Y.index)
    observed = model.predict(tree=self.tree)
    pdt.assert_frame_equal(observed, expected)
def residuals(self, tree=None):
    """Return the residuals calculated from the fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation. If this is
        specified, the residuals are passed through `ilr_inv` with the
        basis built from this tree and the columns are labelled with the
        tree's tip names. Otherwise the stored residuals are returned
        unchanged. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of residuals where rows are samples, and the columns are
        balances. If `tree` is specified, then the columns are
        proportions.

    Raises
    ------
    ValueError
        If the model has not been fitted yet.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    if not self._fitted:
        # The exception must actually be raised; merely constructing it
        # would let execution continue with an unfitted model.
        raise ValueError('Model not fitted - coefficients not calculated. '
                         'See `fit()`')

    resid = self._resid
    if tree is None:
        return resid

    basis, _ = balance_basis(tree)
    proj_resid = ilr_inv(resid.values, basis=basis)
    ids = [n.name for n in tree.tips()]
    return pd.DataFrame(proj_resid, columns=ids, index=resid.index)