def test_varmat1(self):
    """A two-column table yields the known off-diagonal variation value.

    The expected off-diagonal entry is var(ln(x/y)) / 2 for the two
    columns, pre-computed as the literal below.
    """
    table = pd.DataFrame({'x': np.arange(1, 10),
                          'y': np.arange(2, 11)})
    observed = variation_matrix(table)
    half_variation = 0.032013010420979787 / 2
    expected = DistanceMatrix([[0, half_variation],
                               [half_variation, 0]],
                              ids=['x', 'y'])
    # String comparison mirrors the original assertion style used in
    # this suite (compares the rendered DistanceMatrix representations).
    self.assertEqual(str(observed), str(expected))
def test_varmat1(self):
    """Check variation_matrix against a hand-computed 2x2 expectation.

    NOTE(review): this method has the same name as an earlier
    ``test_varmat1`` definition in this file; the later definition
    shadows the earlier one, so only one of them actually runs.
    Consider renaming or removing one copy.
    """
    frame = pd.DataFrame({'x': np.arange(1, 10),
                          'y': np.arange(2, 11)})
    result = variation_matrix(frame)
    entry = 0.032013010420979787 / 2
    reference = DistanceMatrix(
        [[0, entry],
         [entry, 0]],
        ids=['x', 'y'])
    self.assertEqual(str(result), str(reference))
def test_varmat_larg(self):
    """variation_matrix on a 100x50 random positive table matches the fixture.

    Seeds the RNG so the multivariate-normal draw is reproducible, then
    compares the rendered result against the stored ``exp_varmat.txt``
    distance matrix.
    """
    np.random.seed(123)
    D = 50   # number of features (columns)
    N = 100  # number of samples (rows)
    mean = np.ones(D) * 10
    cov = np.eye(D)
    n__ = np.random.multivariate_normal(mean, cov, size=N)
    # np.abs keeps the table strictly positive, as required by the
    # log-ratio computation inside variation_matrix.
    # Fix: np.str was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin str is the documented drop-in replacement.
    X = pd.DataFrame(np.abs(n__), columns=np.arange(D).astype(str))
    res = variation_matrix(X)
    exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
    self.assertEqual(str(res), str(exp))
def test_varmat_larg(self):
    """Large-table regression test for variation_matrix.

    NOTE(review): this method has the same name as an earlier
    ``test_varmat_larg`` definition in this file; the later definition
    shadows the earlier one, so only one of them actually runs.
    Consider renaming or removing one copy.
    """
    np.random.seed(123)  # reproducible multivariate-normal draw
    D = 50   # number of features
    N = 100  # number of samples
    mean = np.ones(D) * 10
    cov = np.eye(D)
    # np.abs keeps all entries positive for the log-ratio computation.
    # Fix: np.str was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin str is the documented drop-in replacement.
    X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov, size=N)),
                     columns=np.arange(D).astype(str))
    res = variation_matrix(X)
    exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
    self.assertEqual(str(res), str(exp))
def proportional_linkage(X, method='ward'):
    r""" Principal Balance Analysis using Hierarchical
    Clustering based on proportionality.

    Builds a hierarchy from the pairwise proportionality of features.
    The proportionality between two features :math:`x` and :math:`y`
    is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    Small :math:`p(x, y)` means :math:`x` and :math:`y` are highly
    proportional.  Hierarchical clustering then uses this
    proportionality as its distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method. (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import proportional_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = proportional_linkage(table+0.1)
    """
    # Pairwise var(ln(x/y)) distances between feature columns.
    distances = variation_matrix(X)
    # Agglomerate on the condensed (upper-triangle) form.
    linkage_matrix = linkage(distances.condensed_form(), method=method)
    return TreeNode.from_linkage_matrix(linkage_matrix, X.columns)
def correlation_linkage(X, method='ward'):
    r""" Hierarchical Clustering based on proportionality.

    The hierarchy is built from the pairwise proportionality of
    features.  The correlation between two features :math:`x` and
    :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    Small :math:`p(x, y)` means :math:`x` and :math:`y` are highly
    correlated.  Hierarchical clustering then uses this correlation as
    its distance metric.  This can be useful for constructing
    principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method. (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Distance between feature columns: var(ln(x/y)) for each pair.
    pairwise = variation_matrix(X)
    # Cluster on the condensed distance vector, then convert the
    # linkage matrix into a TreeNode whose tips are the feature labels.
    tree = TreeNode.from_linkage_matrix(
        linkage(pairwise.condensed_form(), method=method), X.columns)
    # Give internal nodes stable names (y0, y1, ...).
    return rename_internal_nodes(tree)