def test_clr(self):
    """Check ``clr`` on the closed ``data1``/``data2``/``data5`` fixtures.

    The expected value is the log of each part divided by the geometric
    mean of its composition.  Also checks that ``bad1``/``bad2`` raise
    ``ValueError`` and that ``data2`` is not modified in place.
    """

    def expected_clr(composition):
        # log of every part relative to the composition's geometric mean
        return np.log(composition / np.exp(np.log(composition).mean()))

    first = np.array([.2, .2, .6])
    second = np.array([.4, .4, .2])

    npt.assert_allclose(clr(closure(self.data1)),
                        [expected_clr(first), expected_clr(second)])
    npt.assert_allclose(clr(closure(self.data2)), expected_clr(first))
    npt.assert_allclose(clr(closure(self.data5)),
                        [expected_clr(first), expected_clr(second)])

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            clr(invalid)

    # the caller's array must come back untouched
    clr(self.data2)
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))
def test_multiplicative_replacement(self):
    """Compare ``multiplicative_replacement`` output with reference values.

    Covers the ``data3``, ``data4`` and ``data6`` fixtures, the
    ``ValueError`` raised for ``bad1``/``bad2``, and verifies that
    ``data4`` is not modified in place.
    """
    reference = np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                          [0.092, 0.04, 0.04, 0.368, 0.46],
                          [0.066667, 0.133333, 0.2, 0.266667, 0.333333]])
    tolerance = {'rtol': 1e-5, 'atol': 1e-5}

    npt.assert_allclose(multiplicative_replacement(closure(self.data3)),
                        reference, **tolerance)
    # data4 is expected to match the first reference row
    npt.assert_allclose(multiplicative_replacement(closure(self.data4)),
                        reference[0], **tolerance)
    npt.assert_allclose(multiplicative_replacement(closure(self.data6)),
                        reference, **tolerance)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            multiplicative_replacement(invalid)

    # the input array must come back unchanged
    multiplicative_replacement(self.data4)
    npt.assert_allclose(self.data4, np.array([1, 2, 3, 0, 5]))
def test_multiplicative_replacement(self):
    """Verify ``multiplicative_replacement`` against tabulated results.

    Exercises the ``cdata3``, ``cdata4`` and ``cdata6`` fixtures, the
    ``ValueError`` path for ``bad1``/``bad2`` and the guarantee that
    ``cdata4`` is left unmodified.
    """
    expected_matrix = np.array(
        [[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
         [0.092, 0.04, 0.04, 0.368, 0.46],
         [0.066667, 0.133333, 0.2, 0.266667, 0.333333]])
    expected_vector = np.array(
        [0.087273, 0.174545, 0.261818, 0.04, 0.436364])

    def check(fixture, expected):
        replaced = multiplicative_replacement(closure(fixture))
        npt.assert_allclose(replaced, expected, rtol=1e-5, atol=1e-5)

    check(self.cdata3, expected_matrix)
    check(self.cdata4, expected_vector)
    check(self.cdata6, expected_matrix)

    with self.assertRaises(ValueError):
        multiplicative_replacement(self.bad1)
    with self.assertRaises(ValueError):
        multiplicative_replacement(self.bad2)

    # guard against in-place modification of the input
    multiplicative_replacement(self.cdata4)
    npt.assert_allclose(self.cdata4, np.array([1, 2, 3, 0, 5]))
def test_clr(self):
    """Check ``clr`` on the closed ``cdata1``/``cdata2``/``cdata5`` fixtures.

    Expected values are computed by hand as ``log(x / geometric_mean(x))``.
    Also asserts ``ValueError`` for ``bad1``/``bad2`` and that ``cdata2``
    is not modified in place.
    """
    comp_a = np.array([.2, .2, .6])
    comp_b = np.array([.4, .4, .2])
    clr_a = np.log(comp_a / np.exp(np.log(comp_a).mean()))
    clr_b = np.log(comp_b / np.exp(np.log(comp_b).mean()))

    transformed = clr(closure(self.cdata1))
    npt.assert_allclose(transformed, [clr_a, clr_b])

    transformed = clr(closure(self.cdata2))
    npt.assert_allclose(transformed, clr_a)

    transformed = clr(closure(self.cdata5))
    npt.assert_allclose(transformed, [clr_a, clr_b])

    with self.assertRaises(ValueError):
        clr(self.bad1)
    with self.assertRaises(ValueError):
        clr(self.bad2)

    # the fixture must not be altered by the call
    clr(self.cdata2)
    npt.assert_allclose(self.cdata2, np.array([2, 2, 6]))
def test_closure_warning(self):
    """``closure`` must raise ``ValueError`` for inputs with an all-zero row."""
    zero_row_inputs = (
        [0., 0., 0.],
        [[0., 0., 0.],
         [0., 5., 5.]],
    )
    for composition in zero_row_inputs:
        with self.assertRaises(ValueError):
            closure(composition)
def generate_band_table(mu, sigma, gradient, n_species,
                        lam, n_contaminants, library_size=10000):
    """ Generates a band table with normal variables.

    Parameters
    ----------
    mu : pd.Series
        Vector of species optimal positions along gradient.
    sigma : float
        Standard deviation of the species normal distribution (it is
        passed as ``scale`` to ``norm.pdf``).
    gradient : array
        Vector of gradient values.
    n_species : int
        Number of species to simulate.  Used to build the species ids and
        to select the ground-truth columns; presumably expected to equal
        ``len(mu)`` - TODO confirm against callers.
    lam : float
        Decay constant for contaminant urn (assumes that the contaminant
        urn follows an exponential distribution).
    n_contaminants : int
        Number of contaminant species.
    library_size : int, optional
        Only recorded in the returned metadata; it is not used to scale
        the table.

    Returns
    -------
    pd.DataFrame
        Ground truth table of proportions, rows named ``S<i>`` and columns
        named ``F<i>`` (species) followed by ``X<i>`` (contaminants).
    pd.DataFrame
        Metadata with the gradient value and the simulation settings for
        each sample, used for benchmarking.
    list of str
        Names of the first ``n_species`` columns of the table.
    """
    # one normal response curve per species, evaluated along the gradient
    xs = [norm.pdf(gradient, loc=mu[i], scale=sigma)
          for i in range(len(mu))]
    table = closure(np.vstack(xs).T)

    # the same exponentially decaying contaminant profile for every sample
    x = np.linspace(0, 1, n_contaminants)
    contaminant_urn = closure(expon.pdf(x, scale=lam))
    contaminant_urns = np.repeat(np.expand_dims(contaminant_urn, axis=0),
                                 table.shape[0], axis=0)
    table = closure(np.hstack((table, contaminant_urns)))

    s_ids = ['F%d' % i for i in range(n_species)]
    c_ids = ['X%d' % i for i in range(n_contaminants)]

    metadata = pd.DataFrame({'gradient': gradient})
    metadata['n_diff'] = len(mu)
    metadata['n_contaminants'] = n_contaminants
    metadata['library_size'] = library_size
    # back calculate the beta
    metadata['effect_size'] = np.max(mu) / np.max(gradient)
    metadata.index = ['S%d' % i for i in range(len(metadata.index))]

    table = pd.DataFrame(table)
    table.index = ['S%d' % i for i in range(len(table.index))]
    table.columns = s_ids + c_ids
    ground_truth = list(table.columns)[:n_species]
    return table, metadata, ground_truth
def setUp(self):
    """Build the probability vectors shared by the tests.

    ``tick_pvals`` comes from the positive counts of one row of the
    transposed BIOM matrix; ``uniform_pvals`` and ``exponential_pvals``
    are synthetic vectors of the same length.
    """
    data_dir = "../../data/tick/meshnick_tech_reps"
    biom_file = "%s/373_otu_table.biom" % data_dir
    table = load_table(biom_file)

    # NOTE(review): relies on the private Table._get_sparse_data accessor
    Z = 1  # row of the transposed matrix used as the reference profile
    mat = np.array(table._get_sparse_data().todense()).T
    x = np.ravel(mat[Z, :])

    self.tick_pvals = closure(np.array(x[x > 0]))
    n_parts = len(self.tick_pvals)
    self.uniform_pvals = closure(np.array([10000] * n_parts))
    self.exponential_pvals = closure(np.exp(np.linspace(0, 4, n_parts)))
def setUp(self):
    """Prepare the probability vectors used by the tests.

    The observed vector (``tick_pvals``) is the closure of the positive
    counts in one row of the transposed BIOM matrix.  The uniform and
    exponential vectors are generated with the same number of parts.
    """
    data_dir = "../../data/tick/meshnick_tech_reps"
    biom_file = "%s/373_otu_table.biom" % data_dir
    table = load_table(biom_file)

    # NOTE(review): relies on the private Table._get_sparse_data accessor
    Z = 1  # row of the transposed matrix used as the reference profile
    mat = np.array(table._get_sparse_data().todense()).T
    x = np.ravel(mat[Z, :])
    self.tick_pvals = closure(np.array(x[x > 0]))

    size = len(self.tick_pvals)
    self.uniform_pvals = closure(np.array([10000] * size))
    self.exponential_pvals = closure(np.exp(np.linspace(0, 4, size)))
def test_exponential_uniform(self):
    """Check ``brive`` estimates on a 500-read draw from the exponential urn.

    The estimates may undershoot the observed proportions by less than
    ``1.1 / 500`` and their total must stay below ``1 - robbins(...)``.
    """
    counts = np.random.multinomial(n=500, pvals=self.exponential_pvals)
    estimates = brive(counts, replace_zeros=False)
    observed = closure(counts)
    total_mass = estimates.sum()

    npt.assert_array_less(observed - estimates, 1.1 / 500)
    self.assertLess(total_mass, 1 - robbins(counts))
def _fit(self):
    """
    Validate ``self.X_`` and compute its robust centred log-ratio (rclr).

    The log of the closed table is centred, along the last axis, on the
    mean of its finite entries only; entries whose log is not finite
    (zeros in the input) are returned as ``nan``.  The result is stored
    in ``self.X_sp``.

    Raises
    ------
    ValueError
        If the table contains negative, infinite or nan values.

    Warns
    -----
    RuntimeWarning
        If the table contains no zero entries.
    """
    X_ = self.X_.copy().astype(float)

    if (X_ < 0).any():
        raise ValueError('Array Contains Negative Values')
    if np.count_nonzero(np.isinf(X_)) != 0:
        raise ValueError('Data-table contains either np.inf or -np.inf')
    if np.count_nonzero(np.isnan(X_)) != 0:
        raise ValueError('Data-table contains nans')
    # Count the zero entries; the previous check counted the non-zero
    # ones, so the warning could only fire for an all-zero table.
    if np.count_nonzero(X_ == 0) == 0:
        warnings.warn("Data-table contains no zeros.", RuntimeWarning)

    X_log = np.log(closure(np.array(X_)))
    # mask every entry whose log is not finite (log(0) = -inf)
    log_mask = ~np.isfinite(X_log)

    # centre on the mean over the last axis, ignoring masked entries
    m = np.ma.array(X_log, mask=log_mask)
    gm = m.mean(axis=-1, keepdims=True)
    m = (m - gm).squeeze().data
    m[log_mask] = np.nan
    self.X_sp = m
def test_permutative_f_scaled(self):
    """Run ``ancom`` with ``'permutative-anova'`` on closed (scaled) counts.

    Checks the expected ``W``/``reject`` table and that neither the input
    table nor the grouping series is modified by the call.
    """
    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8']
    feature_ids = ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
    counts = [[12, 11, 10, 10, 10, 10, 10],
              [9, 11, 12, 10, 10, 10, 10],
              [1, 11, 10, 11, 10, 5, 9],
              [2, 11, 10, 11, 10, 5, 9],
              [221, 210, 9, 10, 10, 10, 10],
              [220, 210, 9, 10, 10, 10, 10],
              [200, 220, 10, 10, 13, 10, 10],
              [230, 210, 14, 10, 10, 10, 10]]

    test_table = pd.DataFrame(closure(counts),
                              index=sample_ids, columns=feature_ids)
    test_cats = pd.Series([0, 0, 0, 0, 1, 1, 1, 1], index=sample_ids)

    np.random.seed(0)
    original_table = copy.deepcopy(test_table)
    original_cats = copy.deepcopy(test_cats)

    result = ancom(test_table, test_cats,
                   significance_test='permutative-anova')

    # the input table must not have been altered
    assert_data_frame_almost_equal(original_table, test_table)
    # the grouping series must not have been altered
    pdt.assert_series_equal(original_cats, test_cats)

    exp = pd.DataFrame(
        {'W': np.array([5, 5, 2, 2, 2, 2, 2]),
         'reject': np.array([True, True, False, False,
                             False, False, False], dtype=bool)},
        index=feature_ids)
    assert_data_frame_almost_equal(result, exp)
def normal_noise(nf, ns, hodepth, kappa):
    """
    Simulate normally distributed noise pushed through a
    lognormal-Poisson sampler.

    An ``(nf, ns)`` matrix of ``|N(1, 0.2)|`` values is closed and scaled
    by ``hodepth``; each of the ``ns`` columns is then sampled as
    ``poisson(lognormal(log(mu), kappa))``.  Returns an ``(nf, ns)``
    array.
    """
    base = abs(normal(1, 0.2, (nf, ns)))
    mu = hodepth * closure(base.T).T

    sampled_columns = []
    for col in range(ns):
        rates = lognormal(np.log(mu[:, col]), kappa)
        sampled_columns.append(poisson(rates))
    return np.vstack(sampled_columns).T
def resample_counts(X, depth, kappa=1):
    """Redraw every row of ``X`` from a Poisson-lognormal model.

    Rows of ``X`` are closed and scaled by ``depth`` to give the mean
    ``mu``; each row is then sampled as
    ``poisson(lognormal(log(mu_row), kappa))``.  The sampled rows are
    returned stacked in the original row order.
    """
    mu = depth * closure(X)

    redrawn = []
    for row in range(len(X)):
        rates = lognormal(np.log(mu[row, :]), kappa)
        redrawn.append(poisson(rates))
    return np.vstack(redrawn)
def train_count_parameters(data):
    """ Given noisy data, try to learn the count noise parameters.

    This assumes that there is only a single underlying urn, so the
    multinomial probabilities are just an aggregate of all of the counts.

    Parameters
    ----------
    data : array_like
        A matrix of counts with `n` rows and `m` columns, where `n`
        corresponds to the number of samples and `m` corresponds to the
        number of species.

    Returns
    -------
    lam : float
        Poisson parameter for generating sequencing depths (the mean of
        the row sums).
    p : np.array
        Vector of multinomial probabilities (the closed column sums).
    """
    per_sample_depth = data.sum(axis=1)
    pooled_counts = data.sum(axis=0)
    return per_sample_depth.mean(), closure(pooled_counts)
def test_exponential_uniform(self):
    """Bound the ``brive`` estimates for a draw from the exponential urn.

    Draws 500 reads, then checks that no observed proportion exceeds its
    estimate by ``1.1 / 500`` or more and that the estimates sum to less
    than ``1 - robbins(sample)``.
    """
    sample = np.random.multinomial(n=500, pvals=self.exponential_pvals)
    estimated = brive(sample, replace_zeros=False)
    shortfall = closure(sample) - estimated
    estimated_total = estimated.sum()

    npt.assert_array_less(shortfall, 1.1/500)
    self.assertLess(estimated_total, 1 - robbins(sample))
def test_inverse_rclr(self):
    """Round-trip ``cdata1`` through the rclr transform and its inverse.

    The recovered table, rounded to one decimal, must match the closure
    of the input.
    """
    transformed = self._rclr.fit_transform(self.cdata1)
    recovered = self._inv.fit_transform(transformed)
    npt.assert_allclose(closure(self.cdata1), np.around(recovered, 1))
    # inverse can not take zero, nan, or inf values (value error);
    # no assertion for that case is present here
def Subsample(X_noise, spar, num_samples):
    """
    yij ~ PLN( lambda_{ij}, /phi )

    Closes ``X_noise`` (via its transpose), scales it by ``spar`` and
    draws, for each of the first ``num_samples`` columns,
    ``poisson(lognormal(log(mu), 1))``.  Returns the draws with one
    column per sample.
    """
    mu = spar * closure(X_noise.T).T

    draws = []
    for col in range(num_samples):
        draws.append(poisson(lognormal(np.log(mu[:, col]), 1)))

    # no additional sparsity step is applied to the draws
    return np.vstack(draws).T
def test_composition_variable_features(self):
    """Check the second table produced by the variable-features generator.

    Compares the table, the metadata (with columns sorted) and the list
    of truly changing features against hand-written expectations.
    """
    gen = compositional_variable_features_generator(
        max_changing=2, fold_change=2, reps=5,
        intervals=2, n_species=5,
        fold_balance=False, n_contaminants=2, lam=0.1)
    # the second item yielded by the generator is the one under test
    next(gen)
    table, metadata, truth = next(gen)

    sample_ids = ['S0', 'S1', 'S2', 'S3', 'S4',
                  'S5', 'S6', 'S7', 'S8', 'S9']
    contaminants = [0.499977, 0.00002269]
    group0 = np.array([0.142857]*2 + [0.071429]*3 + contaminants)
    group1 = np.array([0.071429]*3 + [0.142857]*2 + contaminants)
    exp_table = pd.DataFrame(
        closure(np.vstack([group0] * 5 + [group1] * 5)),
        index=sample_ids,
        columns=['F0', 'F1', 'F2', 'F3', 'F4', 'X0', 'X1'])

    # NOTE(review): check_less_precise is deprecated in recent pandas;
    # kept because its replacement (rtol/atol) needs pandas >= 1.1.
    pdt.assert_frame_equal(table, exp_table, check_less_precise=True)

    exp_metadata = pd.DataFrame(
        {'group': [0] * 5 + [1] * 5,
         'n_diff': [4] * 10,
         'effect_size': [2] * 10,
         'library_size': [10000] * 10},
        index=sample_ids)

    # DataFrame.reindex_axis was removed from pandas; reindex(columns=...)
    # is the equivalent call and works on old and new versions alike.
    metadata = metadata.reindex(columns=sorted(metadata.columns))
    exp_metadata = exp_metadata.reindex(
        columns=sorted(exp_metadata.columns))
    pdt.assert_frame_equal(metadata, exp_metadata)

    exp_truth = ['F0', 'F1', 'F3', 'F4']
    self.assertListEqual(truth, exp_truth)
def test_centralize(self):
    """Check ``centralize`` on the closed ``data1`` and ``data5`` fixtures.

    Also asserts ``ValueError`` for ``bad1``/``bad2`` and that ``data1``
    is unchanged after a call.
    """
    centred = np.array([[0.22474487, 0.22474487, 0.55051026],
                        [0.41523958, 0.41523958, 0.16952085]])

    for fixture in (self.data1, self.data5):
        npt.assert_allclose(centralize(closure(fixture)), centred)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            centralize(invalid)

    # the fixture must not be altered by the call
    centralize(self.data1)
    npt.assert_allclose(self.data1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def gradient(nf, ns, kappa=0.1, depth=100, sigma=2.0, g_min=0, gmax=10):
    """
    Poisson-lognormal simulation along a gradient.

    ``ns`` gradient positions are spaced evenly between ``g_min`` and
    ``gmax`` and ``nf`` feature optima evenly between 0 and 10; both are
    handed to ``chain`` together with a constant ``sigma`` per feature.
    The result ``x`` is closed (via its transpose), scaled by ``depth``
    and sampled column by column as
    ``poisson(lognormal(log(mu), kappa))``.

    Returns the tuple ``(x, y)`` of the ``chain`` output and the sampled
    counts.
    """
    widths = [sigma] * nf
    positions = np.linspace(g_min, gmax, ns)
    optima = np.linspace(0, 10, nf)
    x = chain(positions, mu=optima, sigma=widths)

    rates = depth * closure(x.T).T
    sampled = []
    for col in range(ns):
        sampled.append(poisson(lognormal(np.log(rates[:, col]), kappa)))
    return x, np.vstack(sampled).T
def test_closure(self):
    """Check ``closure`` on the ``cdata1``/``cdata2``/``cdata5`` fixtures.

    Also asserts ``ValueError`` for ``bad1``/``bad2`` and that ``cdata2``
    is not modified in place.
    """
    closed_matrix = np.array([[.2, .2, .6],
                              [.4, .4, .2]])
    closed_vector = np.array([.2, .2, .6])

    cases = ((self.cdata1, closed_matrix),
             (self.cdata2, closed_vector),
             (self.cdata5, closed_matrix))
    for fixture, expected in cases:
        npt.assert_allclose(closure(fixture), expected)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            closure(invalid)

    # the fixture must not be altered by the call
    closure(self.cdata2)
    npt.assert_allclose(self.cdata2, np.array([2, 2, 6]))
def test_centralize(self):
    """Verify ``centralize`` for the closed ``cdata1``/``cdata5`` fixtures.

    Includes the ``ValueError`` path for ``bad1``/``bad2`` and a check
    that ``cdata1`` is left unmodified.
    """
    expected = np.array([[0.22474487, 0.22474487, 0.55051026],
                         [0.41523958, 0.41523958, 0.16952085]])

    result = centralize(closure(self.cdata1))
    npt.assert_allclose(result, expected)

    result = centralize(closure(self.cdata5))
    npt.assert_allclose(result, expected)

    with self.assertRaises(ValueError):
        centralize(self.bad1)
    with self.assertRaises(ValueError):
        centralize(self.bad2)

    # guard against in-place modification of the input
    centralize(self.cdata1)
    untouched = np.array([[2, 2, 6],
                          [4, 4, 2]])
    npt.assert_allclose(self.cdata1, untouched)
def test_power(self):
    """Check ``power`` with exponent 2 on closed fixtures.

    Covers ``data1``, ``data2`` and ``data5``, the ``ValueError`` raised
    for ``bad1`` and the guarantee that ``data2`` is not modified.
    """
    squared_matrix = np.array([[.04/.44, .04/.44, .36/.44],
                               [.16/.36, .16/.36, .04/.36]])
    squared_vector = np.array([.04, .04, .36])/.44

    npt.assert_allclose(power(closure(self.data1), 2), squared_matrix)
    npt.assert_allclose(power(closure(self.data2), 2), squared_vector)
    npt.assert_allclose(power(closure(self.data5), 2), squared_matrix)

    with self.assertRaises(ValueError):
        power(self.bad1, 2)

    # the fixture must not be altered by the call
    power(self.data2, 4)
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))
def test_centralize(self):
    """Check ``centralize`` against reference values.

    Uses the closed ``cdata1`` and ``cdata5`` fixtures, expects
    ``ValueError`` for ``bad1``/``bad2`` and confirms ``cdata1`` is not
    modified in place.
    """
    reference = np.array([[0.22474487, 0.22474487, 0.55051026],
                          [0.41523958, 0.41523958, 0.16952085]])

    def assert_centred(fixture):
        npt.assert_allclose(centralize(closure(fixture)), reference)

    assert_centred(self.cdata1)
    assert_centred(self.cdata5)

    for invalid in (self.bad1, self.bad2):
        with self.assertRaises(ValueError):
            centralize(invalid)

    # make sure the input array is returned to the caller unchanged
    centralize(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def random_noise(nf, ns, hedepth, kappa):
    """
    Simulate heteroscedastic noise pushed through a lognormal-Poisson
    sampler.

    Starts from an ``(nf, ns)`` matrix of ``|N(1, 0.2)|`` values, redraws
    it with a standard deviation of 1 everywhere except at 5000 randomly
    chosen positions (drawn with replacement) where it is ``hedepth``,
    closes and scales the result by ``hedepth`` and finally samples each
    column as ``poisson(lognormal(log(mu), kappa))``.
    """
    base = abs(normal(1, 0.2, (nf, ns)))

    # per-entry standard deviation: 1 by default, hedepth at random spots
    spread = np.ones_like(base)
    rows = randint(0, spread.shape[0], 5000)
    cols = randint(0, spread.shape[1], 5000)
    spread[rows, cols] = hedepth

    perturbed = abs(normal(base, spread))
    mu = hedepth * closure(perturbed.T).T

    sampled = []
    for col in range(ns):
        sampled.append(poisson(lognormal(np.log(mu[:, col]), kappa)))
    return np.vstack(sampled).T
def test_closure(self):
    """Verify ``closure`` on the ``data1``/``data2``/``data5`` fixtures.

    Includes the ``ValueError`` path for ``bad1``/``bad2`` and a check
    that ``data2`` is left unmodified.
    """
    expected_rows = np.array([[.2, .2, .6],
                              [.4, .4, .2]])

    closed = closure(self.data1)
    npt.assert_allclose(closed, expected_rows)

    closed = closure(self.data2)
    npt.assert_allclose(closed, np.array([.2, .2, .6]))

    closed = closure(self.data5)
    npt.assert_allclose(closed, expected_rows)

    with self.assertRaises(ValueError):
        closure(self.bad1)
    with self.assertRaises(ValueError):
        closure(self.bad2)

    # guard against in-place modification of the input
    closure(self.data2)
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))
def test_ilr_inv_basis(self):
    """``ilr_inv`` with an explicit basis must recover the closed table."""
    raw = np.array([[1., 10.],
                    [1.14141414, 9.90909091],
                    [1.28282828, 9.81818182],
                    [1.42424242, 9.72727273],
                    [1.56565657, 9.63636364]])
    expected = closure(raw)

    basis = np.array([[0.80442968, 0.19557032]])

    # one balance per sample: log ratio of the two parts times sqrt(1/2)
    balances = [np.log(1/10)*np.sqrt(1/2),
                np.log(1.14141414 / 9.90909091)*np.sqrt(1/2),
                np.log(1.28282828 / 9.81818182)*np.sqrt(1/2),
                np.log(1.42424242 / 9.72727273)*np.sqrt(1/2),
                np.log(1.56565657 / 9.63636364)*np.sqrt(1/2)]
    table = np.array([balances]).T

    recovered = ilr_inv(table, basis=basis)
    npt.assert_allclose(recovered, expected)
def aitchison_transform_part(df, use_multiplicative_replacement=True):
    """
    Aitchison transformation on df with all columns belonging to the
    same batch.

    df should consist of all samples tagged together in one channel
    (i.e. A549_S_rep1 etc.)

    Parameters
    ----------
    df : pd.DataFrame
        Table to transform; its index and columns are carried over to
        the result.
    use_multiplicative_replacement : bool, optional
        If true (the default) apply ``multiplicative_replacement`` to
        ``df``, otherwise apply ``closure``.

    Returns
    -------
    pd.DataFrame
        Transformed values with the same index and columns as ``df``.
    """
    # plain truthiness test instead of the non-idiomatic `== True`
    if use_multiplicative_replacement:
        transformed = multiplicative_replacement(df)
    else:
        transformed = closure(df)

    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
def partition_metabolites(uU, sigmaU, uV, sigmaV,
                          num_metabolites, latent_dim,
                          microbe_partition, metabolite_in,
                          state):
    """ Split up a single chemical abundances into multiple subspecies.

    Parameters
    ----------
    uU, sigmaU, uV, sigmaV : scalar
        Mean and standard deviation of the normal distributions that the
        latent matrices ``U`` and ``V`` are drawn from.
    num_metabolites : int
        Number of chemicals to be represented
    latent_dim : int
        Number of latent dimensions in conditional probability matrix.
    microbe_partition : np.array
        The input microbial abundances for multiple strains.  The number
        of strains is taken from its second dimension.
    metabolite_in : np.array
        The input intensities for a single chemical
    state : numpy random state
        Random number generator

    Returns
    -------
    U: np.array
        Microbial latent variables, of shape (num_microbes, latent_dim).
    V: np.array
        Metabolomic latent variables, of shape
        (latent_dim, num_metabolites).
    metabolites_out: np.array
        Multiple chemical abundances.
    """
    num_microbes = microbe_partition.shape[1]

    U = state.normal(uU, sigmaU, size=(num_microbes, latent_dim))
    V = state.normal(uV, sigmaV, size=(latent_dim, num_metabolites))

    # Randomly generate conditional probability matrices
    # Question : how to incorporate the existing abundances?
    probs = softmax(U @ V)

    # for each submicrobe strain, generate metabolite distribution
    metabolite_partition = closure(microbe_partition @ probs)

    # Scale every sample's partition by that sample's input intensity
    metabolites_out = np.multiply(metabolite_partition,
                                  metabolite_in.reshape(-1, 1))

    return U, V, metabolites_out
def test_ilr_inv(self):
    """Check ``ilr_inv`` as the inverse of ``ilr`` and on the identity.

    Also asserts ``ValueError`` when ``cdata1`` is passed as the basis
    and that ``cdata1`` is not modified in place.
    """
    composition = closure(self.cdata7)
    round_trip = ilr_inv(ilr(composition))
    npt.assert_array_almost_equal(round_trip, composition)

    from_identity = ilr_inv(np.identity(3))
    npt.assert_allclose(from_identity, self.ortho1,
                        rtol=1e-04, atol=1e-06)

    with self.assertRaises(ValueError):
        ilr_inv(self.cdata1, basis=self.cdata1)

    # the fixture must not be altered by the call
    ilr_inv(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def test_ilr(self):
    """Check ``ilr`` on the closed ``cdata7`` fixture and on ``ortho1``.

    Also asserts ``ValueError`` when ``cdata1`` is passed as the basis
    and that ``cdata1`` is not modified in place.
    """
    composition = closure(self.cdata7)
    npt.assert_array_almost_equal(ilr(composition),
                                  np.array([0.70710678, 0.40824829]))

    # Should give same result as inner
    coordinates = ilr(self.ortho1)
    npt.assert_allclose(coordinates, np.identity(3),
                        rtol=1e-04, atol=1e-06)

    with self.assertRaises(ValueError):
        ilr(self.cdata1, basis=self.cdata1)

    # the fixture must not be altered by the call
    ilr(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def multinomial_bioms(k, D, N, M, min_sv=0.11, max_sv=5.0, sigma_sq=0.1):
    """ Simulates biom tables from multinomial.

    Parameters
    ----------
    k : int
        Number of latent dimensions.
    D : int
        Number of microbes.
    N : int
        Number of samples.
    M : int
        Average sequencing depth.
    min_sv : float, optional
        Smallest of the ``k`` eigenvalues, which are spaced linearly
        between ``min_sv`` and ``max_sv``.
    max_sv : float, optional
        Largest of the ``k`` eigenvalues.
    sigma_sq : float, optional
        Noise variance.  It is subtracted from the eigenvalues before a
        square root is taken, so it should not exceed ``min_sv``.

    Returns
    -------
    dict of np.array
        Ground truth parameters, keyed by ``sigma``, ``W``, ``Psi``,
        ``tree``, ``eta``, ``z``, ``Y``, ``depths``, ``eigs`` and
        ``eigvectors``.
    """
    eigs = min_sv + (max_sv - min_sv) * np.linspace(0, 1, k)
    eigvectors = ortho_group.rvs(D - 1)[:, :k]
    W = np.matmul(eigvectors, np.diag(np.sqrt(eigs - sigma_sq)))

    sigma = np.sqrt(sigma_sq)
    z = np.random.normal(size=(N, k))
    eta = np.random.normal(np.matmul(z, W.T), sigma).astype(np.float32)

    tree = random_linkage(D)
    Psi = _balance_basis(tree)[0]
    prob = closure(np.exp(eta @ Psi))

    # one multinomial draw per sample, at a Poisson-distributed depth
    depths = np.random.poisson(M, size=N)
    Y = np.vstack([np.random.multinomial(depths[i], prob[i])
                   for i in range(N)])

    return dict(
        sigma=sigma,
        W=W,
        Psi=Psi,
        tree=tree,
        eta=eta,
        z=z,
        Y=Y,
        depths=depths,
        eigs=eigs,
        eigvectors=eigvectors
    )
def test_ancom_basic_proportions(self):
    """Run ``ancom`` on ``table1`` converted from counts to proportions.

    Checks the expected ``W``/``reject`` table and that neither input is
    modified by the call.
    """
    # Converts from counts to proportions
    test_table = pd.DataFrame(closure(self.table1))
    test_cats = pd.Series(self.cats1)
    original_table = copy.deepcopy(test_table)
    original_cats = copy.deepcopy(test_cats)

    result = ancom(test_table, test_cats,
                   multiple_comparisons_correction=None)

    # the input table must not have been altered
    assert_data_frame_almost_equal(original_table, test_table)
    # the grouping series must not have been altered
    pdt.assert_series_equal(original_cats, test_cats)

    expected_w = np.array([5, 5, 2, 2, 2, 2, 2])
    expected_reject = np.array([True, True, False, False,
                                False, False, False], dtype=bool)
    exp = pd.DataFrame({'W': expected_w, 'reject': expected_reject})
    assert_data_frame_almost_equal(result, exp)
def split_balance(balance, tree):
    """ Splits a balance into its log ratio components.

    Parameters
    ----------
    balance : pd.Series
        A vector corresponding to a single balance.  These values will
        be split into numerator and denominator components.
    tree : tree object
        Tree in which the node named ``balance.name`` is looked up with
        ``tree.find``.

    Returns
    -------
    pd.DataFrame
        Dataframe where the first column contains the numerator and the
        second column contains the denominator of the balance.

    Raises
    ------
    ValueError
        If the node named ``balance.name`` is a tip.

    Note
    ----
    The balance must have a name associated with it.
    """
    def count_tips(clade):
        # a tip counts as a single leaf
        if clade.is_tip():
            return 1
        return len(list(clade.tips()))

    node = tree.find(balance.name)
    if node.is_tip():
        raise ValueError("%s is not a balance." % balance.name)

    left, right = node.children[0], node.children[1]
    L = count_tips(left)
    R = count_tips(right)

    # need to scale down by the number of children in subtrees
    scale = np.sqrt((L * R) / (L + R))
    ratio = np.exp(np.expand_dims(balance.values, axis=1) / scale)

    # pair every ratio with 1 and close the pair into proportions
    pairs = np.hstack((ratio, np.ones((len(ratio), 1))))
    return pd.DataFrame(closure(pairs),
                        columns=[left.name, right.name],
                        index=balance.index)
def split_balance(balance, tree):
    """ Splits a balance into its log ratio components.

    Parameters
    ----------
    balance : pd.Series
        A vector corresponding to a single balance.  These values will
        be split into numerator and denominator components.
    tree : tree object
        Tree that contains a node named ``balance.name``.

    Returns
    -------
    pd.DataFrame
        Dataframe where the first column contains the numerator and the
        second column contains the denominator of the balance.

    Raises
    ------
    ValueError
        If the node named ``balance.name`` is a tip.

    Note
    ----
    The balance must have a name associated with it.
    """
    node = tree.find(balance.name)
    if node.is_tip():
        raise ValueError("%s is not a balance." % balance.name)

    numerator_clade = node.children[0]
    denominator_clade = node.children[1]

    # number of leaves under each child (a tip counts as one)
    L = 1 if numerator_clade.is_tip() else len(list(numerator_clade.tips()))
    R = (1 if denominator_clade.is_tip()
         else len(list(denominator_clade.tips())))

    values = np.expand_dims(balance.values, axis=1)
    # need to scale down by the number of children in subtrees
    values = np.exp(values / (np.sqrt((L*R) / (L + R))))

    ones = np.ones((len(values), 1))
    proportions = closure(np.hstack((values, ones)))
    return pd.DataFrame(proportions,
                        columns=[numerator_clade.name,
                                 denominator_clade.name],
                        index=balance.index)
def test_multinomial_sample(self):
    """Check ``multinomial_sample`` output for a fixed random seed."""
    rng = RandomState(0)
    profile_a = [8.76415025e-03, 4.97385694e-02, 1.40955938e-01,
                 1.99471140e-01, 1.40955938e-01, 4.97385694e-02,
                 8.76415025e-03, 7.71139498e-04, 3.38815049e-05,
                 7.43359757e-07]
    profile_b = [7.43359757e-07, 3.38815049e-05, 7.71139498e-04,
                 8.76415025e-03, 4.97385694e-02, 1.40955938e-01,
                 1.99471140e-01, 1.40955938e-01, 4.97385694e-02,
                 8.76415025e-03]
    proportions = closure(np.array([profile_a, profile_b]))

    lam = 5
    sampled = multinomial_sample(proportions, lam, rng)

    expected = np.array([[0, 2, 3, 3, 0, 1, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 3, 1, 1, 0, 0]])
    npt.assert_allclose(sampled, expected)
def test_ilr_basis_isomorphism(self):
    """``ilr`` and ``ilr_inv`` must invert each other for a given basis."""
    # tests to make sure that the isomorphism holds
    # with the introduction of the basis.
    basis = np.array([[0.80442968, 0.19557032]])

    # coordinates -> composition -> coordinates
    coords = [np.log(1/10)*np.sqrt(1/2),
              np.log(1.14141414 / 9.90909091)*np.sqrt(1/2),
              np.log(1.28282828 / 9.81818182)*np.sqrt(1/2),
              np.log(1.42424242 / 9.72727273)*np.sqrt(1/2),
              np.log(1.56565657 / 9.63636364)*np.sqrt(1/2)]
    table = np.array([coords]).T
    composition = ilr_inv(table, basis=basis)
    res = ilr(composition, basis=basis)
    npt.assert_allclose(res, table.squeeze())

    # composition -> coordinates -> composition
    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])
    as_column = np.atleast_2d(ilr(table, basis=basis)).T
    res = ilr_inv(as_column, basis=basis)
    npt.assert_allclose(res, closure(table.squeeze()))
def test_ilr_basis_isomorphism(self):
    """Round-trip check of ``ilr``/``ilr_inv`` with an explicit basis.

    Makes sure that the isomorphism holds with the introduction of the
    basis, in both directions.
    """
    basis = np.array([[0.80442968, 0.19557032]])

    balance_values = [
        np.log(1 / 10) * np.sqrt(1 / 2),
        np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
        np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
        np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
        np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2),
    ]
    table = np.array([balance_values]).T

    # starting from coordinates
    res = ilr(ilr_inv(table, basis=basis), basis=basis)
    npt.assert_allclose(res, table.squeeze())

    # starting from a composition
    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])
    coordinates = np.atleast_2d(ilr(table, basis=basis)).T
    res = ilr_inv(coordinates, basis=basis)
    npt.assert_allclose(res, closure(table.squeeze()))
def variation_matrix(X):
    r""" Calculate Aitchison variation matrix.

    This calculates the Aitchison variation matrix.  Given a
    compositional matrix :math:`X`, and columns :math:`i` and :math:`j`,
    the :math:`ij` entry in the variation matrix of :math:`X` is given by

    .. math::

        V_{ij} = \frac{1}{2} var(\ln \frac{x_i}{x_j})

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where there are n rows corresponding to samples
        and p features corresponding to columns.

    Returns
    -------
    skbio.DistanceMatrix
        Variation matrix of size p x p, labelled with the columns of X.

    References
    ----------
    .. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015),
       Modeling and Analysis of Compositional Data, Wiley, Chichester, UK

    .. [2] J. J. Egozcue, V. Pawlowsky-Glahn (2004), Groups of Parts and
       Their Balances in Compositional Data Analysis, Mathematical Geology
    """
    n_features = X.shape[1]
    v = np.zeros((n_features, n_features))
    x = closure(X)

    # fill the strict lower triangle with var(ln(x_i / x_j))
    for i in range(n_features):
        for j in range(i):
            v[i, j] = np.var(np.log(x[:, i]) - np.log(x[:, j]))

    # Making matrix symmetry since V(ln (x/y) ) = V(ln (y/x) )
    # Also dividing by 2, to ensure unit norm for balances.
    # See Eqn 4 in [2]
    return DistanceMatrix((v + v.T) / 2, ids=X.columns)
def split_balance(self, balance_name):
    """ Splits a balance into its log ratio components.

    Parameters
    ----------
    balance_name : str
        Name of the internal node in ``self.tree`` whose balance, stored
        in the column of the same name in ``self.balances``, is split.

    Returns
    -------
    pd.DataFrame
        Dataframe where the first column contains the numerator and the
        second column contains the denominator of the balance.

    Raises
    ------
    ValueError
        If ``balance_name`` refers to a tip rather than an internal node.
    """
    node = self.tree.find(balance_name)

    if node.is_tip():
        raise ValueError("%s is not a balance." % balance_name)

    left = node.children[0]
    right = node.children[1]

    # number of leaves under each child (a tip counts as one)
    L = 1 if left.is_tip() else len(list(left.tips()))
    R = 1 if right.is_tip() else len(list(right.tips()))

    b = np.expand_dims(self.balances[balance_name].values, axis=1)

    # need to scale down by the number of children in subtrees
    b = np.exp(b / (np.sqrt((L * R) / (L + R))))

    # pair every ratio with 1 and close the pair into proportions
    o = np.ones((len(b), 1))
    k = np.hstack((b, o))
    p = closure(k)

    return pd.DataFrame(p, columns=[left.name, right.name],
                        index=self.balances.index)
def test_perturb_inv(self):
    """Check ``perturb_inv`` against ``perturb`` with the reciprocal vector.

    Also covers perturbation by a uniform composition, the
    ``ValueError`` raised for ``bad1`` and the guarantee that ``data2``
    is not modified in place.
    """
    # dividing by [.1, .1, .1] must equal multiplying by [10, 10, 10]
    inverse = perturb_inv(closure(self.data1), closure([.1, .1, .1]))
    forward = perturb(closure(self.data1), closure([10, 10, 10]))
    npt.assert_allclose(inverse, forward)

    inverse = perturb_inv(closure(self.data1), closure([1, 1, 1]))
    npt.assert_allclose(inverse,
                        closure([[.2, .2, .6],
                                 [.4, .4, .2]]))

    inverse = perturb_inv(closure(self.data5), closure([.1, .1, .1]))
    forward = perturb(closure(self.data1), closure([10, 10, 10]))
    npt.assert_allclose(inverse, forward)

    with self.assertRaises(ValueError):
        perturb_inv(closure(self.data1), self.bad1)

    # the fixture must not be altered by the call
    perturb_inv(self.data2, [1, 2, 3])
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))
def test_perturb(self):
    """Check ``perturb`` on closed fixtures against hand-computed values.

    Covers ``data1``, ``data2`` and ``data5``, the ``ValueError`` raised
    for ``bad1`` and the guarantee that ``data2`` is not modified in
    place.
    """
    unchanged = np.array([[.2, .2, .6],
                          [.4, .4, .2]])

    # perturbing by a uniform composition leaves the data unchanged
    pmat = perturb(closure(self.data1),
                   closure(np.array([1, 1, 1])))
    npt.assert_allclose(pmat, unchanged)

    # this case was asserted twice verbatim; once is enough
    pmat = perturb(closure(self.data1),
                   closure(np.array([10, 10, 20])))
    npt.assert_allclose(pmat,
                        np.array([[.125, .125, .75],
                                  [1./3, 1./3, 1./3]]))

    pmat = perturb(closure(self.data2),
                   closure([1, 2, 1]))
    npt.assert_allclose(pmat, np.array([1./6, 2./6, 3./6]))

    pmat = perturb(closure(self.data5),
                   closure(np.array([1, 1, 1])))
    npt.assert_allclose(pmat, unchanged)

    with self.assertRaises(ValueError):
        perturb(closure(self.data5), self.bad1)

    # make sure that inplace modification is not occurring
    perturb(self.data2, [1, 2, 3])
    npt.assert_allclose(self.data2, np.array([2, 2, 6]))