def _get_distance_matrix(self, X): """ computes UniFrac distances with the fitted samples Parameters ---------- X : biom.Table new samples Returns ------- dm : DistanceMatrix distances from old samples to new samples """ # TODO one problem with this approach is that # if any samples in X overlap self.table, the counts will # be doubled merged_table = self.table.merge(X) with self.hdf5_table(merged_table) as f: dm = ssu( f.name, self.tree_path, unifrac_method=self.unifrac_method, variance_adjust=False, alpha=1.0, bypass_tips=False, threads=1, ) return dm
def test_unweighted_root_eval_issue_46(self): tree = self.get_data_path('crawford.tre') table = self.get_data_path('crawford.biom') table_inmem = load_table(table) tree_inmem = skbio.TreeNode.read(tree) ids = table_inmem.ids() otu_ids = table_inmem.ids(axis='observation') cnts = table_inmem.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree_inmem) obs = ssu(table, tree, 'unweighted', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data) obs2 = unweighted(table, tree) npt.assert_almost_equal(obs2.data, exp.data) tmpfile = '/tmp/uf_ta_1.md5' unweighted_to_file(table, tree, tmpfile, pcoa_dims=0) try: obs3 = h5unifrac(tmpfile) npt.assert_almost_equal(obs3.data, exp.data) finally: os.unlink(tmpfile)
def transform(self, X): """ X : biom.Table """ # TODO one problem with this approach is that # if any samples in X overlap self.table, the counts will # be doubled merged_table = self.table.merge(X) with tempfile.NamedTemporaryFile() as f: with biom_open(f.name, 'w') as b: merged_table.to_hdf5(b, "merged") dm = ssu( f.name, self.tree_path, unifrac_method='unweighted', variance_adjust=False, alpha=1.0, bypass_tips=False, threads=1, ) # get indices of test ID's X_idx = [dm.index(name) for name in X.ids('sample')] # get indices of table ID's ref_idx = [dm.index(name) for name in self.table.ids('sample')] # extract sub-distance matrix idxs = np.ix_(X_idx, ref_idx) sub_dm = dm.data[idxs] return sub_dm
def weighted_unnormalized_fp32(table: str, phylogeny: str, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False ) -> skbio.DistanceMatrix: # noqa """Compute weighted unnormalized UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default is 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ _validate(table, phylogeny) return qsu.ssu(str(table), str(phylogeny), 'weighted_unnormalized_fp32', variance_adjusted, 1.0, bypass_tips, threads)
def unweighted(table: str, phylogeny: str, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: """Compute Unweighted UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default of 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ _validate(table, phylogeny) return qsu.ssu(table, phylogeny, 'unweighted', variance_adjusted, 1.0, bypass_tips, threads)
def test_meta_unifrac(self): t1 = self.get_data_path('t1.newick') e1 = self.get_data_path('e1.biom') result = ssu(e1, t1, 'unweighted', False, 1.0, False, 1) u1_distances = np.array([[0, 10 / 16., 8 / 13.], [10 / 16., 0, 8 / 17.], [8 / 13., 8 / 17., 0]]) npt.assert_almost_equal(u1_distances, result.data) self.assertEqual(tuple('ABC'), result.ids)
def _work(self, u_counts, v_counts, otu_ids, tree, method): data = np.array([u_counts, v_counts]).T bt = Table(data, otu_ids, ['u', 'v']) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) # return value is a distance matrix, get the distance from u->v return ssu(ta, tr, method, False, 1.0, False, 1)['u', 'v']
def test_unweighted_root_eval_issue_46(self): tree = self.get_data_path('crawford.tre') table = self.get_data_path('crawford.biom') table_inmem = load_table(table) tree_inmem = skbio.TreeNode.read(tree) ids = table_inmem.ids() otu_ids = table_inmem.ids(axis='observation') cnts = table_inmem.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree_inmem) obs = ssu(table, tree, 'unweighted', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data)
def generalized_fp32(table: str, phylogeny: str, threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: """Compute Generalized UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default is 1 alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ _validate(table, phylogeny) if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return weighted_normalized_fp32(table, phylogeny, threads, variance_adjusted) else: return qsu.ssu(str(table), str(phylogeny), 'generalized_fp32', variance_adjusted, alpha, bypass_tips, threads)
def test_ssu_bad_method(self): t1 = self.get_data_path('t1.newick') e1 = self.get_data_path('e1.biom') with self.assertRaisesRegex(ValueError, "Unknown method."): ssu(e1, t1, 'unweightedfoo', False, 1.0, False, 1)
def test_ssu_bad_table(self): t1 = self.get_data_path('t1.newick') with self.assertRaisesRegex(IOError, "Table file not found."): ssu('bad-file', t1, 'unweighted', False, 1.0, False, 1)
def test_ssu_bad_tree(self): e1 = self.get_data_path('e1.biom') with self.assertRaisesRegex(IOError, "Tree file not found."): ssu(e1, 'bad-file', 'unweighted', False, 1.0, False, 1)