def _pca(ranks_df: pd.DataFrame,
         n_components: int = None) -> (OrdinationResults, OrdinationResults):
    """Run PCA on ``ranks_df`` and package the scores and the loadings.

    Parameters
    ----------
    ranks_df : pd.DataFrame
        Table to decompose. Its index labels the scores and its columns
        label the loadings.
    n_components : int, optional
        Forwarded unchanged to ``PCA``.

    Returns
    -------
    (OrdinationResults, OrdinationResults)
        First the transformed rows of ``ranks_df``, then the loadings; each
        is stored in the ``samples`` slot of its own result. Both results
        carry the same eigenvalues and proportion explained.
    """
    model = PCA(n_components=n_components)
    model.fit(ranks_df)

    # coordinates of every row of ``ranks_df`` in component space
    scores = pd.DataFrame(model.transform(ranks_df), index=ranks_df.index)

    # components scaled by the square root of the explained variance; the
    # sign of every loading is flipped
    scaled_components = (-1 * model.components_.T
                         * np.sqrt(model.explained_variance_))
    loadings = pd.DataFrame(scaled_components, index=ranks_df.columns)

    eigenvalues = pd.Series(model.explained_variance_)

    def _as_ordination(coordinates):
        # both outputs differ only in what goes into ``samples``
        return OrdinationResults(
            short_method_name="PCA",
            long_method_name="Principal Components Analysis",
            eigvals=eigenvalues,
            samples=coordinates,
            features=None,
            biplot_scores=None,
            proportion_explained=pd.Series(model.explained_variance_ratio_))

    ores_scores = _as_ordination(scores)
    ores_loadings = _as_ordination(loadings)
    return ores_scores, ores_loadings
def procrustes_analysis(
    reference: OrdinationResults,
    other: OrdinationResults,
    dimensions: int = 5,
    permutations: int = 999
) -> (OrdinationResults, OrdinationResults, pd.DataFrame):
    """Fit ``other`` onto ``reference`` with a Procrustes transformation.

    Parameters
    ----------
    reference, other : OrdinationResults
        Ordinations to compare. Their ``samples`` tables must have the same
        shape and the same set of sample IDs (order may differ).
    dimensions : int, optional
        Number of leading axes to use in the fit.
    permutations : int, optional
        Forwarded to ``_procrustes_monte_carlo``.

    Returns
    -------
    (OrdinationResults, OrdinationResults, pd.DataFrame)
        The transformed reference, the transformed other (both truncated to
        ``dimensions`` axes and indexed like ``reference``), and the output
        of ``_procrustes_monte_carlo``.

    Raises
    ------
    ValueError
        If the sample tables differ in shape, if ``dimensions`` exceeds the
        number of available axes, or if the sample IDs differ.
    """
    if reference.samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    if reference.samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference
    diff = reference.samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # Make the matrices comparable. The aligned table is kept in a local
    # variable so that the caller's ``other`` object is left untouched.
    other_samples = other.samples.reindex(index=reference.samples.index)

    ref_values = reference.samples.values[:, :dimensions]
    other_values = other_samples.values[:, :dimensions]

    mtx1, mtx2, m2 = procrustes(ref_values, other_values)

    axes = reference.samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())

    info = _procrustes_monte_carlo(ref_values, other_values, m2,
                                   permutations)

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions]
        .copy())
    return out1, out2, info
def setUp(self):
    """Build the alpha vector, table, ordination and metadata fixtures."""
    sample_ids = ['a', 'b', 'c']
    axis_labels = ['PC{}'.format(number) for number in (1, 2, 3)]

    self.alpha = pd.Series([1, 2, 3], index=list('abc'))

    counts = np.asarray([[0, 0, 1],
                         [1, 3, 42]])
    self.biom = biom.Table(counts, ['O1', 'O2'], ['a', 'b', 'c'])

    # seeded so that the random coordinates are the same on every run
    np.random.seed(11)
    coordinates = np.random.randn(3, 3)

    self.ordination = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series([0.51236726, 0.30071909, 0.26791207],
                          index=axis_labels),
        samples=pd.DataFrame(coordinates,
                             index=sample_ids,
                             columns=axis_labels),
        proportion_explained=pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638],
            index=axis_labels))

    metadata_rows = [
        [':0', ':)', ':/'],
        [':D', 'xD', '<3'],
        [';L', ']:->', ':S'],
    ]
    self.metadata = pd.DataFrame(data=metadata_rows,
                                 index=list('abc'),
                                 columns=['foo', 'bar', 'baz'])
def _create_ordination_results(self):
    """Write a small PCoA result to a temp file and return its path.

    The file is created with ``mkstemp`` inside ``self.out_dir`` and has a
    ``.txt`` suffix.
    """
    axis_labels = ['PC1', 'PC2', 'PC3', 'PC4']
    sample_ids = [
        '1.SKM4.640180',
        '1.SKB8.640193',
        '1.SKD8.640184',
        '1.SKM9.640192',
        '1.SKB7.640196',
    ]
    coordinates = np.asarray([
        [-2.584, 1.739, 3.828, -1.944],
        [-2.710, -1.859, -8.648, 1.180],
        [2.350, 9.625, -3.457, -3.208],
        [2.614, -1.114, 1.476, 2.908],
        [2.850, -1.925, 6.232, 1.381],
    ])

    eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868],
        index=axis_labels)
    proportion_explained = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725],
        index=axis_labels)
    samples = pd.DataFrame(coordinates,
                           index=sample_ids,
                           columns=axis_labels)

    ord_res = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=samples,
        proportion_explained=proportion_explained)

    # only the path is needed; release the descriptor before writing
    fd, fp = mkstemp(suffix='.txt', dir=self.out_dir)
    close(fd)

    ord_res.write(fp)
    return fp
def test_str(self):
    """``str()`` of an ordination lists every section, 'N/A' when unset."""
    full_lines = [
        "Ordination results:",
        "\tMethod: Correspondance Analysis (CA)",
        "\tEigvals: 2",
        "\tProportion explained: N/A",
        "\tFeatures: 3x2",
        "\tSamples: 3x2",
        "\tBiplot Scores: N/A",
        "\tSample constraints: N/A",
        "\tFeature IDs: 'Species1', 'Species2', 'Species3'",
        "\tSample IDs: 'Site1', 'Site2', 'Site3'",
    ]
    exp = "\n".join(full_lines)
    obs = str(self.ordination_results)
    self.assertEqual(obs, exp)

    # all optional attributes missing
    minimal_lines = [
        "Ordination results:",
        "\tMethod: Principal Coordinate Analysis (PCoA)",
        "\tEigvals: 1",
        "\tProportion explained: N/A",
        "\tFeatures: N/A",
        "\tSamples: 2x1",
        "\tBiplot Scores: N/A",
        "\tSample constraints: N/A",
        "\tFeature IDs: N/A",
        "\tSample IDs: 0, 1",
    ]
    exp = "\n".join(minimal_lines)

    samples_df = pd.DataFrame(np.array([[1], [2]]))
    minimal = OrdinationResults('PCoA',
                                'Principal Coordinate Analysis',
                                pd.Series(np.array([4.2])),
                                samples_df)
    obs = str(minimal)
    # compared line by line
    self.assertEqual(obs.split('\n'), exp.split('\n'))
def setUp(self):
    """Create the tree, table, metadata and ordination shared by the tests."""
    sample_ids = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
    axis_labels = ['PC1', 'PC2', 'PC3']

    self.tree = self.mock_tree_from_nwk()
    self.bp_tree = from_skbio_treenode(self.tree)

    # written one row per sample, then transposed for biom.Table
    counts_by_sample = np.array([
        [1, 2, 0, 4],
        [8, 7, 0, 5],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
    ])
    self.table = biom.Table(counts_by_sample.T,
                            list('abed'),
                            list(sample_ids))

    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"],
        },
        index=list(self.table.ids()))

    # (These are some Greengenes taxonomy annotations I took from the
    # moving pictures taxonomy.qza file. I made up the confidences.)
    bacteroides_prefix = (
        "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
        "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; ")
    pasteurellaceae = (
        "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
        "o__Pasteurellales; f__Pasteurellaceae; g__; s__")
    self.feature_metadata = pd.DataFrame(
        {
            "Taxonomy": [
                bacteroides_prefix + "s__",
                pasteurellaceae,
                bacteroides_prefix + "s__uniformis",
            ],
            "Confidence": [0.95, 0.8, 0],
        },
        index=["e", "h", "a"])

    self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
    self.tip_md = self.split_tax_fm.loc[["a", "e"]]
    self.int_md = self.split_tax_fm.loc[["h"]]

    # This is designed to match the shearing that's done in the core test
    # for --p-shear-to-table
    self.shorn_tree = parse_newick(
        "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")

    self.exp_split_fm_cols = [
        "Level 1", "Level 2", "Level 3", "Level 4",
        "Level 5", "Level 6", "Level 7", "Confidence",
    ]

    eigvals = pd.Series([0.50, 0.25, 0.25], index=list(axis_labels))
    proportion_explained = pd.Series([15.5, 12.2, 8.8],
                                     index=list(axis_labels))
    samples_df = pd.DataFrame(
        [[0.1, 0.2, 0.3],
         [0.2, 0.3, 0.4],
         [0.3, 0.4, 0.5],
         [0.4, 0.5, 0.6]],
        index=list(sample_ids),
        columns=list(axis_labels))

    self.ordination = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals,
        samples_df,
        proportion_explained=proportion_explained)
def test_book_example_dataset(self):
    """PCoA on ``self.dm`` matches the stored book example and warns."""
    # Adapted from PyCogent's `test_principal_coordinate_analysis`:
    #   "I took the example in the book (see intro info), and did
    #   the principal coordinates analysis, plotted the data and it
    #   looked right".
    n_axes = 14
    n_zero_axes = 8
    sample_ids = [str(number) for number in range(n_axes)]
    axis_labels = ['PC%d' % number for number in range(1, n_axes + 1)]

    # the last eight axes are exactly zero
    eigvals = [0.73599103, 0.26260032, 0.14926222,
               0.06990457, 0.02956972, 0.01931184] + [0.] * n_zero_axes
    proportion_explained = [0.58105792, 0.20732046, 0.1178411,
                            0.05518899, 0.02334502,
                            0.01524651] + [0.] * n_zero_axes

    expected_coordinates = np.loadtxt(get_data_path('exp_PCoAzeros_site'))
    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series(eigvals, index=axis_labels),
        samples=pd.DataFrame(expected_coordinates,
                             index=sample_ids,
                             columns=axis_labels),
        proportion_explained=pd.Series(proportion_explained,
                                       index=axis_labels))

    # the call must emit a RuntimeWarning; its return value is the result
    results = npt.assert_warns(RuntimeWarning, pcoa, self.dm)

    # Note the absolute value because column can have signs swapped
    results.samples = np.abs(results.samples)
    assert_ordination_results_equal(results,
                                    expected_results,
                                    ignore_directionality=True)
def test_scaling1(self):
    """``ca`` with scaling 1 reproduces the reference scores."""
    eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                        index=self.pc_ids)

    # p. 458
    # V
    feature_scores = np.array([
        [1.31871, -0.34374],
        [-0.37215, 1.48150],
        [-0.99972, -0.92612],
    ])
    # F
    sample_scores = np.array([
        [-0.26322, -0.17862],
        [-0.06835, 0.27211],
        [0.51685, -0.09517],
    ])

    features = pd.DataFrame(feature_scores,
                            index=self.feature_ids,
                            columns=self.pc_ids)
    samples = pd.DataFrame(sample_scores,
                           index=self.sample_ids,
                           columns=self.pc_ids)

    exp = OrdinationResults('CA',
                            'Correspondance Analysis',
                            eigvals=eigvals,
                            features=features,
                            samples=samples)

    scores = ca(self.contingency, 1)

    assert_ordination_results_equal(exp,
                                    scores,
                                    decimal=5,
                                    ignore_directionality=True)
def test_simple(self):
    """PCoA on the ``PCoA_sample_data_3`` matrix matches stored results."""
    sample_ids = [
        'PC.636', 'PC.635', 'PC.356',
        'PC.481', 'PC.354', 'PC.593',
        'PC.355', 'PC.607', 'PC.634',
    ]
    axis_labels = ['PC%d' % number for number in range(1, 10)]

    eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207,
         0.20898868, 0.19169895, 0.16054235,
         0.15017696, 0.12245775, 0.0],
        index=axis_labels)
    proportion_explained = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638,
         0.1091402725, 0.1001110485, 0.0838401162,
         0.0784269939, 0.0639511764, 0.0],
        index=axis_labels)
    expected_coordinates = np.loadtxt(
        get_data_path('exp_PCoAEigenResults_site'))

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=pd.DataFrame(expected_coordinates,
                             index=sample_ids,
                             columns=axis_labels),
        proportion_explained=proportion_explained)

    dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    results = pcoa(dm)

    assert_ordination_results_equal(results,
                                    expected_results,
                                    ignore_directionality=True)
def test_scaling2(self):
    """``ca`` with scaling 2 reproduces the reference scores."""
    eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                        index=self.pc_ids)

    # p. 460 L&L 1998
    # F_hat
    feature_scores = np.array([
        [0.40887, -0.06955],
        [-0.11539, 0.29977],
        [-0.30997, -0.18739],
    ])
    # V_hat
    sample_scores = np.array([
        [-0.84896, -0.88276],
        [-0.22046, 1.34482],
        [1.66697, -0.47032],
    ])

    features = pd.DataFrame(feature_scores,
                            index=self.feature_ids,
                            columns=self.pc_ids)
    samples = pd.DataFrame(sample_scores,
                           index=self.sample_ids,
                           columns=self.pc_ids)

    exp = OrdinationResults('CA',
                            'Correspondance Analysis',
                            eigvals=eigvals,
                            features=features,
                            samples=samples)

    scores = ca(self.contingency, 2)

    assert_ordination_results_equal(exp,
                                    scores,
                                    decimal=5,
                                    ignore_directionality=True)
def test_scaling2(self):
    """``rda`` with scaling 2 matches the stored reference results.

    Each expected table is loaded exactly once. The earlier version built
    ``mat``, ``cropped_pc_ids``, ``biplot_scores`` and
    ``sample_constraints`` twice and discarded the first copies.
    """
    scores = rda(self.Y, self.X, scaling=2)

    # Load data as computed with vegan 2.0-8
    vegan_features = pd.DataFrame(
        np.loadtxt(get_data_path('example2_species_scaling2_from_vegan')),
        index=self.feature_ids,
        columns=self.pc_ids)

    vegan_samples = pd.DataFrame(
        np.loadtxt(get_data_path('example2_site_scaling2_from_vegan')),
        index=self.sample_ids,
        columns=self.pc_ids)

    sample_constraints = pd.DataFrame(
        np.loadtxt(get_data_path('example2_sample_constraints_scaling2')),
        index=self.sample_ids,
        columns=self.pc_ids)

    # the biplot file may hold fewer axes than ``self.pc_ids``, so the
    # labels are cropped to the number of columns actually loaded
    mat = np.loadtxt(get_data_path('example2_biplot_scaling2'))
    cropped_pc_ids = self.pc_ids[:mat.shape[1]]
    biplot_scores = pd.DataFrame(mat,
                                 index=self.env_ids,
                                 columns=cropped_pc_ids)

    proportion_explained = pd.Series(
        [0.44275783, 0.25614586, 0.15280354, 0.10497021,
         0.02873375, 0.00987052, 0.00471828],
        index=self.pc_ids)

    eigvals = pd.Series(
        [25.897954, 14.982578, 8.937841, 6.139956,
         1.680705, 0.577350, 0.275984],
        index=self.pc_ids)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=sample_constraints,
        biplot_scores=biplot_scores,
        proportion_explained=proportion_explained,
        eigvals=eigvals)

    assert_ordination_results_equal(scores, exp,
                                    ignore_directionality=True,
                                    decimal=6)
def setUp(self):
    """Create CA results, a mixed-type data frame and minimal PCoA results."""
    # Define in-memory CA results to serialize and deserialize.
    ca_axes = ['CA1', 'CA2']
    ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)

    features_df = pd.DataFrame(
        np.array([[0.408869425742, 0.0695518116298],
                  [-0.1153860437, -0.299767683538],
                  [-0.309967102571, 0.187391917117]]),
        index=['Species1', 'Species2', 'Species3'],
        columns=['CA1', 'CA2'])
    ca_samples_df = pd.DataFrame(
        np.array([[-0.848956053187, 0.882764759014],
                  [-0.220458650578, -1.34482000302],
                  [1.66697179591, 0.470324389808]]),
        index=['Site1', 'Site2', 'Site3'],
        columns=['CA1', 'CA2'])

    self.ordination_results = OrdinationResults(
        'CA',
        'Correspondance Analysis',
        eigvals=ca_eigvals,
        samples=ca_samples_df,
        features=features_df)

    # DataFrame for testing plot method. Has a categorical column with a
    # mix of numbers and strings. Has a numeric column with a mix of ints,
    # floats, and strings that can be converted to floats. Has a numeric
    # column with missing data (np.nan).
    plot_rows = [
        ['foo', '42', 10],
        [22, 0, 8],
        [22, -4.2, np.nan],
        ['foo', '42.19', 11],
    ]
    self.df = pd.DataFrame(plot_rows,
                           index=['A', 'B', 'C', 'D'],
                           columns=['categorical', 'numeric', 'nancolumn'])

    # Minimal ordination results for easier testing of plotting method.
    # Paired with df above.
    min_eigvals = np.array([0.50, 0.25, 0.25])
    min_samples_df = pd.DataFrame(
        np.array([[0.1, 0.2, 0.3],
                  [0.2, 0.3, 0.4],
                  [0.3, 0.4, 0.5],
                  [0.4, 0.5, 0.6]]),
        ['A', 'B', 'C', 'D'],
        ['PC1', 'PC2', 'PC3'])
    self.min_ord_results = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        min_eigvals,
        min_samples_df)
def apca(df):
    """Performs Aitchison PCA on a feature table.

    Parameters
    ----------
    df: pd.DataFrame
        A numeric DataFrame whose rows are "features" and whose columns are
        "samples."

    Returns
    -------
    OrdinationResults
        A single result object (not a tuple) with:

        - ``features``: pd.DataFrame of feature loadings (U), indexed like
          ``df.index`` with the index named "FeatureID".
        - ``samples``: pd.DataFrame of sample loadings (V), indexed like
          ``df.columns`` with the index named "SampleID".
        - ``eigvals``: pd.Series holding the singular values.
        - ``proportion_explained``: pd.Series of squared singular values
          divided by their sum.

        All axes are labelled "Axis 1", "Axis 2", ...
    """
    # do A-PCA; only the two leading singular triplets are requested
    U, s, V = svds(clr(df), k=2)
    V = V.T

    # reverse (see SVDs docs)
    U = np.flip(U, axis=1)
    V = np.flip(V, axis=1)
    s = s[::-1]

    # Rename columns; we use "Axis 1", etc. to be consistent with the Qurro
    # interface
    pcs = min(V.shape)
    cols = ["Axis {}".format(pc + 1) for pc in range(pcs)]

    # Make DataFrames from the feature (U) and sample (V) loadings
    U = pd.DataFrame(U[:, :pcs], df.index, cols)
    V = pd.DataFrame(V[:, :pcs], df.columns, cols)

    # For clarity, rename top-left cell in both loading DataFrames
    U.index.name = "FeatureID"
    V.index.name = "SampleID"

    # get prop. var. explained (``s`` is 1-D, so no transpose is needed)
    p = pd.Series(s**2 / np.sum(s**2), index=cols)

    # format eigenvalues in a way that OrdinationResults expects
    eigvals = pd.Series(s, index=cols)

    return OrdinationResults("apca", "Aitchison PCA", eigvals,
                             samples=V, features=U,
                             proportion_explained=p)
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """Runs RPCA with an rclr preprocessing step.

    Parameters
    ----------
    in_biom : str
        Path handed to ``load_table``.
    output_dir : str
        Directory that receives ``rclr_OTUtable.txt``,
        ``RPCA_Ordination.txt`` and ``RPCA_distance.txt``.
    min_sample_depth : int
        Samples are kept only when their total count is strictly greater
        than this value.
    rank : int
        Rank passed to ``OptSpace`` for the ordination; also the number of
        ``PC`` axes that get labelled.

    Returns
    -------
    None
        All results are written to ``output_dir``.
    """
    # import table
    table = load_table(in_biom)

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_depth

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()

    # rclr for saving the transformed OTU table (RSC edited)
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T,
                            index=table.columns,
                            columns=table.index)
    # ``to_csv`` opens and closes the path on its own, so the file is no
    # longer opened a second time around this call.
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))

    # positional column i-1 becomes 'PC<i>'
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}
    pc_labels = list(rename_cols.values())

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=pc_labels)
    eigvals = pd.Series(opt.eigenvalues, index=pc_labels)

    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    """Build a biplot from regression coefficients via an SVD.

    The coefficients are passed through ``clr_inv``, ``centralize`` and
    ``clr`` and then decomposed with ``np.linalg.svd``.

    Parameters
    ----------
    coefficients : pd.DataFrame
        Coefficient table. Its index labels the ``samples`` of the result
        and its columns label the ``features``.

    Returns
    -------
    skbio.OrdinationResults
        ``samples`` are the left singular vectors scaled by the singular
        values, ``features`` are the right singular vectors, ``eigvals``
        are the singular values. ``proportion_explained`` is each squared
        singular value divided by the sum of squared singular values. Axes
        are labelled ``PC0``, ``PC1``, ...
    """
    coefs = clr(centralize(clr_inv(coefficients)))
    u, s, v = np.linalg.svd(coefs)
    n_axes = len(s)

    # NOTE: labels start at PC0; kept as-is so existing outputs keep their
    # axis names.
    pc_ids = ['PC%d' % i for i in range(n_axes)]

    samples = pd.DataFrame(u[:, :n_axes] @ np.diag(s),
                           columns=pc_ids, index=coefficients.index)
    features = pd.DataFrame(v.T[:, :n_axes],
                            columns=pc_ids, index=coefficients.columns)

    short_method_name = 'regression_biplot'
    long_method_name = 'Multinomial regression biplot'
    eigvals = pd.Series(s, index=pc_ids)

    # Variance along an axis scales with the squared singular value, so the
    # proportions are taken over s**2 rather than over s.
    squared = eigvals ** 2
    proportion_explained = squared / squared.sum()

    res = OrdinationResults(short_method_name, long_method_name, eigvals,
                            samples=samples, features=features,
                            proportion_explained=proportion_explained)
    return res
def test_scaling2(self):
    """``cca`` with scaling 2 matches the stored reference results."""
    scores = cca(self.Y, self.X, scaling=2)

    def _load(name):
        # every expected table lives in the test data directory
        return np.loadtxt(get_data_path(name))

    # Load data as computed with vegan 2.0-8
    vegan_features = pd.DataFrame(
        _load('example3_species_scaling2_from_vegan'),
        index=self.feature_ids,
        columns=self.pc_ids)
    vegan_samples = pd.DataFrame(
        _load('example3_site_scaling2_from_vegan'),
        index=self.sample_ids,
        columns=self.pc_ids)
    sample_constraints = pd.DataFrame(
        _load('example3_sample_constraints_scaling2'),
        index=self.sample_ids,
        columns=self.pc_ids)

    # axis labels are cropped to the number of biplot columns loaded
    mat = _load('example3_biplot_scaling2')
    biplot_scores = pd.DataFrame(mat,
                                 index=self.env_ids,
                                 columns=self.pc_ids[:mat.shape[1]])

    proportion_explained = pd.Series(
        [0.466911, 0.238327, 0.100548,
         0.104937, 0.044805, 0.029747,
         0.012631, 0.001562, 0.000532],
        index=self.pc_ids)
    eigvals = pd.Series(
        [0.366136, 0.186888, 0.078847,
         0.082288, 0.035135, 0.023327,
         0.009905, 0.001225, 0.000417],
        index=self.pc_ids)

    exp = OrdinationResults(
        'CCA',
        'Canonical Correspondence Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=sample_constraints,
        biplot_scores=biplot_scores,
        proportion_explained=proportion_explained,
        eigvals=eigvals)

    assert_ordination_results_equal(scores, exp, decimal=6)
def test_extensive(self):
    """PCoA gives the same result for an ndarray and a DistanceMatrix."""
    n_samples = 6
    sample_ids = [str(number) for number in range(n_samples)]
    axis_labels = ['PC%d' % number for number in range(1, n_samples + 1)]

    eigvals = pd.Series(
        [0.3984635, 0.36405689, 0.28804535,
         0.27479983, 0.19165361, 0.0],
        index=axis_labels)
    proportion_explained = pd.Series(
        [0.2626621381, 0.2399817314, 0.1898758748,
         0.1811445992, 0.1263356565, 0.0],
        index=axis_labels)
    coordinates = [
        [-0.028597, 0.22903853, 0.07055272,
         0.26163576, 0.28398669, 0.0],
        [0.37494056, 0.22334055, -0.20892914,
         0.05057395, -0.18710366, 0.0],
        [-0.33517593, -0.23855979, -0.3099887,
         0.11521787, -0.05021553, 0.0],
        [0.25412394, -0.4123464, 0.23343642,
         0.06403168, -0.00482608, 0.0],
        [-0.28256844, 0.18606911, 0.28875631,
         -0.06455635, -0.21141632, 0.0],
        [0.01727687, 0.012458, -0.07382761,
         -0.42690292, 0.1695749, 0.0],
    ]

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=pd.DataFrame(coordinates,
                             index=sample_ids,
                             columns=axis_labels),
        proportion_explained=proportion_explained)

    data = np.loadtxt(get_data_path('PCoA_sample_data_2'))

    # test passing a numpy.ndarray and a DistanceMatrix to pcoa
    # gives same results
    candidates = (data, DistanceMatrix(data))
    for dm in candidates:
        results = pcoa(dm)
        assert_ordination_results_equal(results,
                                        expected_results,
                                        ignore_directionality=True)
def ilr_phylogenetic_ordination(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None
) -> (OrdinationResults, skbio.TreeNode, pd.DataFrame):
    """Compute balances on a tree and package them as an ordination.

    Parameters
    ----------
    table : pd.DataFrame
        Table matched against the tips of ``tree`` by ``match_tips``.
    tree : skbio.TreeNode
        Tree to use; it is copied and bifurcated, the input is not modified.
    pseudocount : float, optional
        Pseudocount used before taking logs. It is honoured in both
        branches; the explicit-``clades`` branch used to ignore it and
        always use 0.5.
    top_k_var : int, optional
        When ``clades`` is not given, the number of highest-variance
        balances to keep.
    clades : list, optional
        When given, its first element is split on ``','`` to obtain the
        clade names handed to ``_fast_ilr``.

    Returns
    -------
    (OrdinationResults, skbio.TreeNode, pd.DataFrame)
        The balances wrapped as an ordination, the matched and renamed
        tree, and the basis (index named ``'featureid'``).
    """
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)

    if not clades:
        # compute every balance, then keep the most variable ones
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        # forward the caller's pseudocount, not a fixed 0.5
        balances, basis = _fast_ilr(_tree, _table, clades,
                                    pseudocount=pseudocount)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'

    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    balances = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)
    basis.index.name = 'featureid'
    return balances, _tree, basis
def paired_omics(
        microbes: biom.Table,
        metabolites: biom.Table,
        metadata: Metadata = None,
        training_column: str = None,
        num_testing_examples: int = 5,
        min_feature_count: int = 10,
        epochs: int = 100,
        batch_size: int = 50,
        latent_dim: int = 3,
        input_prior: float = 1,
        output_prior: float = 1,
        learning_rate: float = 0.001,
        summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):
    """Fit an ``MMvec`` model and return the ranks plus an SVD biplot.

    The tables are split with ``split_tables``, the model is fitted on the
    training part, and a doubly centred rank matrix is built from the
    fitted factors. The biplot comes from a truncated SVD of that matrix.

    Returns
    -------
    (pd.DataFrame, OrdinationResults)
        The centred ranks (indexed by the training microbe columns) and the
        biplot, whose axes are labelled ``PC0``, ``PC1``, ...
    """
    # NOTE(review): ``batch_size`` is accepted but never used in this body.
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(
            latent_dim=latent_dim,
            u_scale=input_prior,
            v_scale=output_prior,
            learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V

        # augment the factors with a column/row of ones and the biases
        ones_column = np.ones((model.U.shape[0], 1))
        ones_row = np.ones((1, model.V.shape[1]))
        U_ = np.hstack((ones_column, model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack((model.Vbias.reshape(1, -1), ones_row, V))

        # a column of zeros is prepended before centring
        zero_column = np.zeros((model.U.shape[0], 1))
        ranks = pd.DataFrame(
            np.hstack((zero_column, U_ @ V_)),
            index=train_microbes_df.columns,
            columns=train_metabolites_df.columns)

        # centre across each row, then across each column
        row_means = ranks.mean(axis=1).values.reshape(-1, 1)
        ranks = ranks - row_means
        ranks = ranks - ranks.mean(axis=0)

        # reverse the order in which svds returns the triplets
        u, s, v = svds(ranks, k=latent_dim)
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]

        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(
            microbe_embed, columns=pc_ids,
            index=train_microbes_df.columns)
        samples = pd.DataFrame(
            metabolite_embed, columns=pc_ids,
            index=train_metabolites_df.columns)

        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)

        biplot = OrdinationResults(
            'mmvec biplot',
            'Multiomics mmvec biplot',
            eigvals,
            samples=samples,
            features=features,
            proportion_explained=proportion_explained)

        return ranks, biplot
def scatterplot(df, x=None, y=None, z=None, remote=True):
    """Create an Emperor scatter plot from a Pandas DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        Data to display. The numeric columns are used as coordinates and
        the whole frame is used as metadata.
    x, y, z : str, optional
        Names of numeric columns in `df` to place first (``x``), second
        (``y``) and third (``z``). Columns that are not pinned this way are
        ordered by decreasing variance.
    remote : bool, optional
        Forwarded to ``Emperor``. Defaults to ``True``.

    Returns
    -------
    emperor.core.Emperor
        Emperor object built from the numeric data (as an ordination) and
        the matching rows of `df`.

    Raises
    ------
    ValueError
        If `df` is not a Pandas DataFrame.
        If `x`, `y` or `z` are missing from `df` or are not numeric
        columns.
        If `df` has fewer than 3 numeric columns.

    Notes
    -----
    Rows with a missing value in any numeric column are removed from the
    visualization.

    See Also
    --------
    emperor.core.Emperor
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The argument is not a Pandas DataFrame")

    # validate the requested axes, checked in z, y, x order
    requested = [name for name in (z, y, x) if name is not None]
    for name in requested:
        if name not in df.columns:
            raise ValueError("'%s' is not a column in the DataFrame" % name)
        if not np.issubdtype(df[name].dtype, np.number):
            raise ValueError("'%s' is not a numeric column" % name)

    # keep numeric columns only and remove NAs
    samples = df.select_dtypes(include=[np.number]).dropna(axis=0, how='any')

    if len(samples.columns) < 3:
        raise ValueError("Not enough data to plot")

    # sort columns by variance
    variance = samples.var().sort_values(ascending=False)
    samples = samples[variance.index]

    # re-order x, y and z: each one is moved to the front in turn, so x
    # ends up first, then y, then z
    ordered = samples.columns.tolist()
    for name in requested:
        ordered.remove(name)
        ordered.insert(0, name)
    samples = samples[ordered]

    # match up the metadata and coordinates
    df = df.loc[samples.index]

    ores = OrdinationResults(short_method_name='',
                             long_method_name='',
                             eigvals=np.zeros_like(samples.columns),
                             samples=samples,
                             proportion_explained=variance)

    df.index.name = '#SampleID'

    # HACK: scale the position of the samples to fit better within the screen
    column_maxima = ores.samples.max(axis=0)
    ores.samples = ores.samples / column_maxima

    n_dimensions = len(ores.samples.columns)
    return Emperor(ores, df, dimensions=n_dimensions, remote=remote)
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]
                ),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (
                    pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults
                ):
    """Fit a multinomial regression and return its outputs.

    The table and metadata are matched and filtered, split into training
    and testing parts, and a ``MultRegression`` model is fitted in a fresh
    TensorFlow graph and session.

    Returns
    -------
    (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults)
        - the differentials: features as rows (index named
          ``'featureid'``), design-matrix columns as columns, centred
          across features after prepending a zero column;
        - the convergence statistics (``loss``, ``cross-validation``,
          ``iteration``) wrapped as ``qiime2.Metadata``;
        - a biplot from an SVD of the differentials, or an empty
          ``OrdinationResults`` when there is only one differential column.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(
        table, metadata,
        formula, min_sample_count, min_feature_count
    )

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed,
    )

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # prepend a zero column, then centre each row
    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    differentials.index.name = 'featureid'

    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )

    convergence_stats.index.name = 'id'
    # The builtins replace ``np.str`` / ``np.float`` / ``np.int``: those
    # were plain aliases of the builtins and no longer exist in NumPy 1.24+.
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loss'].astype(float)
    convergence_stats['loss'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
# Random coordinates on 10 axes for N samples, with a decaying
# "percent explained" vector that is normalised to sum to one.
coords = (np.random.randn(N, 10) * 1000).tolist()
pct_var = pd.Series(1/np.exp(np.arange(10)))
pct_var = pct_var / pct_var.sum()

md_headers = ['SampleID', 'DOB', 'Strings']

# random.sample() raises TypeError for a set on Python >= 3.11, so the
# unique categories are turned into a sequence first.
category_pool = tuple(set(categories))

# one metadata row per sample: its ID, one random category and a random
# ten-letter string
metadata = [
    [_id,
     ''.join(sample(category_pool, 1)),
     ''.join(choice(ascii_letters) for x in range(10))]
    for _id in coords_ids
]

samples = pd.DataFrame(index=coords_ids, data=coords)

mf = pd.DataFrame(data=metadata, columns=md_headers)
mf.set_index('SampleID', inplace=True)

# assign every sample to a randomly chosen "subject"
minerals = ['rhodium', 'platinum', 'gold', 'ruthenium']
mf['subject'] = np.random.randint(low=0, high=len(minerals), size=N)
mf['subject'] = mf['subject'].apply(lambda x: minerals[x])

res = OrdinationResults(short_method_name='PC',
                        long_method_name='Principal '
                        'Coordinates Analysis',
                        eigvals=pct_var,
                        samples=samples,
                        proportion_explained=pct_var)

viz = Emperor(res, mf, remote=get_emperor_support_files_dir())

# write the standalone visualization to the working directory
with open('new-emperor.html', 'w') as f:
    f.write(viz.make_emperor(standalone=True))
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_MAXITER,
    max_iterations_rptm: int = DEFAULT_MAXITER,
    n_initializations: int = DEFAULT_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """Run Compositional Tensor Factorization (CTF).

    Parameters
    ----------
    table : biom.Table
        Feature table.  It is filtered *in place* to the samples/features
        shared with the metadata and to the minimum-count thresholds.
    sample_metadata : DataFrame
        Sample metadata, or any object exposing ``to_dataframe()``.  Must
        contain `individual_id_column` and every entry of `state_columns`.
    individual_id_column : str
        Metadata column passed to the tensor builder as the individual id.
    state_columns : list
        Metadata columns passed to the tensor builder as the states.
    n_components : int
        Number of components handed to ``TensorFactorization``.  When it is
        exactly 2, an all-zero ``PC3`` axis is appended to the outputs.
    min_sample_count, min_feature_count : int
        Samples / features whose table sum is below the threshold are
        removed.
    max_iterations_als, max_iterations_rptm, n_initializations : int
        Forwarded to ``TensorFactorization`` as ``max_als_iterations``,
        ``max_rtpm_iterations`` and ``n_initializations``.
    feature_metadata : DataFrame, optional
        Feature metadata, or any object exposing ``to_dataframe()``; used
        to restrict the table's features and as ``taxonomy`` when labelling.

    Returns
    -------
    tuple
        ``(state_ordn, subj_ordin, distances, subject_trajectories,
        feature_trajectories)``.  ``subj_ordin`` is an OrdinationResults;
        the other four are dicts keyed by condition.
        NOTE(review): the return annotation lists four items but five are
        returned; it is left untouched to avoid changing the signature.

    Raises
    ------
    ValueError
        If the table holds duplicate sample or feature IDs, or if no
        samples (or features) are shared between table and metadata.
    """
    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    # everything that is not a state / individual column is merged back
    # into the subject trajectories at the end
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]

    # validate the metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()

    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)

    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left.  Check to make "
                              "sure that the feature names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        # pandas needs an ordered collection here, not a set
        feature_metadata = feature_metadata.reindex(list(fidx))

    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more samples left.  Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(list(sidx))

    # filter and import table
    for axis, min_sum in (('sample', min_sample_count),
                          ('observation', min_feature_count)):
        keep_ids = table.ids(axis)[table.sum(axis) >= min_sum]
        table = table.filter(keep_ids, axis=axis, inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(),
                      table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata,
                     individual_id_column, state_columns)

    # factorize
    TF = TensorFactorization(
        n_components=n_components,
        max_als_iterations=max_iterations_als,
        max_rtpm_iterations=max_iterations_rptm,
        n_initializations=n_initializations).fit(rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)

    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = {ind: ind_i for ind_i, ind in enumerate(ids)}
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted(ind_dict[ind] for ind in inter)
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])

        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(
            short_method_name,
            long_method_name,
            TF.eigvals,
            samples=cond[keep_PC].dropna(axis=0),
            features=TF.features[keep_PC].dropna(axis=0),
            proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond

        # add the sample metadata before returning output
        # additionally only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        # Pick the PC axes *before* merging, so that a metadata column whose
        # name happens to contain "PC" is neither mean-centred nor looked up
        # in the feature trajectory.
        keep_PC_traj = [col for col in pre_merge_cols if 'PC' in col]
        straj = concat(
            [straj.reindex(all_sample_metadata.index), all_sample_metadata],
            axis=1, sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"

        # save traj. (mean-centre every PC axis)
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj

    return (state_ordn, subj_ordin, distances,
            subject_trajectories, feature_trajectories)
def paired_omics(microbes: biom.Table,
                 metabolites: biom.Table,
                 metadata: Metadata = None,
                 training_column: str = None,
                 num_testing_examples: int = 5,
                 min_feature_count: int = 10,
                 epochs: int = 100,
                 batch_size: int = 50,
                 latent_dim: int = 3,
                 input_prior: float = 1,
                 output_prior: float = 1,
                 learning_rate: float = 1e-3,
                 equalize_biplot: float = False,
                 arm_the_gpu: bool = False,
                 summary_interval: int = 60) -> (
                     pd.DataFrame, OrdinationResults, qiime2.Metadata
                 ):
    """Fit an MMvec model on paired tables and summarise it as a biplot.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        The two paired tables handed to ``split_tables``.
    metadata : Metadata, optional
        Converted with ``to_dataframe()`` and forwarded to ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs, summary_interval : int
        Forwarded to ``model.fit`` (``epochs`` as ``epoch``).
    batch_size, learning_rate
        Forwarded to the ``MMvec`` constructor.
    latent_dim : int
        Latent dimension of the model and rank ``k`` of the SVD of the
        centred ranks; a non-positive value falls back to ``k=1`` for the
        SVD.
    input_prior, output_prior : float
        Forwarded to ``MMvec`` as ``u_scale`` and ``v_scale``.
    equalize_biplot : bool
        If true, both embeddings are scaled by the square root of the
        singular values; otherwise only the microbe embedding is scaled
        (by the singular values themselves).
    arm_the_gpu : bool
        Use ``'/device:GPU:0'`` instead of ``'/cpu:0'``.

    Returns
    -------
    tuple
        ``(ranks, biplot, convergence_stats)``: the transposed rank table
        (index named ``'featureid'``), an OrdinationResults biplot, and a
        ``qiime2.Metadata`` holding loss, cross-validation and iteration.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # pick out the first GPU when requested, otherwise stay on the CPU
    device_name = '/device:GPU:0' if arm_the_gpu else '/cpu:0'

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(
            latent_dim=latent_dim,
            u_scale=input_prior, v_scale=output_prior,
            batch_size=batch_size,
            device_name=device_name,
            learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)
        ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)
        if latent_dim > 0:
            u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim)
        else:
            # fake it until you make it
            u, s, v = svds(ranks - ranks.mean(axis=0), k=1)

        ranks = ranks.T
        ranks.index.name = 'featureid'
        # reverse the order in which svds returned the components
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]
        if equalize_biplot:
            microbe_embed = u @ np.sqrt(np.diag(s))
            metabolite_embed = v.T @ np.sqrt(np.diag(s))
        else:
            microbe_embed = u @ np.diag(s)
            metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(
            microbe_embed, columns=pc_ids,
            index=train_microbes_df.columns)
        samples = pd.DataFrame(
            metabolite_embed, columns=pc_ids,
            index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)

        its = np.arange(len(loss))
        convergence_stats = pd.DataFrame(
            {
                'loss': loss,
                'cross-validation': cv,
                'iteration': its
            }
        )

        convergence_stats.index.name = 'id'
        # The np.str / np.float / np.int aliases were removed in NumPy 1.24;
        # they were plain aliases of the builtins, so the dtypes are the same.
        convergence_stats.index = convergence_stats.index.astype(str)

        c = convergence_stats['loss'].astype(float)
        convergence_stats['loss'] = c

        c = convergence_stats['cross-validation'].astype(float)
        convergence_stats['cross-validation'] = c

        c = convergence_stats['iteration'].astype(int)
        convergence_stats['iteration'] = c

        return ranks, biplot, qiime2.Metadata(convergence_stats)
def setUp(self):
    """Build the in-memory OrdinationResults expected for each valid file.

    One object is created per file in ``self.valid_fps`` (CA, CCA, PCoA and
    RDA, in that order) and the list is stored on
    ``self.ordination_results_objs``.
    """
    super(OrdinationResultsReaderWriterTests, self).setUp()

    def load_frame(data_name, row_labels, col_labels):
        # read a whitespace-delimited matrix from the test data directory
        # and label its rows/columns
        return pd.DataFrame(np.loadtxt(get_data_path(data_name)),
                            index=row_labels, columns=col_labels)

    # ------------------------------------------------------------------ CA
    ca_axes = ['CA1', 'CA2']
    ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)
    ca_species = pd.DataFrame(
        [[0.408869425742, 0.0695518116298],
         [-0.1153860437, -0.299767683538],
         [-0.309967102571, 0.187391917117]],
        index=['Species1', 'Species2', 'Species3'],
        columns=ca_axes)
    ca_site = pd.DataFrame(
        [[-0.848956053187, 0.882764759014],
         [-0.220458650578, -1.34482000302],
         [1.66697179591, 0.470324389808]],
        index=['Site1', 'Site2', 'Site3'],
        columns=ca_axes)
    ca_scores = OrdinationResults('CA', 'Correspondence Analysis',
                                  eigvals=ca_eigvals,
                                  features=ca_species,
                                  samples=ca_site,
                                  biplot_scores=None,
                                  sample_constraints=None,
                                  proportion_explained=None)

    # ----------------------------------------------------------------- CCA
    cca_axes = ['CCA%d' % i for i in range(1, 10)]
    cca_species_ids = ['Species%d' % i for i in range(9)]
    cca_site_ids = ['Site%d' % i for i in range(10)]
    cca_eigvals = pd.Series([
        0.366135830393, 0.186887643052, 0.0788466514249,
        0.082287840501, 0.0351348475787, 0.0233265839374,
        0.0099048981912, 0.00122461669234, 0.000417454724117
    ], cca_axes)
    cca_species = load_frame('ordination_exp_Ordination_CCA_species',
                             cca_species_ids, cca_axes)
    cca_site = load_frame('ordination_exp_Ordination_CCA_site',
                          cca_site_ids, cca_axes)
    cca_biplot = pd.DataFrame(
        [[-0.169746767979, 0.63069090084, 0.760769036049],
         [-0.994016563505, 0.0609533148724, -0.0449369418179],
         [0.184352565909, -0.974867543612, 0.0309865007541]],
        columns=cca_axes[:3])
    cca_constraints = load_frame(
        'ordination_exp_Ordination_CCA_site_constraints',
        cca_site_ids, cca_axes)
    cca_scores = OrdinationResults('CCA',
                                   'Canonical Correspondence Analysis',
                                   eigvals=cca_eigvals,
                                   features=cca_species,
                                   samples=cca_site,
                                   biplot_scores=cca_biplot,
                                   sample_constraints=cca_constraints,
                                   proportion_explained=None)

    # ---------------------------------------------------------------- PCoA
    pcoa_axes = ['PC%d' % i for i in range(1, 10)]
    pcoa_site_ids = [
        'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
        'PC.593', 'PC.355', 'PC.607', 'PC.634'
    ]
    pcoa_eigvals = pd.Series([
        0.512367260461, 0.300719094427, 0.267912066004,
        0.208988681078, 0.19169895326, 0.16054234528,
        0.15017695712, 0.122457748167, 0.0
    ], pcoa_axes)
    pcoa_site = load_frame('ordination_exp_Ordination_PCoA_site',
                           pcoa_site_ids, pcoa_axes)
    pcoa_prop_explained = pd.Series([
        0.267573832777, 0.15704469605, 0.139911863774,
        0.109140272454, 0.100111048503, 0.0838401161912,
        0.0784269939011, 0.0639511763509, 0.0
    ], pcoa_axes)
    pcoa_scores = OrdinationResults('PCoA',
                                    'Principal Coordinate Analysis',
                                    eigvals=pcoa_eigvals,
                                    features=None,
                                    samples=pcoa_site,
                                    biplot_scores=None,
                                    sample_constraints=None,
                                    proportion_explained=pcoa_prop_explained)

    # ----------------------------------------------------------------- RDA
    rda_axes = ['RDA%d' % i for i in range(1, 8)]
    rda_species_ids = ['Species%d' % i for i in range(6)]
    rda_site_ids = ['Site%d' % i for i in range(10)]
    rda_eigvals = pd.Series([
        25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
        1.68070536498, 0.57735026919, 0.275983624351
    ], rda_axes)
    rda_species = load_frame('ordination_exp_Ordination_RDA_species',
                             rda_species_ids, rda_axes)
    rda_site = load_frame('ordination_exp_Ordination_RDA_site',
                          rda_site_ids, rda_axes)
    rda_biplot = pd.DataFrame(
        [[0.422650019179, -0.559142585857, -0.713250678211],
         [0.988495963777, 0.150787422017, -0.0117848614073],
         [-0.556516618887, 0.817599992718, 0.147714267459],
         [-0.404079676685, -0.9058434809, -0.127150316558]],
        columns=rda_axes[:3])
    rda_constraints = load_frame(
        'ordination_exp_Ordination_RDA_site_constraints',
        rda_site_ids, rda_axes)
    rda_scores = OrdinationResults('RDA', 'Redundancy Analysis',
                                   eigvals=rda_eigvals,
                                   features=rda_species,
                                   samples=rda_site,
                                   biplot_scores=rda_biplot,
                                   sample_constraints=rda_constraints,
                                   proportion_explained=None)

    self.ordination_results_objs = [
        ca_scores, cca_scores, pcoa_scores, rda_scores
    ]
def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
    r"""Perform Principal Coordinate Analysis using a given algorithm.

    Adapted from scikit-bio.

    Principal Coordinate Analysis (PCoA) is a method similar to PCA
    that works from distance matrices, and so it can be used with
    ecologically meaningful distances like UniFrac for bacteria.

    In ecology, the euclidean distance preserved by Principal
    Component Analysis (PCA) is often not a good choice because it
    deals poorly with double zeros (Species have unimodal
    distributions along environmental gradients, so if a species is
    absent from two sites at the same site, it can't be known if an
    environmental variable is too high in one of them and too low in
    the other, or too low in both, etc. On the other hand, if an
    species is present in two sites, that means that the sites are
    similar.).

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        A distance matrix.  Anything that is not already a
        ``DistanceMatrix`` is passed to the ``DistanceMatrix`` constructor.
    algorithm : Algorithm
        Algorithm to use to decompose matrix into eigenvectors and
        eigenvalues.  Its ``run(F_matrix, num_dimensions_out)`` must return
        ``(eigenvectors, eigenvalues)`` with one eigenvector per column.
    num_dimensions_out : int, optional
        k number of dimensions to return: selects k eigenvectors
        corresponding to the k largest eigenvalues.

    Returns
    -------
    OrdinationResults
        Object that stores the PCoA results, including eigenvalues,
        the proportion explained by each of them, and transformed
        sample coordinates.  If the algorithm returns only NaN
        eigenvalues, the normalized eigenvectors are returned as the
        sample coordinates and no proportion explained is set.

    Raises
    ------
    ValueError
        If `algorithm` is not an instance of ``Algorithm``.

    See Also
    --------
    OrdinationResults

    Notes
    -----
    It is sometimes known as metric multidimensional scaling or
    classical scaling.

    .. note::

       If the distance is not euclidean (for example if it is a
       semimetric and the triangle inequality doesn't hold),
       negative eigenvalues can appear. There are different ways
       to deal with that problem (see Legendre & Legendre 1998, \S
       9.2.3), but none are currently implemented here.
       However, a warning is raised whenever negative eigenvalues
       appear, allowing the user to decide if they can be safely
       ignored.
    """
    # isinstance() is already False for None, so one check covers both cases
    if not isinstance(algorithm, Algorithm):
        raise ValueError('Must specify algorithm and ensure it is a subclass'
                         ' of Algorithm.')

    # If distance_matrix is a raw numpy array representing a matrix, then
    # coerce it to scikitbio DistanceMatrix object
    if not isinstance(distance_matrix, DistanceMatrix):
        distance_matrix = DistanceMatrix(distance_matrix)

    # Implemented as per algorithm outlined in
    # Numerical Ecology (Legendre & Legendre 1998)
    # See Chapter 9, Equation 9.20
    E_matrix = e_matrix(distance_matrix.data)

    # FYI: If the used distance was euclidean, pairwise distances
    # needn't be computed from the data table Y because F_matrix =
    # Y.dot(Y.T) (if Y has been centred).
    # But since we're expecting distance_matrix to be non-euclidian,
    # we do the following computation as per
    # Numerical Ecology (Legendre & Legendre 1998)
    # See Chapter 9, Equation 9.21
    # ... which centers the matrix (a requirement for PCoA)
    F_matrix = f_matrix(E_matrix)

    # Run the given algorithm that decomposes the matrix into eigenvectors
    # and eigenvalues.
    eigenvectors, eigenvalues = algorithm.run(F_matrix, num_dimensions_out)

    # Coerce to numpy array just in case
    eigenvectors = np.array(eigenvectors)
    eigenvalues = np.array(eigenvalues)

    # Ensure eigenvectors are normalized.  Every later step (sorting with
    # ``[:, order]``, scaling by sqrt(eigenvalue), the samples DataFrame)
    # treats *columns* as eigenvectors, so unit length must be enforced per
    # column (axis=0), not per row.
    eigenvectors = eigenvectors / np.linalg.norm(eigenvectors, axis=0)

    # Generate axis labels for output: one label per eigenvector (column),
    # which may be fewer than the number of samples (rows).
    axis_labels = ['PC%d' % i for i in range(1, eigenvectors.shape[1] + 1)]

    # Some algorithms do not return eigenvalues. Thus, we cannot compute
    # the array of proportion of variance explained and we cannot sort the
    # eigenvectors by their corresponding eigenvalues.
    if np.all(np.isnan(eigenvalues)):
        # Only return an OrdinationResults object wrapping the result's
        # eigenvectors. Leave the eigenvalues as NaNs.
        # TODO: Nystrom and SCMDS do not return
        # num_dimensions_out number of eigenvectors
        # Figure out if we need to throw away eigenvectors here
        # or if that's the intended behavior.
        return OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            samples=pd.DataFrame(eigenvectors,
                                 index=distance_matrix.ids,
                                 columns=axis_labels),
            eigvals=pd.Series(eigenvalues))

    # cogent makes eigenvalues positive by taking the
    # abs value, but that doesn't seem to be an approach accepted
    # by Legendre & Legendre to deal with negative eigenvalues.
    # We raise a warning in that case.

    # First, we coerce values close to 0 to equal 0.
    indices_close_to_zero = np.isclose(eigenvalues,
                                       np.zeros(eigenvalues.shape))
    eigenvalues[indices_close_to_zero] = 0

    if np.any(eigenvalues < 0):
        warn(
            "The result contains negative eigenvalues."
            " Please compare their magnitude with the magnitude of some"
            " of the largest positive eigenvalues. If the negative ones"
            " are smaller, it's probably safe to ignore them, but if they"
            " are large in magnitude, the results won't be useful. See the"
            " Notes section for more details. The smallest eigenvalue is"
            " {0} and the largest is {1}.".format(eigenvalues.min(),
                                                  eigenvalues.max()),
            RuntimeWarning)

    # eigvals might not be ordered, so we order them (at least one
    # is zero).
    indices_descending = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[indices_descending]

    # Sort eigenvectors in correspondence with eigenvalues' order
    eigenvectors = eigenvectors[:, indices_descending]

    # Note that at least one eigenvalue is zero because only n-1 axes are
    # needed to represent n points in an euclidean space.
    # If we return only the coordinates that make sense (i.e., that have a
    # corresponding positive eigenvalue), then Jackknifed Beta Diversity
    # won't work as it expects all the OrdinationResults to have the same
    # number of coordinates. In order to solve this issue, we return the
    # coordinates that have a negative eigenvalue as 0
    num_positive = (eigenvalues >= 0).sum()
    eigenvectors[:, num_positive:] = np.zeros(
        eigenvectors[:, num_positive:].shape)
    eigenvalues[num_positive:] = np.zeros(eigenvalues[num_positive:].shape)

    # Scale eigenvectors to have length = sqrt(eigenvalue). This
    # works because our eigenvectors are normalized before doing this
    # operation.
    eigenvectors = eigenvectors * np.sqrt(eigenvalues)

    # Now remove the dimensions with the least information
    # Only select k (num_dimensions_out) first eigenvectors
    # and their corresponding eigenvalues from the sorted array
    # of eigenvectors / eigenvalues
    if len(eigenvalues) > num_dimensions_out:
        eigenvectors = eigenvectors[:, :num_dimensions_out]
        eigenvalues = eigenvalues[:num_dimensions_out]
        axis_labels = axis_labels[:num_dimensions_out]

    # Calculate the array of proportion of variance explained
    # (relative to the retained eigenvalues only)
    proportion_explained = eigenvalues / eigenvalues.sum()

    return OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series(eigenvalues, index=axis_labels),
        samples=pd.DataFrame(eigenvectors,
                             index=distance_matrix.ids,
                             columns=axis_labels),
        proportion_explained=pd.Series(proportion_explained,
                                       index=axis_labels))
def test_assert_ordination_results_equal(self):
    """Exercise assert_ordination_results_equal on minimal results.

    Covers identity, a type mismatch, near-equal eigenvalues, and for every
    optional numeric attribute the three cases "present in only one",
    "present in both but different" and "present in both and almost equal".
    """
    minimal1 = OrdinationResults('foo', 'bar', pd.Series([1.0, 2.0]),
                                 pd.DataFrame([[1, 2, 3], [4, 5, 6]]))

    # a minimal set of results should be equal to itself
    assert_ordination_results_equal(minimal1, minimal1)

    # type mismatch
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, 'foo')

    # numeric values should be checked that they're almost equal
    almost_minimal1 = OrdinationResults(
        'foo', 'bar',
        pd.Series([1.0000001, 1.9999999]),
        pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
    assert_ordination_results_equal(minimal1, almost_minimal1)

    # test each of the optional numeric attributes
    for attr in ('features', 'samples', 'biplot_scores',
                 'sample_constraints'):
        # missing optional numeric attribute in one, present in the other
        setattr(almost_minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, but not almost equal
        setattr(minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
        setattr(almost_minimal1, attr, pd.DataFrame([[1, 2],
                                                     [3.00002, 4]]))
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, and almost equal
        setattr(minimal1, attr, pd.DataFrame([[1.0, 2.0], [3.0, 4.0]]))
        setattr(almost_minimal1, attr,
                pd.DataFrame([[1.0, 2.0], [3.00000002, 4]]))
        assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)

    # missing optional numeric attribute in one, present in the other
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3])
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    almost_minimal1.proportion_explained = None

    # optional numeric attributes present in both, but not almost equal
    minimal1.proportion_explained = pd.Series([1, 2, 3])
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00002])
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    # reset BOTH objects (previously almost_minimal1 was reset twice and
    # minimal1 not at all)
    minimal1.proportion_explained = None
    almost_minimal1.proportion_explained = None

    # optional numeric attributes present in both, and almost equal
    minimal1.proportion_explained = pd.Series([1, 2, 3])
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00000002])
    assert_ordination_results_equal(minimal1, almost_minimal1)
    minimal1.proportion_explained = None
    almost_minimal1.proportion_explained = None
def test_biplot_score(self):
    """Compare the RDA result (scaling 1) against reference output of vegan.

    The first six axes of the computed ordination are compared with the
    tables exported from vegan, after converting vegan's eigenvalues to the
    singular-value convention and rescaling the expected biplot scores.
    """
    rda_ = rda(y=self.Y, x=self.X, scale_Y=False, scaling=1)

    # Load data as computed with vegan 2.4-3:
    # library(vegan)
    # data(varechem)
    # data(varespec)
    # rda_ = rda(X=varespec, Y=varechem, scale=FALSE)
    # write.table(summary(rda_, scaling=1)$biplot,
    #             'vare_rda_biplot_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$sites,
    #             'vare_rda_sites_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$species,
    #             'vare_rda_species_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$constraints,
    #             'vare_rda_constraints_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[2, ],
    #             'vare_rda_propexpl_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[1, ],
    #             'vare_rda_eigvals_from_vegan.csv', sep=',')

    vegan_features = pd.read_csv(
        get_data_path('vare_rda_species_from_vegan.csv'))
    vegan_samples = pd.read_csv(
        get_data_path('vare_rda_sites_from_vegan.csv'))
    vegan_biplot = pd.read_csv(
        get_data_path('vare_rda_biplot_from_vegan.csv'))
    vegan_constraints = pd.read_csv(
        get_data_path('vare_rda_constraints_from_vegan.csv'))
    vegan_propexpl = pd.read_csv(
        get_data_path('vare_rda_propexpl_from_vegan.csv'))
    vegan_propexpl = pd.Series(
        vegan_propexpl.x.values, index=rda_.eigvals.index)
    vegan_eigvals = pd.read_csv(
        get_data_path('vare_rda_eigvals_from_vegan.csv'))
    vegan_eigvals = pd.Series(
        vegan_eigvals.x.values, index=rda_.eigvals.index)

    # scikit-bio returns singular values, whereas vegan returns eigenvalues
    vegan_eigvals = np.sqrt(vegan_eigvals * vegan_eigvals.shape[0])
    vegan_propexpl = vegan_eigvals / vegan_eigvals.sum()

    # transform the output of rda_ to match column selection of vegan
    res_samples = rda_.samples.iloc[:, 0:6]
    res_features = rda_.features.iloc[:, 0:6]

    rda_ = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=res_samples,
        features=res_features,
        sample_constraints=rda_.sample_constraints.iloc[:, 0:6],
        biplot_scores=rda_.biplot_scores.iloc[:, 0:6],
        proportion_explained=rda_.proportion_explained,
        eigvals=rda_.eigvals)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=vegan_constraints,
        biplot_scores=vegan_biplot,
        proportion_explained=vegan_propexpl,
        eigvals=vegan_eigvals)

    # This scaling constant is required to make skbio comparable to vegan.
    # Use explicit positional access: integer keys in Series.__getitem__
    # are deprecated as positional lookups (pandas >= 2.1).
    scaling = (rda_.eigvals.iloc[0] / rda_.eigvals.iloc[:6])
    exp.biplot_scores *= scaling
    assert_ordination_results_equal(
        rda_, exp, ignore_directionality=False, decimal=6)