def biplot(output_dir: str, biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           ignore_missing_samples: bool = False,
           invert: bool = False,
           number_of_features: int = 5) -> None:
    """Render a biplot that shows only the strongest feature vectors.

    When ``invert`` is true the sample and feature coordinates of ``biplot``
    trade places, and so do the two metadata arguments. Features are then
    ranked by their Euclidean distance from the origin and only the first
    ``number_of_features`` are kept before the drawing is delegated to
    ``generic_plot``.

    Note that ``biplot`` is modified in place.
    """
    if invert:
        # Exchange the roles of samples and features (and their metadata).
        former_samples = biplot.samples
        biplot.samples = biplot.features
        biplot.features = former_samples
        sample_metadata, feature_metadata = feature_metadata, sample_metadata

    # Rank every feature by the length of its vector and keep the top N.
    ranked = biplot.features.copy()
    zero_point = np.zeros_like(ranked.columns)
    ranked['importance'] = ranked.apply(euclidean, axis=1,
                                        args=(zero_point, ))
    ranked = ranked.sort_values('importance', ascending=False)
    ranked = ranked.drop(columns=['importance'])
    biplot.features = ranked.iloc[:number_of_features].copy()

    generic_plot(output_dir,
                 master=biplot,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 metadata=sample_metadata,
                 feature_metadata=feature_metadata,
                 plot_name='biplot')
def _create_ordination_results(self):
    # Build a small four-axis PCoA result for five samples, write it to a
    # fresh temporary ``.txt`` file inside ``self.out_dir`` and return the
    # path of that file.
    pcs = ['PC1', 'PC2', 'PC3', 'PC4']
    ids = [
        '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
        '1.SKM9.640192', '1.SKB7.640196'
    ]
    coordinates = np.asarray([
        [-2.584, 1.739, 3.828, -1.944],
        [-2.710, -1.859, -8.648, 1.180],
        [2.350, 9.625, -3.457, -3.208],
        [2.614, -1.114, 1.476, 2.908],
        [2.850, -1.925, 6.232, 1.381],
    ])

    eigvals_series = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868], index=pcs)
    samples_frame = pd.DataFrame(coordinates, index=ids, columns=pcs)
    proportions_series = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725], index=pcs)

    ordination = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals_series,
        samples=samples_frame,
        proportion_explained=proportions_series)

    # mkstemp hands back an open descriptor; release it before writing by
    # path.
    handle, path = mkstemp(suffix='.txt', dir=self.out_dir)
    close(handle)
    ordination.write(path)
    return path
def test_standalone_rpca(self):
    """Run DEICODE's standalone ``rpca`` command and compare its outputs.

    This is an integration-style check rather than a unit test: the
    algorithm behind the standalone script is covered in more detail in
    deicode/tests/test_optspace.py, etc.
    """
    biom_fp = get_data_path('test.biom')
    # The command writes next to the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['rpca'], cli_args)

    # Observed outputs
    observed_dist = pd.read_csv(get_data_path('distance-matrix.tsv'),
                                sep='\t', index_col=0)
    observed_ord = OrdinationResults.read(get_data_path('ordination.txt'))

    # Expected outputs
    expected_dist = pd.read_csv(
        get_data_path('expected-distance-matrix.tsv'),
        sep='\t', index_col=0)
    expected_ord = OrdinationResults.read(
        get_data_path('expected-ordination.txt'))

    # The distance matrices must agree value by value.
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # So must the ordination, for both features and samples.
    assert_deicode_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, the command must have exited with status 0 (success).
    self.assertEqual(result.exit_code, 0)
def _pca(ranks_df: pd.DataFrame, n_components: int = None
         ) -> (OrdinationResults, OrdinationResults):
    """Fit a PCA on ``ranks_df`` and package scores and loadings.

    Returns two ``OrdinationResults``: the first stores the transformed rows
    of ``ranks_df`` in ``samples``; the second stores, also in ``samples``,
    the sign-flipped components scaled by the square root of the explained
    variance (indexed by the columns of ``ranks_df``). Both share the same
    eigenvalues (explained variance) and carry the explained variance ratio
    as ``proportion_explained``.
    """
    model = PCA(n_components=n_components)
    model.fit(ranks_df)

    # Row scores in PCA space, labelled like the input rows.
    scores = pd.DataFrame(model.transform(ranks_df))
    scores.index = ranks_df.index

    # Loadings: one row per input column.
    scaled_components = (-1 * model.components_.T
                         * np.sqrt(model.explained_variance_))
    loadings = pd.DataFrame(scaled_components)
    loadings.index = ranks_df.columns

    eigenvalues = pd.Series(model.explained_variance_)

    def _as_ordination(coordinates):
        # Wrap a coordinate table in the shared PCA bookkeeping.
        return OrdinationResults(
            short_method_name="PCA",
            long_method_name="Principal Components Analysis",
            eigvals=eigenvalues,
            samples=coordinates,
            features=None,
            biplot_scores=None,
            proportion_explained=pd.Series(model.explained_variance_ratio_))

    return _as_ordination(scores), _as_ordination(loadings)
def procrustes_analysis(
        reference: OrdinationResults, other: OrdinationResults,
        dimensions: int = 5, permutations: int = 999
) -> (OrdinationResults, OrdinationResults, pd.DataFrame):
    """Fit ``other`` onto ``reference`` with a Procrustes transformation.

    Parameters
    ----------
    reference, other : OrdinationResults
        Ordinations over the same set of sample IDs whose ``samples``
        tables have identical shapes. Neither input is modified.
    dimensions : int
        Number of leading axes used in the fit.
    permutations : int
        Forwarded to ``_procrustes_monte_carlo``.

    Returns
    -------
    (OrdinationResults, OrdinationResults, pd.DataFrame)
        The transformed reference, the transformed other (both truncated to
        ``dimensions`` axes and indexed like ``reference``), and whatever
        ``_procrustes_monte_carlo`` returns for the fit.

    Raises
    ------
    ValueError
        If the sample tables differ in shape, if ``dimensions`` exceeds the
        number of available axes, or if the sample IDs differ.
    """
    if reference.samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    # Rejects requests for more axes than the ordination has; the message
    # text is kept as-is because callers may match on it.
    if reference.samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # fail if there are any elements in the symmetric difference
    diff = reference.samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # Make the matrices comparable by putting the rows of ``other`` in the
    # reference order. This is done on a local table so the caller's
    # ``other`` object is left untouched.
    other_samples = other.samples.reindex(index=reference.samples.index)

    ref_coords = reference.samples.values[:, :dimensions]
    other_coords = other_samples.values[:, :dimensions]
    mtx1, mtx2, m2 = procrustes(ref_coords, other_coords)

    axes = reference.samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())

    info = _procrustes_monte_carlo(ref_coords, other_coords, m2,
                                   permutations)

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions]
        .copy())
    return out1, out2, info
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """Runs RPCA with an rclr preprocessing step.

    Parameters
    ----------
    in_biom : str
        Path handed to ``load_table``.
    output_dir : str
        Existing directory that receives ``rclr_OTUtable.txt``,
        ``RPCA_Ordination.txt`` and ``RPCA_distance.txt``.
    min_sample_depth : int
        Samples are kept only when their total count is strictly greater
        than this value.
    rank : int
        Rank given to ``OptSpace``; also the number of ``PC`` axes written.
    """
    # import table
    table = load_table(in_biom)

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_depth

    table = table.filter(sample_filter, axis='sample')
    # After the transpose, rows are used as samples and columns as features
    # by everything below.
    table = table.to_dataframe().T.drop_duplicates()

    # rclr for saving the transformed OTU table (RSC edited)
    tablefit = rclr().fit_transform(table.copy())
    U, s, V = OptSpace().fit_transform(tablefit)
    tablefit = np.dot(np.dot(U, s), V.T)
    tablefit = pd.DataFrame(tablefit.T,
                            index=table.columns,
                            columns=table.index)
    # ``to_csv`` opens the destination itself; the previous extra
    # ``open(..., 'w')`` around it only created a second, unused handle on
    # the same file.
    tablefit.to_csv(os.path.join(output_dir, 'rclr_OTUtable.txt'),
                    sep='\t', index_label='OTU_ID')

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}
    axis_names = list(rename_cols.values())

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=axis_names)
    eigvals = pd.Series(opt.eigenvalues, index=axis_names)

    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))

    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
def test_scaling1(self):
    # Expected values for CA with scaling 1 (p. 458).
    expected_eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                                 self.pc_ids)

    v_matrix = np.array([[1.31871, -0.34374],   # V
                         [-0.37215, 1.48150],
                         [-0.99972, -0.92612]])
    expected_features = pd.DataFrame(v_matrix, self.feature_ids, self.pc_ids)

    f_matrix = np.array([[-0.26322, -0.17862],  # F
                         [-0.06835, 0.27211],
                         [0.51685, -0.09517]])
    expected_samples = pd.DataFrame(f_matrix, self.sample_ids, self.pc_ids)

    expected = OrdinationResults('CA', 'Correspondance Analysis',
                                 eigvals=expected_eigvals,
                                 features=expected_features,
                                 samples=expected_samples)

    observed = ca(self.contingency, 1)

    assert_ordination_results_equal(expected, observed, decimal=5,
                                    ignore_directionality=True)
def test_book_example_dataset(self):
    # Adapted from PyCogent's `test_principal_coordinate_analysis`:
    #   "I took the example in the book (see intro info), and did
    #   the principal coordinates analysis, plotted the data and it
    #   looked right".
    n_axes = 14
    pcs = ['PC%d' % i for i in range(1, n_axes + 1)]
    ids = [str(i) for i in range(n_axes)]

    # Six non-trivial axes followed by eight all-zero ones.
    eigvals = [
        0.73599103, 0.26260032, 0.14926222,
        0.06990457, 0.02956972, 0.01931184
    ] + [0.] * 8
    proportion_explained = [
        0.58105792, 0.20732046, 0.1178411,
        0.05518899, 0.02334502, 0.01524651
    ] + [0.] * 8

    site_scores = np.loadtxt(get_data_path('exp_PCoAzeros_site'))
    expected = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series(eigvals, index=pcs),
        samples=pd.DataFrame(site_scores, index=ids, columns=pcs),
        proportion_explained=pd.Series(proportion_explained, index=pcs))

    # pcoa is expected to emit a RuntimeWarning for this matrix.
    observed = npt.assert_warns(RuntimeWarning, pcoa, self.dm)

    # Columns may come back with flipped signs, hence the absolute value.
    observed.samples = np.abs(observed.samples)
    assert_ordination_results_equal(observed, expected,
                                    ignore_directionality=True)
def test_scaling2(self):
    # Expected values for CA with scaling 2 (p. 460 L&L 1998).
    expected_eigvals = pd.Series(np.array([0.09613302, 0.04094181]),
                                 self.pc_ids)

    f_hat = np.array([[0.40887, -0.06955],    # F_hat
                      [-0.11539, 0.29977],
                      [-0.30997, -0.18739]])
    expected_features = pd.DataFrame(f_hat, self.feature_ids, self.pc_ids)

    v_hat = np.array([[-0.84896, -0.88276],   # V_hat
                      [-0.22046, 1.34482],
                      [1.66697, -0.47032]])
    expected_samples = pd.DataFrame(v_hat, self.sample_ids, self.pc_ids)

    expected = OrdinationResults('CA', 'Correspondance Analysis',
                                 eigvals=expected_eigvals,
                                 features=expected_features,
                                 samples=expected_samples)

    observed = ca(self.contingency, 2)

    assert_ordination_results_equal(expected, observed, decimal=5,
                                    ignore_directionality=True)
def test_simple(self):
    # PCoA of the nine-sample matrix in 'PCoA_sample_data_3' must match the
    # stored eigen results, up to the sign of each axis.
    pcs = ['PC%d' % i for i in range(1, 10)]
    ids = [
        'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
        'PC.593', 'PC.355', 'PC.607', 'PC.634'
    ]

    eigvals_series = pd.Series([
        0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
        0.16054235, 0.15017696, 0.12245775, 0.0
    ], index=pcs)
    site_scores = np.loadtxt(get_data_path('exp_PCoAEigenResults_site'))
    samples_frame = pd.DataFrame(site_scores, index=ids, columns=pcs)
    proportions_series = pd.Series([
        0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
        0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0
    ], index=pcs)

    expected = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals_series,
        samples=samples_frame,
        proportion_explained=proportions_series)

    distances = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    observed = pcoa(distances)

    assert_ordination_results_equal(observed, expected,
                                    ignore_directionality=True)
def test_str(self):
    # Summary of the fully populated CA fixture.
    expected_full = ("Ordination results:\n"
                     "\tMethod: Correspondance Analysis (CA)\n"
                     "\tEigvals: 2\n"
                     "\tProportion explained: N/A\n"
                     "\tFeatures: 3x2\n"
                     "\tSamples: 3x2\n"
                     "\tBiplot Scores: N/A\n"
                     "\tSample constraints: N/A\n"
                     "\tFeature IDs: 'Species1', 'Species2', 'Species3'\n"
                     "\tSample IDs: 'Site1', 'Site2', 'Site3'")
    self.assertEqual(str(self.ordination_results), expected_full)

    # all optional attributes missing
    expected_minimal = ("Ordination results:\n"
                        "\tMethod: Principal Coordinate Analysis (PCoA)\n"
                        "\tEigvals: 1\n"
                        "\tProportion explained: N/A\n"
                        "\tFeatures: N/A\n"
                        "\tSamples: 2x1\n"
                        "\tBiplot Scores: N/A\n"
                        "\tSample constraints: N/A\n"
                        "\tFeature IDs: N/A\n"
                        "\tSample IDs: 0, 1")
    minimal = OrdinationResults('PCoA', 'Principal Coordinate Analysis',
                                pd.Series(np.array([4.2])),
                                pd.DataFrame(np.array([[1], [2]])))
    observed_minimal = str(minimal)
    # Compared line by line.
    self.assertEqual(observed_minimal.split('\n'),
                     expected_minimal.split('\n'))
def setUp(self):
    # Fixtures shared by the tests: an alpha-diversity vector, a 2x3 BIOM
    # table, a three-axis PCoA with seeded random coordinates and a small
    # metadata frame, all keyed by samples 'a', 'b' and 'c'.
    self.alpha = pd.Series([1, 2, 3], index=list('abc'))

    counts = np.asarray([[0, 0, 1], [1, 3, 42]])
    self.biom = biom.Table(counts, ['O1', 'O2'], ['a', 'b', 'c'])

    pcs = ['PC%d' % i for i in range(1, 4)]
    ids = ['a', 'b', 'c']

    # Seed first so the coordinates are reproducible.
    np.random.seed(11)
    coordinates = np.random.randn(3, 3)

    self.ordination = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=pd.Series([0.51236726, 0.30071909, 0.26791207], index=pcs),
        samples=pd.DataFrame(coordinates, index=ids, columns=pcs),
        proportion_explained=pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638], index=pcs))

    self.metadata = pd.DataFrame(
        data=[[':0', ':)', ':/'],
              [':D', 'xD', '<3'],
              [';L', ']:->', ':S']],
        index=list('abc'),
        columns=['foo', 'bar', 'baz'])
def setUp(self):
    # --- trees -----------------------------------------------------------
    self.tree = self.mock_tree_from_nwk()
    self.bp_tree = from_skbio_treenode(self.tree)

    # --- feature table (transposed before being wrapped) ------------------
    sample_names = ['Sample1', 'Sample2', 'Sample3', 'Sample4']
    counts = np.array([[1, 2, 0, 4],
                       [8, 7, 0, 5],
                       [1, 0, 0, 0],
                       [0, 0, 0, 0]])
    self.table = biom.Table(counts.T, list('abed'),
                            ['Sample1', 'Sample2', 'Sample3', 'Sample4'])

    # --- sample metadata --------------------------------------------------
    self.sample_metadata = pd.DataFrame(
        {
            "Metadata1": [0, 0, 0, 1],
            "Metadata2": [0, 0, 0, 0],
            "Metadata3": [1, 2, 3, 4],
            "Metadata4": ["abc", "def", "ghi", "jkl"]
        },
        index=list(self.table.ids()))

    # --- feature metadata -------------------------------------------------
    # (These are some Greengenes taxonomy annotations I took from the
    # moving pictures taxonomy.qza file. I made up the confidences.)
    taxonomy = [
        ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
         "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
         "s__"),
        ("k__Bacteria; p__Proteobacteria; "
         "c__Gammaproteobacteria; o__Pasteurellales; "
         "f__Pasteurellaceae; g__; s__"),
        ("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
         "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
         "s__uniformis"),
    ]
    self.feature_metadata = pd.DataFrame(
        {"Taxonomy": taxonomy, "Confidence": [0.95, 0.8, 0]},
        index=["e", "h", "a"])

    self.split_tax_fm, self.taxcols = split_taxonomy(self.feature_metadata)
    self.tip_md = self.split_tax_fm.loc[["a", "e"]]
    self.int_md = self.split_tax_fm.loc[["h"]]

    # This is designed to match the shearing that's done in the core test
    # for --p-shear-to-table
    self.shorn_tree = parse_newick(
        "(((a:1)EmpressNode0:1,b:2)g:1,(d:3)h:2)EmpressNode1:1;")

    self.exp_split_fm_cols = [
        "Level 1", "Level 2", "Level 3", "Level 4",
        "Level 5", "Level 6", "Level 7", "Confidence"
    ]

    # --- ordination -------------------------------------------------------
    pcs = ['PC1', 'PC2', 'PC3']
    eigvals = pd.Series([0.50, 0.25, 0.25], index=pcs)
    proportion_explained = pd.Series([15.5, 12.2, 8.8], index=pcs)
    samples_df = pd.DataFrame(
        [[0.1, 0.2, 0.3],
         [0.2, 0.3, 0.4],
         [0.3, 0.4, 0.5],
         [0.4, 0.5, 0.6]],
        index=sample_names,
        columns=pcs)
    self.ordination = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals,
        samples_df,
        proportion_explained=proportion_explained)
def test_scaling2(self):
    # RDA with scaling 2 must reproduce the reference tables stored in the
    # 'example2_*' data files, up to the sign of each axis.
    scores = rda(self.Y, self.X, scaling=2)

    # Load data as computed with vegan 2.0-8
    vegan_features = pd.DataFrame(
        np.loadtxt(get_data_path('example2_species_scaling2_from_vegan')),
        index=self.feature_ids,
        columns=self.pc_ids)
    vegan_samples = pd.DataFrame(
        np.loadtxt(get_data_path('example2_site_scaling2_from_vegan')),
        index=self.sample_ids,
        columns=self.pc_ids)

    # Each of the following tables is built exactly once. Earlier, the
    # biplot scores were computed twice and an unlabelled copy of the
    # sample constraints was created only to be overwritten.
    sample_constraints = pd.DataFrame(
        np.loadtxt(get_data_path('example2_sample_constraints_scaling2')),
        index=self.sample_ids,
        columns=self.pc_ids)

    # The biplot file may hold fewer axes than ``self.pc_ids`` lists, so
    # the labels are cropped to the width of the loaded matrix.
    mat = np.loadtxt(get_data_path('example2_biplot_scaling2'))
    cropped_pc_ids = self.pc_ids[:mat.shape[1]]
    biplot_scores = pd.DataFrame(mat,
                                 index=self.env_ids,
                                 columns=cropped_pc_ids)

    proportion_explained = pd.Series([0.44275783, 0.25614586,
                                      0.15280354, 0.10497021,
                                      0.02873375, 0.00987052,
                                      0.00471828],
                                     index=self.pc_ids)
    eigvals = pd.Series([25.897954, 14.982578, 8.937841, 6.139956,
                         1.680705, 0.577350, 0.275984],
                        index=self.pc_ids)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=sample_constraints,
        biplot_scores=biplot_scores,
        proportion_explained=proportion_explained,
        eigvals=eigvals)

    assert_ordination_results_equal(scores, exp,
                                    ignore_directionality=True,
                                    decimal=6)
def setUp(self):
    # In-memory CA results used for (de)serialization tests.
    ca_axes = ['CA1', 'CA2']
    ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138], ca_axes)

    feature_scores = np.array([[0.408869425742, 0.0695518116298],
                               [-0.1153860437, -0.299767683538],
                               [-0.309967102571, 0.187391917117]])
    sample_scores = np.array([[-0.848956053187, 0.882764759014],
                              [-0.220458650578, -1.34482000302],
                              [1.66697179591, 0.470324389808]])

    ca_samples = pd.DataFrame(sample_scores,
                              index=['Site1', 'Site2', 'Site3'],
                              columns=['CA1', 'CA2'])
    ca_features = pd.DataFrame(feature_scores,
                               index=['Species1', 'Species2', 'Species3'],
                               columns=['CA1', 'CA2'])

    self.ordination_results = OrdinationResults(
        'CA', 'Correspondance Analysis',
        eigvals=ca_eigvals,
        samples=ca_samples,
        features=ca_features)

    # Metadata for exercising the plot method:
    #   * 'categorical' mixes numbers and strings,
    #   * 'numeric' mixes ints, floats and float-like strings,
    #   * 'nancolumn' is numeric with a missing value (np.nan).
    self.df = pd.DataFrame([['foo', '42', 10],
                            [22, 0, 8],
                            [22, -4.2, np.nan],
                            ['foo', '42.19', 11]],
                           index=['A', 'B', 'C', 'D'],
                           columns=['categorical', 'numeric', 'nancolumn'])

    # Minimal ordination paired with the frame above, to keep the plotting
    # tests simple.
    min_eigvals = np.array([0.50, 0.25, 0.25])
    min_scores = np.array([[0.1, 0.2, 0.3],
                           [0.2, 0.3, 0.4],
                           [0.3, 0.4, 0.5],
                           [0.4, 0.5, 0.6]])
    min_samples = pd.DataFrame(min_scores,
                               ['A', 'B', 'C', 'D'],
                               ['PC1', 'PC2', 'PC3'])
    self.min_ord_results = OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', min_eigvals, min_samples)
def test_from_seralized_results(self):
    # Ordination results lose some information when they are written to
    # disk; pcoa_biplot should nevertheless give matching output for the
    # deserialized and the in-memory versions.
    from_disk = OrdinationResults.read(get_data_path('PCoA_skbio'))

    serialized = pcoa_biplot(from_disk, self.descriptors)
    in_memory = pcoa_biplot(self.ordination, self.descriptors)

    tolerances = dict(ignore_directionality=True,
                      ignore_axis_labels=True,
                      ignore_method_names=True)
    assert_ordination_results_equal(serialized, in_memory, **tolerances)
def apca(df):
    """Performs Aitchison PCA on a feature table.

    Parameters
    ----------
    df: pd.DataFrame
        A numeric DataFrame whose rows are "features" and whose columns are
        "samples."

    Returns
    -------
    OrdinationResults
        Named "apca" / "Aitchison PCA", holding:

        - ``features``: feature loadings (U), indexed by ``df.index``
        - ``samples``: sample loadings (V), indexed by ``df.columns``
        - ``eigvals``: the singular values
        - ``proportion_explained``: squared singular values divided by
          their sum

        All are labelled "Axis 1", "Axis 2", ...
    """
    # Two-component truncated SVD of the clr-transformed table.
    left, singular, right_t = svds(clr(df), k=2)
    right = right_t.T

    # Reverse the component order (see SVDs docs).
    left = left[:, ::-1]
    right = right[:, ::-1]
    singular = singular[::-1]

    # Axis names follow the Qurro interface ("Axis 1", ...).
    n_axes = min(right.shape)
    axis_names = ["Axis {}".format(k + 1) for k in range(n_axes)]

    # Wrap the feature (U) and sample (V) loadings in DataFrames.
    feature_loadings = pd.DataFrame(left[:, :n_axes], df.index, axis_names)
    sample_loadings = pd.DataFrame(right[:, :n_axes], df.columns, axis_names)

    # For clarity, name the top-left cell of both loading tables.
    feature_loadings.index.name = "FeatureID"
    sample_loadings.index.name = "SampleID"

    # Proportion of variance explained.
    squared = singular**2
    proportions = pd.Series((squared / np.sum(squared)).T, index=axis_names)

    # OrdinationResults expects the eigenvalues as a labelled Series.
    eigvals = pd.Series(singular.T, index=axis_names)

    return OrdinationResults("apca", "Aitchison PCA", eigvals,
                             samples=sample_loadings,
                             features=feature_loadings,
                             proportion_explained=proportions)
def procrustes_analysis(reference: OrdinationResults, other: OrdinationResults,
                        dimensions: int = 5
                        ) -> (OrdinationResults, OrdinationResults):
    """Fit ``other`` onto ``reference`` with a Procrustes transformation.

    Parameters
    ----------
    reference, other : OrdinationResults
        Ordinations over the same set of sample IDs whose ``samples``
        tables have identical shapes. Neither input is modified.
    dimensions : int
        Number of leading axes used in the fit.

    Returns
    -------
    (OrdinationResults, OrdinationResults)
        The transformed reference and the transformed other, both truncated
        to ``dimensions`` axes and indexed like ``reference``.

    Raises
    ------
    ValueError
        If the sample tables differ in shape, if ``dimensions`` exceeds the
        number of available axes, or if the sample IDs differ.
    """
    if reference.samples.shape != other.samples.shape:
        raise ValueError('The matrices cannot be fitted unless they have the '
                         'same dimensions')

    # Rejects requests for more axes than the ordination has; the message
    # text is kept as-is because callers may match on it.
    if reference.samples.shape[1] < dimensions:
        raise ValueError('Cannot fit fewer dimensions than available')

    # Fail if there are any elements in the symmetric difference. The
    # explicit method is used because the ``^`` operator on an Index is no
    # longer a set operation in current pandas.
    diff = reference.samples.index.symmetric_difference(other.samples.index)
    if not diff.empty:
        raise ValueError('The ordinations represent two different sets of '
                         'samples')

    # Make the matrices comparable by putting the rows of ``other`` in the
    # reference order. This is done on a local table so the caller's
    # ``other`` object is left untouched.
    other_samples = other.samples.reindex(index=reference.samples.index)

    mtx1, mtx2, _ = procrustes(reference.samples.values[:, :dimensions],
                               other_samples.values[:, :dimensions])

    axes = reference.samples.columns[:dimensions]
    samples1 = pd.DataFrame(data=mtx1,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())
    samples2 = pd.DataFrame(data=mtx2,
                            index=reference.samples.index.copy(),
                            columns=axes.copy())

    out1 = OrdinationResults(
        short_method_name=reference.short_method_name,
        long_method_name=reference.long_method_name,
        eigvals=reference.eigvals[:dimensions].copy(),
        samples=samples1,
        features=reference.features,
        biplot_scores=reference.biplot_scores,
        sample_constraints=reference.sample_constraints,
        proportion_explained=reference.proportion_explained[:dimensions]
        .copy())
    out2 = OrdinationResults(
        short_method_name=other.short_method_name,
        long_method_name=other.long_method_name,
        eigvals=other.eigvals[:dimensions].copy(),
        samples=samples2,
        features=other.features,
        biplot_scores=other.biplot_scores,
        sample_constraints=other.sample_constraints,
        proportion_explained=other.proportion_explained[:dimensions]
        .copy())
    return out1, out2
def test_standalone_rpca_rank_est(self):
    """Checks that the standalone rank estimate is used instead of an
       explicit rank setting.
    """
    biom_fp = get_data_path('test.biom')
    # The command writes next to the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['auto-rpca'], cli_args)

    # Observed outputs
    observed_dist = pd.read_csv(get_data_path('distance-matrix.tsv'),
                                sep='\t', index_col=0)
    observed_ord = OrdinationResults.read(get_data_path('ordination.txt'))

    # Expected outputs
    expected_dist_name = 'expected-est-distance-matrix.tsv'
    expected_dist = pd.read_csv(get_data_path(expected_dist_name),
                                sep='\t', index_col=0)
    expected_ord = OrdinationResults.read(
        get_data_path('expected-est-ordination.txt'))

    # The distance matrices must agree value by value.
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # So must the ordination, for both features and samples.
    assert_deicode_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, the command must have exited with status 0. On failure, a
    # generic exception carrying the traceback of the command's own
    # exception is raised.
    try:
        self.assertEqual(0, result.exit_code)
    except AssertionError:
        command_exc = result.exception
        failure = Exception('Command failed with non-zero exit code')
        raise failure.with_traceback(command_exc.__traceback__)
def _generate_ordination_results_summary(files, metadata, out_dir):
    """Write a standalone Emperor page for an ordination.

    Reads the ordination from the first ``'plain_text'`` entry of ``files``,
    turns ``metadata`` (a dict keyed by row) into a DataFrame, and writes
    ``index.html`` plus an ``emperor_support_files`` directory into
    ``out_dir``.

    Returns the path of the HTML file and the path of the support-files
    directory.
    """
    # Magic number [0] -> there is only one plain text file and it is the
    # ordination results
    ordination_fp = files['plain_text'][0]
    ordination = OrdinationResults.read(ordination_fp)
    metadata_frame = pd.DataFrame.from_dict(metadata, orient='index')
    emperor = Emperor(ordination, metadata_frame,
                      remote="emperor_support_files")

    html_summary_fp = join(out_dir, 'index.html')
    support_dir = join(out_dir, 'emperor_support_files')
    makedirs(support_dir)

    with open(html_summary_fp, 'w') as html_file:
        html_file.write(emperor.make_emperor(standalone=True))
        emperor.copy_support_files(support_dir)

    return html_summary_fp, support_dir
def test_standalone_rpca_rank_est(self):
    """Checks that the standalone RPCA rank estimate is used instead of an
       explicit rank setting.
    """
    subfolder = 'rpca_data'
    biom_fp = get_data_path('test.biom', subfolder=subfolder)
    # The command writes next to the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['auto-rpca'], cli_args)

    # Observed outputs
    observed_dist = pd.read_csv(
        get_data_path('distance-matrix.tsv', subfolder=subfolder),
        sep='\t', index_col=0)
    observed_ord = OrdinationResults.read(
        get_data_path('ordination.txt', subfolder=subfolder))

    # Expected outputs
    expected_dist_name = 'expected-est-distance-matrix.tsv'
    expected_dist = pd.read_csv(
        get_data_path(expected_dist_name, subfolder=subfolder),
        sep='\t', index_col=0)
    expected_ord = OrdinationResults.read(
        get_data_path('expected-est-ordination.txt', subfolder=subfolder))

    # The distance matrices must agree value by value.
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # So must the ordination, for both features and samples.
    assert_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, gemelli must have exited with status 0 (success).
    CliTestCase().assertExitCode(0, result)
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axes: str = None,
         ignore_missing_samples: bool = False,
         ignore_pcoa_features: bool = False) -> None:
    """Render a plain (arrow-free) ordination plot via ``generic_plot``.

    An ordination that carries feature coordinates is rejected with a
    ``ValueError`` unless ``ignore_pcoa_features`` is set, in which case
    the features are discarded from ``pcoa`` (in place) before plotting.
    """
    if ignore_pcoa_features:
        pcoa.features = None
    elif pcoa.features is not None:
        raise ValueError("Arrows cannot be visualized with the 'plot' method, "
                         "use 'biplot' instead, or enable "
                         "`ignore_pcoa_features`.")

    generic_plot(output_dir,
                 master=pcoa,
                 metadata=metadata,
                 other_pcoa=None,
                 ignore_missing_samples=ignore_missing_samples,
                 custom_axes=custom_axes,
                 plot_name='plot')
def test_standalone_rpca_n_components(self):
    """Tests the standalone script when n_components is 2.

    Besides a successful exit, the written ordination is required to
    report three axes.
    """
    in_ = get_data_path('test.biom')
    # The command writes next to the input table.
    out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
    runner = CliRunner()
    # run the same command but with rank==2
    result = runner.invoke(standalone_rpca, [
        '--in-biom', in_, '--output-dir', out_,
        '--n_components', 2, '--max_iterations', 5
    ])
    self.assertEqual(result.exit_code, 0)
    ord_res = OrdinationResults.read(get_data_path('ordination.txt'))
    # Check it contains three axes. This used to be an
    # ``if ...: pass`` that could never fail.
    self.assertEqual(len(ord_res.proportion_explained), 3)
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    """Build a biplot ordination from a table of regression coefficients.

    The coefficients are passed through ``clr_inv``, ``centralize`` and
    ``clr`` and then decomposed with an SVD.

    Parameters
    ----------
    coefficients : pd.DataFrame
        Coefficient table. Its index labels the rows of ``samples`` and its
        columns label the rows of ``features`` in the result.

    Returns
    -------
    skbio.OrdinationResults
        ``samples`` holds the left singular vectors scaled by the singular
        values, ``features`` the right singular vectors, ``eigvals`` the
        singular values themselves and ``proportion_explained`` each
        singular value divided by their sum.
    """
    coefs = clr(centralize(clr_inv(coefficients)))

    # Only the leading min(rows, columns) singular vectors are used, so the
    # reduced SVD is requested. The full one would build a square
    # columns-by-columns matrix just to slice it down again.
    u, s, vh = np.linalg.svd(coefs, full_matrices=False)

    # NOTE(review): axis labels start at 'PC0' here, not 'PC1'.
    pc_ids = ['PC%d' % i for i in range(len(s))]
    samples = pd.DataFrame(u * s,  # same as u @ diag(s)
                           columns=pc_ids, index=coefficients.index)
    features = pd.DataFrame(vh.T,
                            columns=pc_ids, index=coefficients.columns)

    short_method_name = 'regression_biplot'
    long_method_name = 'Multinomial regression biplot'
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = eigvals / eigvals.sum()

    return OrdinationResults(short_method_name, long_method_name, eigvals,
                             samples=samples, features=features,
                             proportion_explained=proportion_explained)
def biplot(output_dir: str, biplot: skbio.OrdinationResults,
           sample_metadata: qiime2.Metadata,
           feature_metadata: qiime2.Metadata = None,
           number_of_features: int = 5) -> None:
    """Render a biplot that shows only the strongest feature vectors.

    Features are ranked by their Euclidean distance from the origin and
    only the first ``number_of_features`` are kept (``biplot`` is modified
    in place) before the drawing is delegated to ``_generic_plot``.
    """
    # Score each feature by the length of its vector.
    candidates = biplot.features.copy()
    zero_point = np.zeros_like(candidates.columns)
    candidates['importance'] = candidates.apply(euclidean, axis=1,
                                                args=(zero_point,))

    # Longest vectors first; the helper column is dropped again afterwards.
    ordered = candidates.sort_values('importance', ascending=False)
    ordered = ordered.drop(columns=['importance'])
    biplot.features = ordered.iloc[:number_of_features].copy()

    _generic_plot(output_dir,
                  master=biplot,
                  other_pcoa=None,
                  metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  plot_name='biplot')
def test_scaling2(self):
    # CCA with scaling 2 must reproduce the reference tables stored in the
    # 'example3_*' data files.
    observed = cca(self.Y, self.X, scaling=2)

    def load(name):
        # Read one reference table from the test data directory.
        return np.loadtxt(get_data_path(name))

    # Load data as computed with vegan 2.0-8
    vegan_features = pd.DataFrame(
        load('example3_species_scaling2_from_vegan'),
        index=self.feature_ids, columns=self.pc_ids)
    vegan_samples = pd.DataFrame(
        load('example3_site_scaling2_from_vegan'),
        index=self.sample_ids, columns=self.pc_ids)
    sample_constraints = pd.DataFrame(
        load('example3_sample_constraints_scaling2'),
        index=self.sample_ids, columns=self.pc_ids)

    # Axis labels are cropped to the width of the loaded biplot matrix.
    biplot_matrix = load('example3_biplot_scaling2')
    biplot_axes = self.pc_ids[:biplot_matrix.shape[1]]
    biplot_scores = pd.DataFrame(biplot_matrix,
                                 index=self.env_ids, columns=biplot_axes)

    proportion_explained = pd.Series(
        [0.466911, 0.238327, 0.100548, 0.104937, 0.044805,
         0.029747, 0.012631, 0.001562, 0.000532],
        index=self.pc_ids)
    eigvals = pd.Series(
        [0.366136, 0.186888, 0.078847, 0.082288, 0.035135,
         0.023327, 0.009905, 0.001225, 0.000417],
        index=self.pc_ids)

    expected = OrdinationResults(
        'CCA', 'Canonical Correspondence Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=sample_constraints,
        biplot_scores=biplot_scores,
        proportion_explained=proportion_explained,
        eigvals=eigvals)

    assert_ordination_results_equal(observed, expected, decimal=6)
def _simulation_data(data, ids):
    """Build ordination results and a distance table from simulated points.

    Parameters
    ----------
    data : sequence
        One entry per subject. ``entry[0]``, ``entry[1]`` and ``entry[2]``
        are indexed by timepoint and give the three coordinates of each
        point. The ``Site`` header is sized from the first entry, so every
        entry is assumed to have the same number of timepoints.
    ids : sequence
        Subject identifiers; ``ids[j]`` names ``data[j]``.

    Returns
    -------
    tuple
        ``(ordination_results, distance_matrix)``. The first item is read
        back by ``OrdinationResults.read`` from a generated ordination file
        whose rows are labelled ``<id>_t<timepoint>``. The second is a list
        of rows of strings: a header row starting with ``""`` followed by
        one row per point with its label and the Euclidean distance to
        every point. Points in the distance table are labelled
        ``<id>.<timepoint>``.
    """
    import tempfile

    # Collect the ordination rows and the coordinates of every point.
    site_lines = []
    points = {}
    for j, row in enumerate(data):
        identifier = str(ids[j])
        for i in range(len(row[0])):
            coords = [row[0][i], row[1][i], row[2][i]]
            site_lines.append(
                "\t".join([identifier + "_t" + str(i)]
                          + [str(value) for value in coords]) + "\n")
            # NOTE(review): the ordination rows use "<id>_t<i>" while the
            # distance table uses "<id>.<i>" -- confirm this is intended.
            points[identifier + "." + str(i)] = coords

    lines = [
        'Eigvals\t0' + '\n\n',
        'Proportion explained\t0' + '\n\n',
        'Species\t0\t0\n\n',
        'Site\t' + str(len(data) * len(data[0][0])) + '\t3\n',
    ]
    lines.extend(site_lines)
    lines.extend([
        "\n",
        "Biplot\t0\t0\n\n",
        "Site constraints\t0\t0\n",
    ])

    # The file lives in a private temporary directory, so nothing in the
    # working directory is overwritten or deleted, and it is cleaned up
    # even when reading fails. It is closed before being read back.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ordination_fp = os.path.join(tmp_dir, "ordination.txt")
        with open(ordination_fp, "w", encoding='utf8') as handle:
            handle.writelines(lines)
        ordination_results = OrdinationResults.read(ordination_fp)

    # Distance matrix (euclidean)
    labels = list(points)
    distance_matrix = [[""] + labels]
    for label in labels:
        distance_matrix.append(
            [label] + [str(distance.euclidean(points[label], points[peer]))
                       for peer in labels])

    # TODO: add functionality to return a mapping file. The draft that
    # used to be assembled here (and then discarded) had the header
    # ["#SampleID", "Subject", "Treatment", "Timepoint"], the type row
    # ["#q2:types", "categorical", "categorical", "numeric"], and one row
    # per subject and timepoint: "<id>_t<i>", the id, the id with its
    # digits removed and its last remaining character dropped, and i.
    return ordination_results, distance_matrix
def test_standalone_rpca_n_components(self):
    """Tests the standalone RPCA script when n_components is 2.

    Besides a successful exit, the written ordination is required to
    report three axes.
    """
    in_ = get_data_path('test.biom', subfolder='rpca_data')
    # The command writes next to the input table.
    out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
    runner = CliRunner()
    # run the same command but with rank==2
    result = runner.invoke(sdc.commands['rpca'], [
        '--in-biom', in_, '--output-dir', out_,
        '--n_components', 2, '--max_iterations', 5
    ])
    CliTestCase().assertExitCode(0, result)
    ord_res = OrdinationResults.read(
        get_data_path('ordination.txt', subfolder='rpca_data'))
    # Check it contains three axes. This used to be an
    # ``if ...: pass`` that could never fail.
    self.assertEqual(len(ord_res.proportion_explained), 3)
def test_extensive(self):
    # PCoA of the six-sample matrix in 'PCoA_sample_data_2' must match the
    # values below (up to the sign of each axis), whether the input is a
    # plain array or a DistanceMatrix.
    pcs = ['PC%d' % i for i in range(1, 7)]
    ids = [str(i) for i in range(6)]

    eigvals_series = pd.Series(
        [0.3984635, 0.36405689, 0.28804535, 0.27479983, 0.19165361, 0.0],
        index=pcs)
    proportions_series = pd.Series(
        [0.2626621381, 0.2399817314, 0.1898758748,
         0.1811445992, 0.1263356565, 0.0],
        index=pcs)
    coordinates = [
        [-0.028597, 0.22903853, 0.07055272,
         0.26163576, 0.28398669, 0.0],
        [0.37494056, 0.22334055, -0.20892914,
         0.05057395, -0.18710366, 0.0],
        [-0.33517593, -0.23855979, -0.3099887,
         0.11521787, -0.05021553, 0.0],
        [0.25412394, -0.4123464, 0.23343642,
         0.06403168, -0.00482608, 0.0],
        [-0.28256844, 0.18606911, 0.28875631,
         -0.06455635, -0.21141632, 0.0],
        [0.01727687, 0.012458, -0.07382761,
         -0.42690292, 0.1695749, 0.0],
    ]

    expected = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals_series,
        samples=pd.DataFrame(coordinates, index=ids, columns=pcs),
        proportion_explained=proportions_series)

    raw = np.loadtxt(get_data_path('PCoA_sample_data_2'))

    # test passing a numpy.ndarray and a DistanceMatrix to pcoa
    # gives same results
    inputs = (raw, DistanceMatrix(raw))
    for candidate in inputs:
        observed = pcoa(candidate)
        assert_ordination_results_equal(observed, expected,
                                        ignore_directionality=True)
def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
         sample_metadata: qiime2.Metadata, pcoa: OrdinationResults = None,
         feature_metadata: qiime2.Metadata = None,
         ignore_missing_samples: bool = False,
         filter_missing_features: bool = False,
         number_of_features: int = 5,
         filter_unobserved_features_from_phylogeny: bool = True) -> None:
    """Write an Empress visualization of a tree into ``output_dir``.

    The tree is parsed from the first line of the file at ``str(tree)`` and
    handed to ``Empress`` together with the feature table, the metadata
    (converted to data frames) and the optional ordination. The rendered
    page is written to ``empress.html``, the support files are copied next
    to it and the ``index.html`` template is rendered into ``output_dir``.

    When ``pcoa`` carries feature coordinates, ``pcoa.features`` is replaced
    in place by the ``number_of_features`` rows with the largest Euclidean
    distance from the origin.
    """
    if pcoa is not None and pcoa.features is not None:
        # select the top N most important features based on the vector's
        # magnitude (copied from q2-emperor)
        feats = pcoa.features.copy()
        # in cases where the axes are all zero there might be all-NA
        # columns; without this every importance would be NaN and the
        # selection below would be arbitrary
        feats.fillna(0, inplace=True)
        origin = np.zeros_like(feats.columns)
        feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
        feats.sort_values('importance', inplace=True, ascending=False)
        feats.drop(['importance'], inplace=True, axis=1)
        pcoa.features = feats[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    # path to the actual newick file
    with open(str(tree)) as file:
        t = parse_newick(file.readline())

    trim_tree = filter_unobserved_features_from_phylogeny
    viz = Empress(tree=t, table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata, ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_missing_features=filter_missing_features,
                  filter_unobserved_features_from_phylogeny=trim_tree)

    with open(os.path.join(output_dir, 'empress.html'), 'w') as file:
        file.write(str(viz))

    viz.copy_support_files(output_dir)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
def community_plot(output_dir: str, tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_tree: bool = True) -> None:
    """Visualize a tree together with community-level data.

    This offers everything tree_plot() does and more: sample metadata
    coloring / barplots, animations, and Emperor integration support.

    When ``pcoa`` carries feature coordinates, ``pcoa.features`` is replaced
    by the ``number_of_features`` rows farthest from the origin before the
    visualization is built and saved to ``output_dir``.
    """
    has_feature_coords = pcoa is not None and pcoa.features is not None
    if has_feature_coords:
        # Keep only the top N features ranked by vector magnitude (approach
        # copied from q2-emperor). All-zero axes can show up as all-NA
        # columns, so missing values are zeroed first.
        coords = pcoa.features.fillna(0)
        zero_point = np.zeros_like(coords.columns)
        magnitudes = coords.apply(euclidean, axis=1, args=(zero_point, ))
        ranked = (coords.assign(importance=magnitudes)
                  .sort_values('importance', ascending=False)
                  .drop(['importance'], axis=1))
        pcoa.features = ranked[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()
    feature_metadata = (None if feature_metadata is None
                        else feature_metadata.to_dataframe())

    viz = Empress(tree=get_bp(tree),
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_tree=shear_tree)
    save_viz(viz, output_dir)
def _validate_ordination_results(files, metadata, out_dir):
    """Check that every ordination sample id is present in the metadata.

    Parameters
    ----------
    files : dict
        Must hold the ordination results file path as the first entry of
        ``files['plain_text']``.
    metadata : iterable
        Iterated to collect the known sample ids.
    out_dir : object
        Not used by this validator.

    Returns
    -------
    tuple
        ``(True, [ArtifactInfo], "")`` when all ordination sample ids are
        found in the metadata, otherwise ``(False, None, error_message)``.
    """
    # Only one plain text file is expected: the ordination results, hence
    # the hard-coded index 0
    ord_res_fp = files['plain_text'][0]
    ordination = OrdinationResults.read(ord_res_fp)

    # Compare the sample ids of the ordination against the metadata ids
    ordination_ids = set(ordination.samples.index)
    known_ids = set(metadata)

    if ordination_ids.issubset(known_ids):
        artifact = ArtifactInfo(None, 'ordination_results',
                                [(ord_res_fp, 'plain_text')])
        return True, [artifact], ""

    return (False, None, "The ordination results contain samples not "
                         "present in the metadata")
def setUp(self):
    """Create the ordination fixtures and the metadata frame for the tests.

    Sets ``self.ordination_results`` (CA results with samples and
    features), ``self.df`` (metadata used by the plotting tests) and
    ``self.min_ord_results`` (minimal PCoA results paired with ``self.df``).
    """
    # In-memory CA results to serialize and deserialize.
    ca_axes = ['CA1', 'CA2']
    ca_eigvals = pd.Series([0.0961330159181, 0.0409418140138],
                           index=ca_axes)
    ca_features = pd.DataFrame(
        np.array([[0.408869425742, 0.0695518116298],
                  [-0.1153860437, -0.299767683538],
                  [-0.309967102571, 0.187391917117]]),
        index=['Species1', 'Species2', 'Species3'],
        columns=ca_axes)
    ca_samples = pd.DataFrame(
        np.array([[-0.848956053187, 0.882764759014],
                  [-0.220458650578, -1.34482000302],
                  [1.66697179591, 0.470324389808]]),
        index=['Site1', 'Site2', 'Site3'],
        columns=ca_axes)

    self.ordination_results = OrdinationResults(
        'CA', 'Correspondance Analysis', eigvals=ca_eigvals,
        samples=ca_samples, features=ca_features)

    # Metadata for the plot method:
    #   - 'categorical' mixes numbers and strings
    #   - 'numeric' mixes ints, floats and strings convertible to floats
    #   - 'nancolumn' is numeric with missing data (np.nan)
    metadata_rows = [['foo', '42', 10],
                     [22, 0, 8],
                     [22, -4.2, np.nan],
                     ['foo', '42.19', 11]]
    self.df = pd.DataFrame(metadata_rows,
                           index=['A', 'B', 'C', 'D'],
                           columns=['categorical', 'numeric', 'nancolumn'])

    # Minimal ordination results that keep the plotting tests simple;
    # their sample ids match the index of self.df.
    min_eigvals = np.array([0.50, 0.25, 0.25])
    min_samples = pd.DataFrame(
        np.array([[0.1, 0.2, 0.3],
                  [0.2, 0.3, 0.4],
                  [0.3, 0.4, 0.5],
                  [0.4, 0.5, 0.6]]),
        index=['A', 'B', 'C', 'D'],
        columns=['PC1', 'PC2', 'PC3'])

    self.min_ord_results = OrdinationResults(
        'PCoA', 'Principal Coordinate Analysis', min_eigvals, min_samples)
def create_emperor_visual(args, pcfile):
    """Write a standalone Emperor view for an ordination (.pc) file.

    The metadata is loaded from ``args.map_fp`` and the ordination from
    ``pcfile``. The page is written to
    ``<args.basedir>/views/tmp/<args.prefix>_pcoa3d/index.html`` and the
    Emperor support files are copied into the same directory, which is
    created when missing.

    Sample .pc file::

        Eigvals 4
        0.2705559825337763 0.07359266496720843 0.02997793703738496 0.0

        Proportion explained 4
        0.7231669539538659 0.19670525434062255 0.0801277917055116 0.0

        Species 0 0

        Site 4 4
        ICM_LCY_Bv6--LCY_0001_2003_05_11 -0.04067063044757823 -0.09380781760926289 0.13680474645584195 0.0
        ICM_LCY_Bv6--LCY_0003_2003_05_04 -0.11521436634022217 -0.15957409396683217 -0.10315005726535573 0.0
        ICM_LCY_Bv6--LCY_0005_2003_05_16 0.4268532792747924 0.06657577342833808 -0.02212569426459717 0.0
        ICM_LCY_Bv6--LCY_0007_2003_05_04 -0.2709682824869916 0.18680613814775715 -0.011528994925888972 0.0

        Biplot 0 0

        Site constraints 0 0
    """
    from emperor import Emperor
    from skbio import OrdinationResults

    # load metadata
    mapping = load_mf(args.map_fp)

    # The ordination must be read from a file (scikit-bio version 0.5.1,
    # http://scikit-bio.org/docs/0.5.1/generated/generated/skbio.stats.ordination.OrdinationResults.html)
    ordination = OrdinationResults.read(pcfile)
    viewer = Emperor(ordination, mapping)

    folder_name = args.prefix + '_pcoa3d'
    pcoa_outdir = os.path.join(args.basedir, 'views', 'tmp', folder_name)
    print('OUT?', pcoa_outdir, args.basedir)
    os.makedirs(pcoa_outdir, exist_ok=True)

    index_path = os.path.join(pcoa_outdir, 'index.html')
    with open(index_path, 'w') as page:
        page.write(viewer.make_emperor(standalone=True))
        viewer.copy_support_files(pcoa_outdir)
def setUp(self):
    """Read ``unweighted_unifrac_pc.txt`` into ``self.test_matrix``."""
    fixture_path = get_data_path('unweighted_unifrac_pc.txt')
    self.test_matrix = OrdinationResults.read(fixture_path)
def _1(data: skbio.OrdinationResults) -> OrdinationFormat:
    """Write ``data`` in 'ordination' format to a new ``OrdinationFormat``.

    The ordination is written to ``str()`` of the freshly created format
    object, which is then returned.
    """
    ordination_file = OrdinationFormat()
    destination = str(ordination_file)
    data.write(destination, format='ordination')
    return ordination_file
def test_assert_ordination_results_equal(self):
    """Exercise ``assert_ordination_results_equal`` on minimal results.

    Covers identity, a type mismatch, almost-equal eigenvalues, and, for
    every optional numeric attribute, the three cases "present in only one
    object", "present in both but different" and "present in both and
    almost equal".
    """
    minimal1 = OrdinationResults('foo', 'bar', pd.Series([1.0, 2.0]),
                                 pd.DataFrame([[1, 2, 3], [4, 5, 6]]))

    # a minimal set of results should be equal to itself
    assert_ordination_results_equal(minimal1, minimal1)

    # type mismatch
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, 'foo')

    # numeric values should be checked that they're almost equal
    almost_minimal1 = OrdinationResults(
        'foo', 'bar',
        pd.Series([1.0000001, 1.9999999]),
        pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
    assert_ordination_results_equal(minimal1, almost_minimal1)

    # test each of the optional numeric attributes
    for attr in ('features', 'samples', 'biplot_scores',
                 'sample_constraints'):
        # missing optional numeric attribute in one, present in the other
        setattr(almost_minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, but not almost equal
        setattr(minimal1, attr, pd.DataFrame([[1, 2], [3, 4]]))
        setattr(almost_minimal1, attr, pd.DataFrame([[1, 2],
                                                     [3.00002, 4]]))
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, and almost equal
        setattr(minimal1, attr, pd.DataFrame([[1.0, 2.0], [3.0, 4.0]]))
        setattr(almost_minimal1, attr,
                pd.DataFrame([[1.0, 2.0], [3.00000002, 4]]))
        assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)

    # missing optional numeric attribute in one, present in the other
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3])
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    almost_minimal1.proportion_explained = None

    # optional numeric attributes present in both, but not almost equal
    minimal1.proportion_explained = pd.Series([1, 2, 3])
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00002])
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    # reset BOTH objects, mirroring the cleanup done in the loop above
    minimal1.proportion_explained = None
    almost_minimal1.proportion_explained = None

    # optional numeric attributes present in both, and almost equal
    minimal1.proportion_explained = pd.Series([1, 2, 3])
    almost_minimal1.proportion_explained = pd.Series([1, 2, 3.00000002])
    assert_ordination_results_equal(minimal1, almost_minimal1)
    minimal1.proportion_explained = None
    almost_minimal1.proportion_explained = None
class TestOrdinationResults(unittest.TestCase):
    """Tests for the string form, plotting helpers and image
    representations of ``OrdinationResults``."""

    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = pd.Series([0.0961330159181, 0.0409418140138], ['CA1', 'CA2'])
        features = np.array([[0.408869425742, 0.0695518116298],
                             [-0.1153860437, -0.299767683538],
                             [-0.309967102571, 0.187391917117]])
        samples = np.array([[-0.848956053187, 0.882764759014],
                            [-0.220458650578, -1.34482000302],
                            [1.66697179591, 0.470324389808]])
        features_ids = ['Species1', 'Species2', 'Species3']
        sample_ids = ['Site1', 'Site2', 'Site3']

        samples_df = pd.DataFrame(samples, index=sample_ids,
                                  columns=['CA1', 'CA2'])
        features_df = pd.DataFrame(features, index=features_ids,
                                   columns=['CA1', 'CA2'])

        self.ordination_results = OrdinationResults(
            'CA', 'Correspondance Analysis', eigvals=eigvals,
            samples=samples_df, features=features_df)

        # DataFrame for testing plot method. Has a categorical column with a
        # mix of numbers and strings. Has a numeric column with a mix of ints,
        # floats, and strings that can be converted to floats. Has a numeric
        # column with missing data (np.nan).
        self.df = pd.DataFrame([['foo', '42', 10],
                                [22, 0, 8],
                                [22, -4.2, np.nan],
                                ['foo', '42.19', 11]],
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for easier testing of plotting method.
        # Paired with df above.
        eigvals = np.array([0.50, 0.25, 0.25])
        samples = np.array([[0.1, 0.2, 0.3],
                            [0.2, 0.3, 0.4],
                            [0.3, 0.4, 0.5],
                            [0.4, 0.5, 0.6]])
        samples_df = pd.DataFrame(samples, ['A', 'B', 'C', 'D'],
                                  ['PC1', 'PC2', 'PC3'])

        self.min_ord_results = OrdinationResults(
            'PCoA', 'Principal Coordinate Analysis', eigvals, samples_df)

    def test_str(self):
        exp = ("Ordination results:\n"
               "\tMethod: Correspondance Analysis (CA)\n"
               "\tEigvals: 2\n"
               "\tProportion explained: N/A\n"
               "\tFeatures: 3x2\n"
               "\tSamples: 3x2\n"
               "\tBiplot Scores: N/A\n"
               "\tSample constraints: N/A\n"
               "\tFeature IDs: 'Species1', 'Species2', 'Species3'\n"
               "\tSample IDs: 'Site1', 'Site2', 'Site3'")
        obs = str(self.ordination_results)
        self.assertEqual(obs, exp)

        # all optional attributes missing
        exp = ("Ordination results:\n"
               "\tMethod: Principal Coordinate Analysis (PCoA)\n"
               "\tEigvals: 1\n"
               "\tProportion explained: N/A\n"
               "\tFeatures: N/A\n"
               "\tSamples: 2x1\n"
               "\tBiplot Scores: N/A\n"
               "\tSample constraints: N/A\n"
               "\tFeature IDs: N/A\n"
               "\tSample IDs: 0, 1")
        samples_df = pd.DataFrame(np.array([[1], [2]]))
        obs = str(OrdinationResults('PCoA', 'Principal Coordinate Analysis',
                                    pd.Series(np.array([4.2])), samples_df))
        # compare line by line so a failure points at the differing line
        self.assertEqual(obs.split('\n'), exp.split('\n'))

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel, exp_ylabel,
                                  exp_zlabel):
        """Shared assertions on a figure returned by the plot method.

        Checks the figure type, the number of subplots, and on the first
        axes: the title, that tick labels are empty, whether a legend
        exists, and the x/y/z axis labels.
        """
        # check type
        assert_is_instance(fig, mpl.figure.Figure)

        # check number of subplots
        axes = fig.get_axes()
        npt.assert_equal(len(axes), exp_num_subplots)

        # check title
        ax = axes[0]
        npt.assert_equal(ax.get_title(), exp_title)

        # shouldn't have tick labels
        for tick_label in (ax.get_xticklabels() + ax.get_yticklabels() +
                           ax.get_zticklabels()):
            npt.assert_equal(tick_label.get_text(), '')

        # check if legend is present
        legend = ax.get_legend()
        if exp_legend_exists:
            assert_true(legend is not None)
        else:
            assert_true(legend is None)

        # check axis labels
        npt.assert_equal(ax.get_xlabel(), exp_xlabel)
        npt.assert_equal(ax.get_ylabel(), exp_ylabel)
        npt.assert_equal(ax.get_zlabel(), exp_zlabel)

    def test_plot_no_metadata(self):
        fig = self.min_ord_results.plot()
        self.check_basic_figure_sanity(fig, 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'numeric', axes=(1, 0, 2),
            axis_labels=['PC 2', 'PC 1', 'PC 3'], title='a title', cmap='Reds')
        self.check_basic_figure_sanity(
            fig, 2, 'a title', False, 'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'categorical', axes=[2, 0, 1], title='a title',
            cmap='Accent')
        self.check_basic_figure_sanity(fig, 1, 'a title', True, '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        with six.assertRaisesRegex(self, ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=('a', 'b', 'c', 'd'))

    def test_validate_plot_axes_valid_input(self):
        # shouldn't raise an error on valid input. nothing is returned, so
        # nothing to check here
        samples = self.min_ord_results.samples.values.T
        self.min_ord_results._validate_plot_axes(samples, (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        # Patterns containing backslash escapes are raw strings: `\(` and
        # `\[` are not valid escapes in a normal string literal.

        # not enough dimensions
        with six.assertRaisesRegex(self, ValueError, r'2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.samples.values.T

        # wrong number of axes
        with six.assertRaisesRegex(self, ValueError,
                                   'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with six.assertRaisesRegex(self, ValueError,
                                   'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with six.assertRaisesRegex(self, ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 1, 0))

        # out of range axes
        with six.assertRaisesRegex(self, ValueError, r'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, -1, 2))
        with six.assertRaisesRegex(self, ValueError, r'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix, (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        # column provided without df
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(None, 'numeric',
                                                        ['B', 'C'], 'jet')

        # df provided without column
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(self.df, None,
                                                        ['B', 'C'], 'jet')

        # column not in df
        with six.assertRaisesRegex(self, ValueError, 'missingcol'):
            self.min_ord_results._get_plot_point_colors(self.df, 'missingcol',
                                                        ['B', 'C'], 'jet')

        # id not in df
        with six.assertRaisesRegex(self, ValueError, 'numeric'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ['B', 'C', 'missingid', 'A'], 'jet')

        # missing data in df
        with six.assertRaisesRegex(self, ValueError, 'nancolumn'):
            self.min_ord_results._get_plot_point_colors(self.df, 'nancolumn',
                                                        ['B', 'C', 'A'], 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        obs = self.min_ord_results._get_plot_point_colors(None, None,
                                                          ['B', 'C'], 'jet')
        npt.assert_equal(obs, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # subset of the ids in df
        exp = [0.0, -4.2, 42.0]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

        # all ids in df
        exp = [0.0, 42.0, 42.19, -4.2]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        # subset of the ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0., 0., 0.5, 1.], [0.5, 0., 0., 1.]]
        exp_color_dict = {
            'foo': [0.5, 0., 0., 1.],
            22: [0., 0., 0.5, 1.]
        }
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        npt.assert_equal(obs[1], exp_color_dict)

        # all ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0.5, 0., 0., 1.], [0.5, 0., 0., 1.],
                      [0., 0., 0.5, 1.]]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        # should get same color dict as before
        npt.assert_equal(obs[1], exp_color_dict)

    def test_plot_categorical_legend(self):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        # we shouldn't have a legend yet
        assert_true(ax.get_legend() is None)

        self.min_ord_results._plot_categorical_legend(
            ax, {'foo': 'red', 'bar': 'green'})

        # make sure we have a legend now
        legend = ax.get_legend()
        assert_true(legend is not None)

        # do some light sanity checking to make sure our input labels and
        # colors are present. we're not using nose.tools.assert_items_equal
        # because it isn't available in Python 3.
        labels = [t.get_text() for t in legend.get_texts()]
        npt.assert_equal(sorted(labels), ['bar', 'foo'])

        colors = [line.get_color() for line in legend.get_lines()]
        npt.assert_equal(sorted(colors), ['green', 'red'])

    def test_repr_png(self):
        obs = self.min_ord_results._repr_png_()
        assert_is_instance(obs, binary_type)
        assert_true(len(obs) > 0)

    def test_repr_svg(self):
        obs = self.min_ord_results._repr_svg_()
        # print_figure(format='svg') can return text or bytes depending on the
        # version of IPython
        assert_true(isinstance(obs, text_type) or isinstance(obs, binary_type))
        assert_true(len(obs) > 0)

    def test_png(self):
        assert_is_instance(self.min_ord_results.png, Image)

    def test_svg(self):
        assert_is_instance(self.min_ord_results.svg, SVG)