def community_plot(
    tree: str,
    table: str,
    sample_metadata: str,
    output_dir: str,
    pcoa: str,
    feature_metadata: str,
    ignore_missing_samples: bool,
    filter_extra_samples: bool,
    filter_missing_features: bool,
    number_of_pcoa_features: int,
    shear_to_table: bool,
) -> None:
    """Build an Empress visualization from files and save it to disk.

    The tree and feature metadata are handed to ``check_and_process_files``
    (together with ``output_dir``), the feature table is loaded with
    ``load_table`` and the sample metadata is read as a tab-separated file
    whose first column is the index.  When ``pcoa`` is not ``None`` it is
    read with ``OrdinationResults.read`` and passed through ``prepare_pcoa``
    along with ``number_of_pcoa_features``.

    The remaining boolean flags are forwarded unchanged to ``Empress``.

    ``output_dir`` is created with ``os.makedirs`` (no ``exist_ok``), so an
    already-existing directory raises ``FileExistsError`` at that point.
    """
    tree_newick, fm = check_and_process_files(
        output_dir, tree, feature_metadata
    )
    feature_table = load_table(table)
    sample_md = pd.read_csv(sample_metadata, sep="\t", index_col=0)

    # The ordination is optional; only parse and prepare it when provided.
    ordination = pcoa
    if ordination is not None:
        ordination = prepare_pcoa(
            OrdinationResults.read(ordination), number_of_pcoa_features
        )

    empress_options = dict(
        table=feature_table,
        sample_metadata=sample_md,
        feature_metadata=fm,
        ordination=ordination,
        ignore_missing_samples=ignore_missing_samples,
        filter_extra_samples=filter_extra_samples,
        filter_missing_features=filter_missing_features,
        shear_to_table=shear_to_table,
    )
    viz = Empress(tree_newick, **empress_options)

    os.makedirs(output_dir)
    save_viz(viz, output_dir, q2=False)
def parse_coords(lines):
    """Parse an ordination results file into its main components.

    The input is first handed to skbio's ``OrdinationResults.read``.  If
    that raises ``FileFormatError`` the input is rewound (when it supports
    ``seek``) and parsed with ``qiime_parse_coords`` instead.

    Returns (from the skbio parser):
    - the sample labels, in order (``site_ids``)
    - the sample coordinates (``site``)
    - the eigenvalues (``eigvals``)
    - the proportion explained per axis (``proportion_explained``)

    On the fallback path, whatever ``qiime_parse_coords`` returns is
    returned unchanged.
    """
    try:
        ordination = OrdinationResults.read(lines)
    except FileFormatError:
        # The failed attempt may have consumed a file-like input; rewind it
        # before handing it to the fallback parser.
        try:
            lines.seek(0)
        except AttributeError:
            # A plain list of lines has no seek(); nothing to rewind.
            pass
        return qiime_parse_coords(lines)

    return (
        ordination.site_ids,
        ordination.site,
        ordination.eigvals,
        ordination.proportion_explained,
    )
def test_io(self):
    """Round-trip ordination results through ``write``/``read``.

    Very basic check that the read/write public API is present and appears
    to be functioning: serialize from memory to an in-memory file, read it
    back, and ensure the result matches the original.
    """
    fh = StringIO()
    self.ordination_results.write(fh)
    fh.seek(0)
    deserialized = OrdinationResults.read(fh)

    assert_ordination_results_equal(deserialized, self.ordination_results)
    # Exact-type check (subclasses are not accepted).  assertIs reports both
    # types on failure, unlike assertTrue(type(...) == ...), which only
    # reports "False is not true".
    self.assertIs(type(deserialized), OrdinationResults)
def setUp(self):
    """Create the ordination and mapping-file fixtures used by the tests."""
    # Ordination parsed from the module-level PCOA_STRING fixture.
    self.ord_res = OrdinationResults.read(StringIO(PCOA_STRING))

    # Column names for the rows in self.data.
    self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']

    # One row per sample, in the same column order as self.headers.
    sample_rows = (
        ('PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'),
        ('PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'),
        ('PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'),
        ('PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'),
        ('PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'),
        ('PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'),
        ('PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'),
        ('PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'),
        ('PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'),
    )
    self.data = [list(row) for row in sample_rows]
def parse_coords(lines):
    """Parse skbio's ordination results file into its main components.

    The input is read with ``skbio.stats.ordination.OrdinationResults.read``
    (see that method for the file format) and the relevant attributes of
    the parsed object are returned.

    Returns:
    - the sample labels, in order (``site_ids``)
    - the sample coordinates (``site``)
    - the eigenvalues (``eigvals``)
    - the proportion explained per axis (``proportion_explained``)
    """
    ordination = OrdinationResults.read(lines)

    sample_labels = ordination.site_ids
    sample_coords = ordination.site
    eigenvalues = ordination.eigvals
    pct_explained = ordination.proportion_explained

    return sample_labels, sample_coords, eigenvalues, pct_explained
if __name__ == '__main__':
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Unpack the command line options used by this script.
    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results.
    # NOTE(review): open() mode 'U' was removed in Python 3.11; it is kept
    # as-is here so behavior is unchanged on interpreters that accept it.
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file into a DataFrame with one row per top-level key
    # of the parsed dict.
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category must be a column of the mapping file.
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category, not the whole list of
            # requested categories.
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
def get_procrustes_results(
        coords_f1, coords_f2, sample_id_map=None, randomize=None,
        max_dimensions=None, get_eigenvalues=get_mean_eigenvalues,
        get_percent_variation_explained=get_mean_percent_variation):
    """Fit two ordinations to each other with ``procrustes``.

    Both inputs are parsed with ``OrdinationResults.read``.  Only the sample
    IDs present in both (after the optional ID mapping) are kept, and both
    coordinate sets are reordered to that shared list before fitting.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs accepted by ``OrdinationResults.read``.
    sample_id_map : optional
        When truthy, each list of sample IDs is passed through
        ``map_sample_ids`` with this map before the shared IDs are computed.
    randomize : callable, optional
        When truthy, called as ``randomize(coords2)`` on the reordered second
        coordinate set; the result replaces ``coords2`` for the fit and is
        also returned wrapped in an ``OrdinationResults``.
    max_dimensions : int, optional
        When truthy, both coordinate sets go through
        ``filter_coords_matrix`` with this value and the eigenvalue and
        proportion-explained vectors are sliced to their first
        ``max_dimensions`` entries.  Otherwise, if the proportion-explained
        vectors differ in length, the shorter one (and the eigenvalues of
        the same input) is padded with zeros.
    get_eigenvalues, get_percent_variation_explained : callable
        Each is called with the two corresponding vectors and returns the
        single vector stored on both transformed results.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``.  The first two are ``OrdinationResults``
        holding the fitted coordinates, ``m_squared`` is the third value
        returned by ``procrustes``, and ``randomized_coords2`` is ``None``
        unless ``randomize`` was given.

    Raises
    ------
    ValueError
        If the two inputs have no sample IDs in common.
    """
    # Parse the PCoA files
    first = OrdinationResults.read(coords_f1)
    second = OrdinationResults.read(coords_f2)

    ids1, coords1 = first.site_ids, first.site
    eigvals1, pct_var1 = first.eigvals, first.proportion_explained
    ids2, coords2 = second.site_ids, second.site
    eigvals2, pct_var2 = second.eigvals, second.proportion_explained

    if sample_id_map:
        ids1 = map_sample_ids(ids1, sample_id_map)
        ids2 = map_sample_ids(ids2, sample_id_map)

    # Put both coordinate sets in the same sample order: the samples that
    # appear in both inputs.
    shared_ids = list(set(ids1) & set(ids2))
    coords1 = reorder_coords(coords1, ids1, shared_ids)
    coords2 = reorder_coords(coords2, ids2, shared_ids)
    if not shared_ids:
        raise ValueError('No overlapping samples in the two files')

    # For a random trial, shuffle the second coordinate set with the
    # caller-supplied function and keep a copy of what was fitted.
    randomized_coords2 = None
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(
            eigvals=eigvals2,
            proportion_explained=pct_var2,
            site=coords2,
            site_ids=shared_ids)

    coords1, coords2 = pad_coords_matrices(coords1, coords2)

    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1, pct_var2 = (pct_var1[:max_dimensions],
                              pct_var2[:max_dimensions])
        eigvals1, eigvals2 = (eigvals1[:max_dimensions],
                              eigvals2[:max_dimensions])
    else:
        # Zero-pad whichever input has fewer axes so the vectors line up.
        n_pct1, n_pct2 = len(pct_var1), len(pct_var2)
        if n_pct1 > n_pct2:
            pct_var2 = append(pct_var2, zeros(n_pct1 - n_pct2))
            eigvals2 = append(eigvals2,
                              zeros(len(eigvals1) - len(eigvals2)))
        elif n_pct1 < n_pct2:
            pct_var1 = append(pct_var1, zeros(n_pct2 - n_pct1))
            eigvals1 = append(eigvals1,
                              zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    fitted1, fitted2, m_squared = procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    def as_ordination(fitted):
        # Both outputs share the combined eigenvalues / proportions and the
        # common sample order; only the fitted coordinates differ.
        return OrdinationResults(
            eigvals=asarray(eigvals),
            proportion_explained=asarray(pct_var),
            site=asarray(fitted),
            site_ids=shared_ids)

    return (as_ordination(fitted1), as_ordination(fitted2), m_squared,
            randomized_coords2)
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Load the QIIME 2 moving pictures tutorial data for visualization.

    The data is expected to live in the directory named by the PREFIX_DIR
    global (docs/moving-pictures/), resolved relative to where this function
    is run from.  If that directory or the files in it cannot be accessed,
    this function will (probably) break.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, the artifacts are loaded through the QIIME 2 Artifact API
        and the returned objects have the first type listed for each entry
        below (before the | character).  If False, the artifacts are
        extracted and read without QIIME 2's APIs, and the returned objects
        have the second type listed (after the | character).
    is_empire: bool, optional (default True)
        If True, an ordination is loaded and returned.  If False, None is
        returned in place of the ordination.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata.  (This is stored in the repository as a
            FeatureData[Taxonomy] artifact; it is viewed as Metadata when
            use_artifact_api is True.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
            Ordination, or None when is_empire is False.
    """
    def in_prefix(filename):
        return os.path.join(PREFIX_DIR, filename)

    tree_qza = in_prefix("rooted-tree.qza")
    table_qza = in_prefix("table.qza")
    pcoa_qza = in_prefix("unweighted_unifrac_pcoa_results.qza")
    taxonomy_qza = in_prefix("taxonomy.qza")
    md_loc = in_prefix("sample_metadata.tsv")

    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(tree_qza)
        table = Artifact.load(table_qza)
        pcoa = None
        if is_empire:
            pcoa = Artifact.load(pcoa_qza)
        md = Metadata.load(md_loc)
        # The taxonomy QZA has to be viewed as Metadata explicitly.
        fmd = Artifact.load(taxonomy_qza).view(Metadata)
        return tree, table, md, fmd, pcoa

    import biom
    import pandas as pd
    from skbio.stats.ordination import OrdinationResults
    from skbio.tree import TreeNode

    # The extracted files live in a temporary directory, so everything that
    # reads them has to happen before the context manager exits.
    with tempfile.TemporaryDirectory() as _tmp:
        def extract(qza_loc, member):
            return extract_q2_artifact_to_path(_tmp, qza_loc, member)

        tree = TreeNode.read(extract(tree_qza, "tree.nwk"))
        table = biom.load_table(extract(table_qza, "feature-table.biom"))

        pcoa = None
        if is_empire:
            pcoa = OrdinationResults.read(
                extract(pcoa_qza, "ordination.txt")
            )

        fmd = pd.read_csv(
            extract(taxonomy_qza, "taxonomy.tsv"), sep="\t", index_col=0
        )

    # Row 1 of the sample metadata file is skipped when reading it.
    md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
distances[dataset_][(fold_, Nsamp_)]['Bray_Curtis'] = table_ table_ = pd.read_table(os.path.join(subpath_, sub_set, 'Robust_Aitchison_Distance.tsv'), index_col=0, low_memory=False) table_.index = table_.index.astype(str) table_.columns = table_.columns.astype(str) table_ = table_.reindex(index=index_me, columns=index_me) distances[dataset_][(fold_, Nsamp_)]['Robust_Aitchison'] = table_ # ordination type file in_ord = os.path.join(subpath_, sub_set, 'RPCA_Ordination.txt') # get loadings from ordination files ordinations[dataset_][( fold_, Nsamp_)]['RPCA_Samples'] = OrdinationResults.read(in_ord).samples ordinations[dataset_][( fold_, Nsamp_)]['RPCA_Features'] = OrdinationResults.read(in_ord).features # permanova analysis from skbio import DistanceMatrix from skbio.stats.distance import permanova both_perm_res = {} perm_res = {} perm_res_tmp = {} for dataset_, subs in distances.items(): perm_res[dataset_] = {} perm_res_tmp[dataset_] = {} for (fold_, Nsamp_), methods_ in subs.items():
def get_pair_cmds(self, omics_pairs):
    """Assemble the command string for every entry of ``self.mmvec_res``.

    For each (keys, values) entry, the ordination file is read, the
    correlations returned by ``get_pc_sb_correlations`` are collected, and a
    single command string is built by concatenating the outputs of
    ``get_biplot_commands``, ``get_xmmvec_commands`` and
    ``get_paired_heatmaps_command``.  That string is appended to
    ``self.cmds[pair]``.

    Parameters
    ----------
    omics_pairs
        Forwarded to ``get_order_omics``.

    Returns
    -------
    list
        One item per entry of ``self.mmvec_res``: the first value returned
        by ``get_pc_sb_correlations`` for that entry.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source; in particular, confirm that the
    ``get_xmmvec_commands`` call belongs after (not inside) the ``crowded``
    loop.
    """
    # Two biplot passes per entry: 0 = fixed feature count, 1 = "crowded"
    # (all ordination features); see the loop further down.
    crowdeds = [0, 1]
    pc_sb_correlations = []
    for keys, values in self.mmvec_res.items():
        # `sams` and `mmvec` are unpacked but not used in this method.
        pair, case, omic1, omic2, filt1, filt2, sams, mmvec = keys
        ranks_fp, ordi_fp, meta_fp, omic1_common, omic2_common = values
        # get_order_omics returns a sequence whose first eight items replace
        # the omic/filter names and provide the feature/sample omics.
        order_omics = get_order_omics(omic1, omic2, filt1, filt2, case,
                                      omics_pairs)
        omic1 = order_omics[0]
        omic2 = order_omics[1]
        filt1 = order_omics[2]
        filt2 = order_omics[3]
        omic_feature = order_omics[4]
        omic_sample = order_omics[5]
        # The next two are assigned but not used in this method.
        omic_microbe = order_omics[6]
        omic_metabolite = order_omics[7]

        # get differentials
        # The two lookups use the same key with the omic/filter pairs
        # swapped.
        meta1, meta_pd1, diff_cols1 = self.metas[(pair, case, omic1,
                                                  filt1, omic2, filt2)]
        meta2, meta_pd2, diff_cols2 = self.metas[(pair, case, omic2,
                                                  filt2, omic1, filt1)]

        # features are biplot, samples are dots
        ordi = OrdinationResults.read(ordi_fp)

        cur_pc_sb_correlations, max_r = get_pc_sb_correlations(
            pair, case, ordi, omic1, omic2, filt1, filt2, diff_cols1,
            meta_pd1, diff_cols2, meta_pd2, meta_fp, omic1_common,
            omic2_common, ranks_fp)
        pc_sb_correlations.append(cur_pc_sb_correlations)

        cmd = ''
        # Extra biplots for the highlights configured for this pair: one per
        # highlight for which edit_ordi_qzv reports a non-zero n_edit.
        if pair in self.highlights:
            pair_highlights = self.highlights[pair]
            for highlight, regexes_list in pair_highlights.items():
                n_edit, meta_edit, ordi_edit_fp = edit_ordi_qzv(
                    ordi, ordi_fp, highlight, regexes_list, meta1, meta_pd1)
                if n_edit:
                    qza, qzv = get_qzs(ordi_edit_fp)
                    cmd += get_biplot_commands(ordi_edit_fp, qza, qzv,
                                               omic_feature, omic_sample,
                                               meta_edit, meta2, n_edit,
                                               max_r)
        # Back to the unedited ordination for the default biplots.
        ordi_edit_fp = ordi_fp
        qza, qzv = get_qzs(ordi_edit_fp)
        for crowded in crowdeds:
            if crowded:
                # Crowded pass: use every feature in the ordination and
                # write to a separate "_crowded" output name.  This runs
                # after the non-crowded pass (crowdeds is [0, 1]), so the
                # first pass sees the unmodified qzv name.
                n_ordi_feats = ordi.features.shape[0]
                qzv = qzv.replace('.qzv', '_crowded.qzv')
            else:
                n_ordi_feats = 15
                # heat_qza, heat_qzv = get_heatmap_qzs(ranks_fp)
                # cmd += get_heatmap_commands(
                #     ranks_fp, heat_qza, heat_qzv, meta1,
                #     meta2, meta_pd1, meta_pd2)
            cmd += get_biplot_commands(ordi_edit_fp, qza, qzv, omic_feature,
                                       omic_sample, meta1, meta2,
                                       n_ordi_feats, max_r)
        cmd += get_xmmvec_commands(ordi_edit_fp, omic1, omic2, meta1, meta2,
                                   self.xmmvecs, pair)

        topn = 5
        # features_names is always empty here, so as written only the
        # "top<topn>" output name below is ever used.
        features_names = []
        if features_names:
            heat = '%s_paired_heatmaps_custom.qzv' % splitext(ranks_fp)[0]
        else:
            heat = '%s_paired_heatmaps_top%s.qzv' % (splitext(ranks_fp)[0],
                                                     topn)
        cmd += get_paired_heatmaps_command(ranks_fp, omic1_common,
                                           omic2_common, meta1,
                                           features_names, topn, heat)
        # Commands are grouped per pair; a pair can receive several entries.
        self.cmds.setdefault(pair, []).append(cmd)
    return pc_sb_correlations