def differentialtest(table: biom.Table, metadata: qiime2.Metadata,
                     variable: str,
                     taxonomy: TSVTaxonomyFormat) -> pd.DataFrame:
    """Run ``differentialtest.R`` on a feature table and load its output.

    The table (as TSV) and the metadata are written into a temporary
    directory, the R script is launched through ``run_commands`` and the
    tab-separated file it was asked to produce is read back.

    Parameters
    ----------
    table : biom.Table
        Feature table; must not be empty.
    metadata : qiime2.Metadata
        Metadata saved next to the table for the script to read.
    variable : str
        Passed (stringified) as the script's fourth argument.
    taxonomy : TSVTaxonomyFormat
        Passed (stringified) as the script's third argument.

    Returns
    -------
    pd.DataFrame
        Parsed script output with the index named ``'Feature ID'``.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as workdir:
        table_fp, metadata_fp, result_fp = (
            os.path.join(workdir, name)
            for name in ('table.tsv', 'metadata.tsv', 'data.tsv'))

        with open(table_fp, 'w') as table_fh:
            table_fh.write(table.to_tsv())
        metadata.save(metadata_fp)

        script_call = [
            'differentialtest.R',
            table_fp,
            metadata_fp,
            str(taxonomy),
            str(variable),
            str(result_fp),
        ]
        run_commands([script_call])

        # The result has to be loaded before the directory is cleaned up.
        result = pd.read_csv(result_fp, sep='\t')

    result.index.name = 'Feature ID'
    return result
def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    """Subsample ``subsampling_depth`` IDs from one axis of ``table``.

    Parameters
    ----------
    table : biom.Table
        Table to subsample.
    subsampling_depth : int
        Number of IDs to keep on the chosen axis.
    axis : str
        ``'feature'`` subsamples features; any other value subsamples the
        table's default (sample) axis.

    Returns
    -------
    biom.Table
        Subsampled table with zero-sum entries on the opposite axis
        removed.

    Raises
    ------
    ValueError
        If the depth exceeds the number of IDs on the axis, or if the
        resulting table is empty.
    """
    by_feature = axis == 'feature'

    # biocore/biom-format#759: subsampling is always done along 'sample',
    # so feature subsampling works on a transposed table.
    working = table.transpose() if by_feature else table

    available = len(working.ids())
    if available < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % available)

    working = working.subsample(subsampling_depth, axis='sample',
                                by_id=True)

    # Because of the transpose, the opposite axis is always 'observation';
    # this filter modifies ``working`` in place.
    working.filter(lambda values, id_, md: values.sum() > 0,
                   axis='observation')

    if by_feature:
        # undo the transpose applied for biocore/biom-format#759
        working = working.transpose()

    if working.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')
    return working
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is cast to ``int`` and transposed
        before being handed to scikit-bio.
    phylogeny : skbio.TreeNode
        Tree forwarded as ``tree``.
    metric : str
        Must be a member of ``phylogenetic_metrics()``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric, empty table, or ``n_jobs != 1`` combined with
        ``'weighted_unifrac'``.
    skbio.tree.MissingNodeError
        Re-raised with 'otu_ids'/'tree' reworded to
        'feature_ids'/'phylogeny'.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if metric == 'weighted_unifrac' and n_jobs != 1:
        raise ValueError("Weighted UniFrac is not parallelizable")

    diversity_kwargs = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)

    try:
        return skbio.diversity.beta_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as err:
        # Translate scikit-bio's argument names into this function's terms.
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        Must be a member of ``non_phylogenetic_metrics()``.
        ``'aitchison'`` is handled by a local callable.
    pseudocount : int, optional
        Added to every count, only when ``metric`` is ``'aitchison'``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    def aitchison(x, y, **kwds):
        # Euclidean distance between the clr-transformed vectors.
        return euclidean(clr(x), clr(y))

    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    use_aitchison = metric == 'aitchison'
    if use_aitchison:
        counts += pseudocount
    distance = aitchison if use_aitchison else metric

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=distance,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def SRS(table: biom.Table, c_min: int, set_seed: bool = True,
        seed: int = 1) -> biom.Table:
    """Run the ``SRS.R`` script on a feature table.

    The table is written as TSV into a temporary directory, the script is
    invoked through ``run_commands`` and the same path is then read back
    (presumably the script overwrites its input file -- confirm against
    ``SRS.R``).

    Parameters
    ----------
    table : biom.Table
        Feature table; must not be empty.
    c_min : int
        Passed (stringified) as the script's second argument.
    set_seed : bool, optional
        Passed (stringified) as the script's third argument.
    seed : int, optional
        Passed (stringified) as the script's fourth argument.

    Returns
    -------
    biom.Table
        Table built from the values, index and columns of the file read
        back after the script ran.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as workdir:
        table_fp = os.path.join(workdir, 'table.tsv')
        with open(table_fp, 'w') as out:
            out.write(table.to_tsv())

        script_args = [str(arg) for arg in (c_min, set_seed, seed)]
        run_commands([['SRS.R', table_fp, *script_args]])

        # Read from the very path that was handed to the script.
        normalized = pd.read_csv(table_fp, sep='\t')

    return biom.Table(data=normalized.values,
                      observation_ids=normalized.index,
                      sample_ids=normalized.columns)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return pairwise sample distances for a non-phylogenetic metric.

    Parameters
    ----------
    table : biom.Table
        Feature table whose dense, transposed matrix supplies the counts.
    metric : str
        A member of ``non_phylogenetic_metrics()``; ``'aitchison'`` is
        replaced by a callable defined here.
    pseudocount : int, optional
        Added to the counts for ``'aitchison'`` only.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    if metric == 'aitchison':
        def aitchison(x, y, **kwds):
            return euclidean(clr(x), clr(y))

        # Shift the counts in place before handing over the callable.
        counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    sample_ids = table.ids(axis='sample')
    pairwise = sklearn.metrics.pairwise_distances
    return skbio.diversity.beta_diversity(metric=metric, counts=counts,
                                          ids=sample_ids, validate=True,
                                          pairwise_func=pairwise,
                                          n_jobs=n_jobs)
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn,
          mode: str) -> biom.Table:
    """Collapse IDs on one axis of ``table`` by their metadata value.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is used as the biom axis unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column supplying the group value of each ID.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table, ordered by first appearance of each group value in
        the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def group_of(axis_id, _):
        return metadata.get_value(axis_id)

    collapsed = table.collapse(group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Ordering by first unique appearance keeps identity mappings stable
    # and makes the result easier to test.
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return pairwise sample distances for a phylogenetic metric.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    phylogeny : skbio.TreeNode
        Tree forwarded to scikit-bio.
    metric : str
        A member of ``phylogenetic_metrics()``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function; must be 1 for
        ``'weighted_unifrac'``.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric, empty table, or a parallel weighted UniFrac
        request.
    skbio.tree.MissingNodeError
        Re-raised with the message expressed in terms of
        'feature_ids' and 'phylogeny'.
    """
    def _reword(error):
        # Swap scikit-bio's argument names for the ones used here.
        text = str(error)
        for old, new in (('otu_ids', 'feature_ids'), ('tree', 'phylogeny')):
            text = text.replace(old, new)
        return text

    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if metric == 'weighted_unifrac' and n_jobs != 1:
        raise ValueError("Weighted UniFrac is not parallelizable")

    int_counts = table.matrix_data.toarray().astype(int).T
    samples = table.ids(axis='sample')
    features = table.ids(axis='observation')

    try:
        distances = skbio.diversity.beta_diversity(
            metric=metric, counts=int_counts, ids=samples,
            otu_ids=features, tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs)
    except skbio.tree.MissingNodeError as err:
        raise skbio.tree.MissingNodeError(_reword(err))
    return distances
def alpha(table: biom.Table):
    """Compute every metric in ``ALPHA_DIVERSITY_METHODS`` for a table.

    :param table: feature table; must not be empty. It is passed through
        ``get_biom_table`` before any counts are extracted.
    :return: the value produced by ``_format_alpha_results_to_json`` from
        the aggregated per-metric results and the sample metadata.
    :raises ValueError: if ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    table = get_biom_table(table)

    counts = table.matrix_data.toarray().astype(float).T
    sample_ids = table.ids(axis='sample')
    sample_metadata = dict(zip(table.ids(), table.metadata()))

    def _named_result(metric):
        # One result per metric, labelled with the metric's name.
        values = alpha_diversity(metric=metric, counts=counts,
                                 ids=sample_ids)
        values.name = metric
        return values

    per_metric = [_named_result(metric)
                  for metric in ALPHA_DIVERSITY_METHODS]

    aggregated = aggregate_results(per_metric, sample_ids)
    return _format_alpha_results_to_json(aggregated, sample_metadata)
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    """Subsample every sample of ``table`` to ``sampling_depth``.

    Parameters
    ----------
    table : biom.Table
        Table to rarefy.
    sampling_depth : int
        Depth handed to ``biom.Table.subsample`` (``by_id=False``).

    Returns
    -------
    biom.Table
        The rarefied table.

    Raises
    ------
    ValueError
        If nothing is left after subsampling.
    """
    rarefied = table.subsample(sampling_depth, axis='sample', by_id=False)

    if not rarefied.is_empty():
        return rarefied

    raise ValueError('The rarefied table contains no samples or features. '
                     'Verify your table is valid and that you provided a '
                     'shallow enough sampling depth.')
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse the features of a BIOM table using a many-to-many mapping.

    Parameters
    ----------
    table : biom.Table
        Table whose observations (features) are collapsed.
    mapping : dict of list of str
        Targets for each source feature; features missing from the
        mapping are dropped.
    normalize : bool, optional
        If True, a source's counts are divided among its targets and the
        result is rounded with ``round_biom``; otherwise the full counts
        are added to every target.

    Returns
    -------
    biom.Table
        Collapsed table, or the (empty) filtered table when no feature is
        covered by ``mapping``.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # keep only the features that have a mapping (on a copy)
    kept = table.filter(lambda data, id_, md: id_ in mapping,
                        axis='observation', inplace=False)
    if kept.is_empty():
        return kept

    # attach each feature's targets as observation metadata
    targets_by_source = {}
    for source, targets in mapping.items():
        targets_by_source[source] = {'part': targets}
    kept.add_metadata(targets_by_source, axis='observation')

    def partition(id_, md):
        # (path, new ID) pairs; both are the target itself
        return zip(md['part'], md['part'])

    mode = 'divide' if normalize else 'add'
    collapsed = kept.collapse(partition, norm=False, one_to_many=True,
                              axis='observation', one_to_many_mode=mode)

    if normalize:
        # dividing may leave fractions; round back to integers
        round_biom(collapsed)

    # remove the 'Path' metadata key from the collapsed table
    collapsed.del_metadata(keys=['Path'])
    return collapsed
def beta(table: biom.Table, metric: str) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    metric : str
        A member of ``non_phylogenetic_metrics()``.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    dense = table.matrix_data.toarray()
    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=dense.astype(int).T,
        ids=table.ids(axis='sample'))
def SRScurve(output_dir: str, table: biom.Table, metric: str = 'richness',
             step: int = 50, sample: int = 0, max_sample_size: int = 0,
             rarefy_comparison: bool = False, rarefy_repeats: int = 10,
             rarefy_comparison_legend: bool = False,
             srs_color: str = 'black', rarefy_color: str = 'red',
             srs_linetype: str = 'solid',
             rarefy_linetype: str = 'longdash',
             label: bool = False) -> None:
    """Run ``SRScurve.R`` on a feature table and write an ``index.html``.

    The table is written as TSV into a temporary directory and the script
    is invoked through ``run_commands`` with every option stringified, in
    the order of this function's parameters, followed by ``output_dir``.
    An ``index.html`` embedding ``SRScurve_plot.png`` is then written to
    ``output_dir`` (the image itself is presumably produced by the
    script -- confirm against ``SRScurve.R``).

    Parameters
    ----------
    output_dir : str
        Directory handed to the script and receiving ``index.html``.
    table : biom.Table
        Feature table; must not be empty.
    metric, step, sample, max_sample_size, rarefy_comparison, \
rarefy_repeats, rarefy_comparison_legend, srs_color, rarefy_color, \
srs_linetype, rarefy_linetype, label
        Options forwarded verbatim (via ``str``) to the script.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as temp_dir_name:
        input_name = os.path.join(temp_dir_name, 'table.tsv')
        with open(input_name, 'w') as fh:
            fh.write(table.to_tsv())

        cmd = [
            'SRScurve.R', input_name,
            str(metric), str(step), str(sample), str(max_sample_size),
            str(rarefy_comparison), str(rarefy_repeats),
            str(rarefy_comparison_legend),
            str(srs_color), str(rarefy_color),
            str(srs_linetype), str(rarefy_linetype),
            str(label), str(output_dir),
        ]
        run_commands([cmd])

    # The previously computed-but-unused 'plot.png' path was dropped: the
    # page below references 'SRScurve_plot.png' directly.
    index = os.path.join(output_dir, 'index.html')
    with open(index, 'w') as fh:
        fh.write(
            '<!DOCTYPE html><head></head><body>'
            '<img src="SRScurve_plot.png" '
            'style="max-width: 100vw;max-height: 100vh;object-fit: contain" />'
            '</body></html>'
        )
def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Compute a non-phylogenetic alpha diversity value per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    metric : str
        A member of ``non_phylogenetic_metrics()``.

    Returns
    -------
    pd.Series
        scikit-bio's result, with ``name`` set to ``metric``.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    dense = table.matrix_data.toarray()
    diversity = skbio.diversity.alpha_diversity(
        metric=metric,
        counts=dense.astype(int).T,
        ids=table.ids(axis='sample'))
    diversity.name = metric
    return diversity
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``'aitchison'``, ``'canberra_adkins'`` and ``'jensenshannon'`` are
    implemented by local callables; every other metric name is passed to
    scikit-bio unchanged.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        A member of ``non_phylogenetic_metrics()``.
    pseudocount : int, optional
        Added to every count for ``'aitchison'`` only.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table; also raised by the Canberra-Adkins
        callable when it is given negative values.
    """
    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")
        # Only positions where at least one vector is non-zero contribute.
        nz = (x > 0) | (y > 0)
        x_, y_ = x[nz], y[nz]
        return (1. / nz.sum()) * np.sum(np.abs(x_ - y_) / (x_ + y_))

    def jensen_shannon(x, y, **kwds):
        return jensenshannon(x, y)

    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    if metric == 'aitchison':
        counts += pseudocount

    local_metrics = {
        'aitchison': aitchison,
        'canberra_adkins': canberra_adkins,
        'jensenshannon': jensen_shannon,
    }
    metric = local_metrics.get(metric, metric)

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def alpha(table: biom.Table) -> AlphaDiversityFormat:
    """Run ``run_new_richness.R`` on a feature table.

    The table is written as TSV into a temporary directory and the script
    is invoked through ``run_commands`` with the table path and the
    stringified output format object.

    Parameters
    ----------
    table : biom.Table
        Feature table; must not be empty.

    Returns
    -------
    AlphaDiversityFormat
        The format object whose string form was handed to the script.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    result = AlphaDiversityFormat()

    with tempfile.TemporaryDirectory() as workdir:
        table_fp = os.path.join(workdir, 'table.tsv')
        with open(table_fp, 'w') as out:
            out.write(table.to_tsv())

        run_commands([['run_new_richness.R', table_fp, str(result)]])

    return result
def beta(table: biom.Table, metric: str,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    metric : str
        A member of ``non_phylogenetic_metrics()``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
    return skbio.diversity.beta_diversity(**diversity_kwargs)
def beta(self, table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1):
    """Compute a beta diversity distance matrix for ``table``.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        Passed to ``beta_diversity`` without validation here.
    pseudocount : int, optional
        Accepted but not used by this implementation.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    The object returned by ``beta_diversity``.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    dense_counts = table.matrix_data.toarray().T

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return beta_diversity(
        metric=metric,
        counts=dense_counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return pairwise sample distances for a non-phylogenetic metric.

    ``'aitchison'`` and ``'canberra_adkins'`` are implemented by local
    callables; other metric names are passed to scikit-bio unchanged.

    Parameters
    ----------
    table : biom.Table
        Feature table; its dense matrix is transposed before use.
    metric : str
        A member of ``non_phylogenetic_metrics()``.
    pseudocount : int, optional
        Added to every count for ``'aitchison'`` only.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table; also raised by the Canberra-Adkins
        callable when it is given negative values.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        has_negative = (x < 0).any() or (y < 0).any()
        if has_negative:
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")
        # Restrict to positions where either vector is non-zero.
        nz = (x > 0) | (y > 0)
        x_ = x[nz]
        y_ = y[nz]
        ratios = np.abs(x_ - y_) / (x_ + y_)
        return (1. / nz.sum()) * np.sum(ratios)

    if metric == 'aitchison':
        counts += pseudocount
        distance = aitchison
    elif metric == 'canberra_adkins':
        distance = canberra_adkins
    else:
        distance = metric

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=distance, counts=counts, ids=table.ids(axis='sample'),
        validate=True, pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def group(table: biom.Table, axis: str, metadata: qiime2.MetadataCategory,
          mode: str) -> biom.Table:
    """Collapse IDs on one axis of ``table`` by their metadata value.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is used as the biom axis unchanged.
    metadata : qiime2.MetadataCategory
        Category supplying the group value of each ID.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table, ordered by first appearance of each group value in
        the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    series = _munge_metadata_category(metadata, table.ids(axis=biom_axis),
                                      axis)

    def group_of(axis_id, _):
        return series.loc[axis_id]

    collapsed = table.collapse(group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # First-appearance order keeps identity mappings stable and makes the
    # result easier to test.
    group_order = series.unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    phylogeny : skbio.TreeNode
        Tree forwarded as ``tree``.
    metric : str
        A member of ``phylogenetic_metrics()``.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    skbio.tree.MissingNodeError
        Re-raised with 'otu_ids'/'tree' reworded to
        'feature_ids'/'phylogeny'.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny)

    try:
        return skbio.diversity.beta_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as err:
        # Express the error in this function's own argument names.
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)
def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    """Compute a phylogenetic alpha diversity value per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; the dense matrix is cast to ``int`` and transposed.
    phylogeny : skbio.TreeNode
        Tree forwarded as ``tree``.
    metric : str
        A member of ``phylogenetic_metrics()``.

    Returns
    -------
    pd.Series
        scikit-bio's result, with ``name`` set to ``metric``.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    skbio.tree.MissingNodeError
        Re-raised with 'otu_ids'/'tree' reworded to
        'feature_ids'/'phylogeny'.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny)

    try:
        diversity = skbio.diversity.alpha_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as err:
        # Express the error in this function's own argument names.
        reworded = str(err).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)

    diversity.name = metric
    return diversity
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Write a beta rarefaction report into ``output_dir``.

    Distance matrices are obtained from ``_get_multiple_rarefaction``
    using the 'rarefy' and 'beta'/'beta_phylogenetic' actions fetched
    from a ``qiime2.sdk.Context``. The first matrix is treated as
    primary, the rest as support. The report consists of a heatmap, a
    correlation table, a sample clustering tree, an Emperor plot and the
    rendered templates.

    Parameters
    ----------
    output_dir : str
        Directory receiving every output file.
    table : biom.Table
        Feature table; must not be empty.
    metric : str
        Diversity metric; members of ``phylogenetic_metrics()`` require
        ``phylogeny``.
    clustering_method : str
        Passed to ``_cluster_samples`` and used in the tree's file name.
    metadata : qiime2.Metadata
        Filtered to the table's sample IDs.
    sampling_depth : int
        Passed to ``_get_multiple_rarefaction``.
    iterations : int, optional
        Passed to ``_get_multiple_rarefaction``.
    phylogeny : skbio.TreeNode, optional
        Required for phylogenetic metrics.
    correlation_method, color_scheme : str, optional
        Passed to ``_make_heatmap``.

    Raises
    ------
    ValueError
        Empty table, or a phylogenetic metric without a phylogeny.
    """
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Restrict the metadata to the table's samples; this also ensures
        # every sample ID of the table is present in the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]', table)

        if metric in phylogenetic_metrics():
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)
            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            phylo_action = scope.ctx.get_action('diversity',
                                                'beta_phylogenetic')
            beta_func = functools.partial(phylo_action, phylogeny=phylogeny)
        else:
            beta_func = scope.ctx.get_action('diversity', 'beta')

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        # Last step that uses the actions obtained from the context.
        distance_matrices = _get_multiple_rarefaction(
            beta_func, rare_func, metric, iterations, table, sampling_depth)

    primary, support = distance_matrices[0], distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    correlation_fp = os.path.join(output_dir,
                                  'rarefaction-iteration-correlation.tsv')
    similarity_df.to_csv(correlation_fp, sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(output_dir,
                           'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
                 for page in pages]

    tabs = [
        {'url': 'emperor.html', 'title': 'PCoA'},
        {'url': 'heatmap.html', 'title': 'Heatmap'},
        {'url': 'tree.html', 'title': 'Clustering'},
    ]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }
    q2templates.render(templates, output_dir, context=context)
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction report into ``output_dir``.

    For every metric, a CSV of the rarefaction data is written together
    with one JSONP summary (no metadata) or one JSONP summary per
    categorical metadata column. Finally the index template is rendered
    and the ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory receiving every output file.
    table : biom.Table
        Feature table; must not be empty.
    max_depth : int
        Upper rarefaction depth; must exceed ``min_depth`` and not exceed
        the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Required whenever a phylogenetic metric is requested.
    metrics : set, optional
        Metrics to compute. Defaults to ``{'observed_otus', 'shannon'}``
        plus ``'faith_pd'`` when a phylogeny is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; only categorical columns that are not entirely
        missing are used.
    min_depth : int, optional
        Lower rarefaction depth.
    steps : int, optional
        Number of steps; may not exceed ``max_depth - min_depth``.
    iterations : int, optional
        Passed to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        On any of the validation failures described above, or when no
        metadata remains after column filtering.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))

    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))

    if table.is_empty():
        raise ValueError('Provided table is empty.')

    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Restrict the metadata to the table's samples; this also ensures
        # every sample ID of the table is present in the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Keep categorical columns only and drop all-missing ones,
        # remembering which columns were filtered out.
        columns_before = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = columns_before - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(name, '') for name in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    rarefaction = _compute_rarefaction_data(table, min_depth, max_depth,
                                            steps, iterations, phylogeny,
                                            metrics)

    filenames = []
    for m, frame in rarefaction.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            summary = _compute_summary(frame, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)
        else:
            merged = frame.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                summary = _compute_summary(reindexed_df, column,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the (depth, iteration) column tuples into labels.
            frame.columns = ['depth-%d_iter-%d' % (pair[0], pair[1])
                             for pair in frame.columns.values]
            if metadata is not None:
                frame = frame.join(metadata.to_dataframe(), how='left')
            frame.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    render_context = {
        'metrics': list(metrics),
        'filenames': [quote(name) for name in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns),
    }
    q2templates.render(index, output_dir, context=render_context)

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, weighted: bool) -> skbio.DistanceMatrix:
    """Compute a beta diversity matrix with ``ExpressBetaDiversity``.

    The table and tree are written into a temporary directory, the
    external program is run there, and the ``output.diss`` file it leaves
    behind is parsed into a symmetric distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; must not be empty. Written with one row per
        sample and one column per observation.
    phylogeny : skbio.TreeNode
        Tree written as ``tree.newick``.
    metric : str
        A member of ``phylogenetic_metrics()``; translated to the
        program's calculator name.
    weighted : bool
        If true, ``-w`` is passed to the program.

    Returns
    -------
    skbio.DistanceMatrix

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    KeyError
        If ``metric`` has no calculator name in the translation table.
    subprocess.CalledProcessError
        If ``ExpressBetaDiversity`` exits with a non-zero status
        (previously the failure was ignored and surfaced later as a
        missing ``output.diss``).
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    # Metric name -> ExpressBetaDiversity calculator name.
    name_map = {'braycurtis': 'Bray-Curtis',
                'sorensen': 'Bray-Curtis',
                'canberra': 'Canberra',
                'chi_squared': 'Chi-squared',
                'coeff_similarity': 'CS',
                'complete_tree': 'CT',
                'euclidean': 'Euclidean',
                'f_st': 'Fst',
                'p_st': 'Fst',
                'gower': 'Gower',
                'hellinger': 'Hellinger',
                'kulczynski': 'Kulczynski',
                'lennon': 'Lennon',
                'manhattan': 'Manhattan',
                'weighted_unifrac': 'Manhattan',
                'mnnd': 'MNND',
                'mpd': 'MPD',
                'morisita_horn': 'Morisita-Horn',
                'normalized_weighted_unifrac': 'NWU',
                'pearson': 'Pearson',
                'raohp': 'RaoHp',
                'soergel': 'Soergel',
                'jaccard': 'Soergel',
                'unweighted_unifrac': 'Soergel',
                'ruzicka': 'Soergel',
                'tamas_coeff': 'TC',
                'weighted_corr': 'WC',
                'whittaker': 'Whittaker',
                'yue_clayton': 'Yue-Clayton'}

    with tempfile.TemporaryDirectory() as temp_dir_name:
        table_fp = os.path.join(temp_dir_name, 'otu_table.tsv')
        newick_fp = os.path.join(temp_dir_name, 'tree.newick')

        with open(table_fp, 'w') as out_table, \
                open(newick_fp, 'w') as newick:
            phylogeny.write(newick)

            # Header of observation IDs, then one row per sample; rows are
            # newline-separated with no trailing newline.
            rows = ["\t" + "\t".join(table.ids(axis='observation'))]
            for sample_id in table.ids(axis='sample'):
                values = "\t".join(str(x) for x in table.data(sample_id))
                rows.append(str(sample_id) + "\t" + values)
            out_table.write("\n".join(rows))

        # An argument list (no shell) avoids quoting problems, and
        # check=True makes a failed run raise instead of passing silently.
        cmd = ['ExpressBetaDiversity',
               '-t', 'tree.newick',
               '-s', 'otu_table.tsv']
        if weighted:
            cmd.append('-w')
        cmd.extend(['-c', name_map[metric]])
        subprocess.run(cmd, cwd=temp_dir_name, check=True)

        diss_fp = os.path.join(temp_dir_name, 'output.diss')
        with open(diss_fp, 'r') as dist_file:
            nsamples = int(dist_file.readline())
            dist_mat = np.zeros((nsamples, nsamples))
            ids = []
            for line in dist_file:
                if not line.strip():
                    # tolerate blank (e.g. trailing) lines
                    continue
                fields = line.split("\t")
                i = len(ids)
                ids.append(fields[0].strip())
                for j, dist in enumerate(fields[1:]):
                    if not dist.strip():
                        # tolerate an empty field after a trailing tab
                        continue
                    # each value fills both halves of the symmetric matrix
                    dist_mat[i, j] = dist_mat[j, i] = float(dist)

    return skbio.DistanceMatrix(dist_mat, ids)
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Write a beta rarefaction report into ``output_dir``.

    Distance matrices are obtained from ``_get_multiple_rarefaction``
    using ``beta`` or ``beta_phylogenetic``. The first matrix is treated
    as primary, the rest as support. The report consists of a heatmap, a
    correlation table, a sample clustering tree, an Emperor plot and the
    rendered templates.

    Parameters
    ----------
    output_dir : str
        Directory receiving every output file.
    table : biom.Table
        Feature table; must not be empty.
    metric : str
        Diversity metric; members of ``phylogenetic_metrics()`` require
        ``phylogeny``.
    clustering_method : str
        Passed to ``_cluster_samples`` and used in the tree's file name.
    metadata : qiime2.Metadata
        Filtered to the table's sample IDs.
    sampling_depth : int
        Passed to ``_get_multiple_rarefaction``.
    iterations : int, optional
        Passed to ``_get_multiple_rarefaction``.
    phylogeny : skbio.TreeNode, optional
        Required for phylogenetic metrics.
    correlation_method, color_scheme : str, optional
        Passed to ``_make_heatmap``.

    Raises
    ------
    ValueError
        A phylogenetic metric without a phylogeny, or an empty table.
    """
    needs_tree = metric in phylogenetic_metrics()
    if needs_tree and phylogeny is None:
        raise ValueError("A phylogenetic metric (%s) was requested, "
                         "but a phylogenetic tree was not provided. "
                         "Phylogeny must be provided when using a "
                         "phylogenetic diversity metric." % metric)
    if needs_tree:
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Restrict the metadata to the table's samples; this also ensures
    # every sample ID of the table is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary, support = distance_matrices[0], distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    correlation_fp = os.path.join(output_dir,
                                  'rarefaction-iteration-correlation.tsv')
    similarity_df.to_csv(correlation_fp, sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(output_dir,
                           'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
                 for page in pages]

    tabs = [
        {'url': 'emperor.html', 'title': 'PCoA'},
        {'url': 'heatmap.html', 'title': 'Heatmap'},
        {'url': 'tree.html', 'title': 'Clustering'},
    ]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }
    q2templates.render(templates, output_dir, context=context)
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction report into ``output_dir``.

    For every metric, a CSV of the rarefaction data is written together
    with one JSONP summary (no metadata) or one JSONP summary per
    metadata category that is not entirely missing. Finally the index
    template is rendered and the ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory receiving every output file.
    table : biom.Table
        Feature table; must not be empty.
    max_depth : int
        Upper rarefaction depth; must exceed ``min_depth`` and not exceed
        the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Required whenever a phylogenetic metric is requested.
    metrics : set, optional
        Metrics to compute. Defaults to ``{'observed_otus', 'shannon'}``
        plus ``'faith_pd'`` when a phylogeny is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; must cover every sample ID of ``table``.
    min_depth : int, optional
        Lower rarefaction depth.
    steps : int, optional
        Number of steps; may not exceed ``max_depth - min_depth``.
    iterations : int, optional
        Passed to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        On any of the validation failures described above.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))

    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))

    if table.is_empty():
        raise ValueError('Provided table is empty.')

    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is not None:
        # Every sample of the table must be described by the metadata.
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames = []
    categories = []
    empty_columns = []

    rarefaction = _compute_rarefaction_data(table, min_depth, max_depth,
                                            steps, iterations, phylogeny,
                                            metrics)

    for m, frame in rarefaction.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            summary = _compute_summary(frame, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)
        else:
            # Align the metadata with this metric's samples and drop the
            # columns that hold no value at all, remembering their names.
            metadata_df = metadata.to_dataframe().loc[frame.index]
            all_columns = metadata_df.columns
            metadata_df = metadata_df.dropna(axis='columns', how='all')
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(name, '') for name in metadata_df.columns])
            merged = frame.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)

            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(
                    category, categories, merged)
                summary = _compute_summary(reindexed_df, category,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the (depth, iteration) column tuples into labels.
            frame.columns = ['depth-%d_iter-%d' % (pair[0], pair[1])
                             for pair in frame.columns.values]
            if metadata is not None:
                frame = frame.join(metadata.to_dataframe(), how='left')
            frame.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    render_context = {
        'metrics': list(metrics),
        'filenames': filenames,
        'categories': list(categories),
        'empty_columns': sorted(empty_columns),
    }
    q2templates.render(index, output_dir, context=render_context)

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction report into ``output_dir``.

    For every metric, a CSV of the rarefaction data is written together
    with one JSONP summary (no metadata) or one JSONP summary per
    categorical metadata column. Finally the index template is rendered
    and the ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory receiving every output file.
    table : biom.Table
        Feature table; must not be empty.
    max_depth : int
        Upper rarefaction depth; must exceed ``min_depth`` and not exceed
        the largest per-sample total of ``table``.
    phylogeny : skbio.TreeNode, optional
        Required whenever a phylogenetic metric is requested.
    metrics : set, optional
        Metrics to compute. Defaults to ``{'observed_otus', 'shannon'}``
        plus ``'faith_pd'`` when a phylogeny is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; only categorical columns that are not entirely
        missing are used.
    min_depth : int, optional
        Lower rarefaction depth.
    steps : int, optional
        Number of steps; may not exceed ``max_depth - min_depth``.
    iterations : int, optional
        Passed to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        On any of the validation failures described above.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))

    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))

    if table.is_empty():
        raise ValueError('Provided table is empty.')

    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Restrict the metadata to the table's samples; this also ensures
        # every sample ID of the table is present in the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Keep categorical columns only and drop all-missing ones,
        # remembering which columns were filtered out.
        columns_before = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = columns_before - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(name, '') for name in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    rarefaction = _compute_rarefaction_data(table, min_depth, max_depth,
                                            steps, iterations, phylogeny,
                                            metrics)

    filenames = []
    for m, frame in rarefaction.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            summary = _compute_summary(frame, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)
        else:
            merged = frame.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                summary = _compute_summary(reindexed_df, column,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the (depth, iteration) column tuples into labels.
            frame.columns = ['depth-%d_iter-%d' % (pair[0], pair[1])
                             for pair in frame.columns.values]
            if metadata is not None:
                frame = frame.join(metadata.to_dataframe(), how='left')
            frame.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    render_context = {
        'metrics': list(metrics),
        'filenames': [quote(name) for name in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns),
    }
    q2templates.render(index, output_dir, context=render_context)

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether divide per-target counts by number of targets per source.
    field : int, optional
        Index of field to be collapsed in a stratified table, i.e. one
        whose feature IDs consist of "|"-separated fields. ``None``
        (default) collapses whole feature IDs; ``0`` selects the first
        field.

    Returns
    -------
    biom.Table
        Collapsed BIOM table. If no feature is covered by the mapping, the
        empty filtered table is returned as is.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # Index 0 is a valid field, so test against None instead of relying on
    # truthiness (which would silently ignore ``field=0``).
    stratified = field is not None

    # generate metadata: feature ID -> list of target IDs
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if stratified:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if stratified:
                # substitute the target into the chosen field and rebuild
                # the full stratified ID
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation', inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False, one_to_many=True, axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up the 'Path' metadata key from the collapsed table
    table.del_metadata(keys=['Path'])
    return table