def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run skbio's bioenv on numeric metadata vs. a distance matrix and
    render the result as an HTML report in ``output_dir``.

    Writes nothing else to disk besides the rendered q2templates output.
    """
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # Drop non-numeric columns and empty columns, remembering which were
    # removed so the report can list them for the user.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # Filter 0 variance numerical columns and empty columns (NOTE: in this
    # variant the NA-row drop above happens first, so zero-variance is
    # evaluated on the NA-free table).
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)
    df = metadata.to_dataframe()

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
def plot(output_dir, table: biom.Table, metadata: q2.Metadata,
         case_where: str, control_where: str,
         feature_tree: skbio.TreeNode = None):
    """Write the packed-table and tree JSONP assets for this visualization.

    Filters ``table`` to the case/control samples selected by the two
    metadata ``where`` expressions, optionally shears ``feature_tree`` to the
    surviving features, and emits ``data/packed_table.jsonp`` and
    ``data/tree.jsonp`` under ``output_dir``.
    """
    # BUGFIX: a leftover debug dump of ``feature_tree`` to /tmp/tree.nwk was
    # removed here. It raised AttributeError whenever ``feature_tree`` was
    # None (a case explicitly supported below) and wrote to a hard-coded
    # /tmp path as an unintended side effect.
    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    # Keep only metadata rows for samples present in the table, then select
    # the two groups via the caller-supplied where expressions.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        # No tree supplied: fall back to an empty tree so downstream
        # serialization still works.
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    # Indices whose child count is 0 are the tips of the flattened tree.
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    # Order observations to match the tree's tip order, and samples so cases
    # precede controls.
    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run skbio's bioenv on numeric metadata vs. a distance matrix and
    render the result as an HTML report in ``output_dir``.
    """
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # Drop non-numeric columns and empty columns; remember which were removed
    # so they can be reported to the user.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Filter 0 variance numerical columns and empty columns.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    """Render a feature-table summary visualization.

    Produces per-sample and per-feature frequency histograms (PDF + PNG),
    CSV frequency details, and a multi-tab HTML report. When
    ``sample_metadata`` is provided it is embedded into ``data.jsonp`` for
    the interactive sample-detail tab.
    """
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(table,
                                                            axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins.
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            # Degenerate spread: fall back to the minimum bin count.
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples**(1 / 3))
            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        # Thousands-separator formatting on the x axis.
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        # Clear the shared figure so the feature plot below starts fresh.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    # Sort descending so the CSVs list the most frequent entries first.
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # Reformat the feature frequencies for display (comma-separated ints).
    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(TEMPLATES, 'summarize_assets',
                                     'overview.html')
    sample_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                             'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                              'feature-frequency-detail.html')

    context.update({
        'max_count': sample_frequencies.max(),
        'feature_frequencies_table': feature_frequencies_table,
        'feature_qualitative_data': feature_qualitative_data,
        'tabs': [{
            'url': 'overview.html',
            'title': 'Overview'
        }, {
            'url': 'sample-frequency-detail.html',
            'title': 'Interactive Sample Detail'
        }, {
            'url': 'feature-frequency-detail.html',
            'title': 'Feature Detail'
        }]
    })
    templates = [
        index, sample_frequency_template, feature_frequency_template,
        overview_template
    ]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # Emit the JSONP payload consumed by the interactive sample-detail tab:
    # app.init(<metadata-or-{}>, <sample frequencies>);
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Render the alpha group-significance visualization.

    For each usable categorical metadata column, runs Kruskal-Wallis across
    all groups and pairwise (BH-corrected), writes one JSONP file and one
    pairwise CSV per column, and renders the HTML index.

    Raises ValueError when no metadata column satisfies the requirements.
    """
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # Save out metadata for download in viz.
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # Inner join drops samples missing from either side; record how many
        # survived so the viz can warn about filtering.
        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # URL-escape the column name for use in filenames ('/' included).
        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # Perform Kruskal-Wallis across all groups.
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # Perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons.
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # Record the pair that couldn't be tested so the viz can
                    # report it, rather than failing the whole column.
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg FDR correction across the pairwise tests.
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # JSONP payload: load_data(column, groups, counts, overall KW,
        # escaped pairwise HTML table, csv name, metric name).
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)
            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index, output_dir, context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
                ', '.join(sorted(non_categorical_columns)),
            'filtered_columns': ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
                '; '.join([' vs '.join(e)
                           for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Render an alpha rarefaction visualization.

    Computes rarefaction curves for each metric between ``min_depth`` and
    ``max_depth``, writes per-metric CSVs and JSONP files (grouped by
    categorical metadata columns when ``metadata`` is given), and renders
    the HTML index.

    Raises ValueError for invalid depth/step parameters, an empty table,
    unavailable phylogenetic metrics, or metadata with no usable columns.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        # BUGFIX: previously, when every column was filtered out above, the
        # MultiIndex construction / joins below failed with a confusing
        # error. Fail fast with a clear message instead (consistent with the
        # sibling implementation of this visualizer).
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Two-level columns so per-metric rarefaction data can be joined in.
        metadata_df.columns = pd.MultiIndex.from_tuples([
            (c, '') for c in metadata_df.columns
        ])
        columns = metadata_df.columns.get_level_values(0)

    # Renamed from `data` to avoid shadowing by the per-metric loop variable.
    rarefaction_data = _compute_rarefaction_data(table, min_depth, max_depth,
                                                 steps, iterations, phylogeny,
                                                 metrics)

    filenames = []
    for m, data in rarefaction_data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = [
                'depth-%d_iter-%d' % (t[0], t[1])
                for t in data.columns.values
            ]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'metrics': list(metrics),
        'filenames': [quote(f) for f in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns)
    })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str, alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    """Render the alpha correlation visualization.

    Correlates alpha diversity against each usable numeric metadata column
    with the chosen method (keys of ``_alpha_correlation_fns``), writing one
    JSONP payload per column and the HTML index.

    Raises ValueError for an unknown ``method`` or when no numeric metadata
    columns remain after filtering.
    """
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # Save out metadata for download in viz.
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # Create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column.
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1, join='inner')

        # Compute correlation; result is indexable as (statistic, p-value).
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        # Only warn in the viz when samples were dropped by the inner join.
        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {
                'initial': alpha_diversity.shape[0],
                'method': method.title(),
                'filtered': df.shape[0]
            }

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # JSONP payload: load_data(column, data, warning, stats);
        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump(
                {
                    'method': method.title(),
                    'testStat': '%1.4f' % correlation_result[0],
                    'pVal': '%1.4f' % correlation_result[1],
                    'sampleSize': df.shape[0]
                }, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))
    })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    """Render a feature-table summary visualization.

    Produces per-sample and per-feature frequency histograms (PDF + PNG),
    CSV frequency details, and a multi-tab HTML report. When
    ``sample_metadata`` is provided it is embedded into ``data.jsonp`` for
    the interactive sample-detail tab.
    """
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins.
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            # Degenerate spread: fall back to the minimum bin count.
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True,
                                             bins=int(round(bins)))
        # Thousands-separator formatting on the x axis.
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        # Clear the shared figure so the feature plot below starts fresh.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    # Sort descending so the CSVs list the most frequent entries first.
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # Reformat the feature frequencies for display (comma-separated ints).
    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # Emit the JSONP payload consumed by the interactive sample-detail tab:
    # app.init(<metadata-or-{}>, <sample frequencies>);
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Render the beta rarefaction visualization.

    Rarefies the table ``iterations`` times at ``sampling_depth``, computes
    a distance matrix per iteration via the QIIME 2 SDK actions, then emits
    a correlation heatmap, a sample-clustering tree, a jackknifed Emperor
    PCoA, and a tabbed HTML report.
    """
    # All SDK action lookups and invocations happen inside this scope so the
    # framework context is cleaned up afterwards.
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]', table)

        if metric in phylogenetic_metrics():
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)
            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            api_method = scope.ctx.get_action('diversity',
                                              'beta_phylogenetic')
            # Pre-bind the phylogeny so the rarefaction driver can call the
            # action with the same signature as the non-phylogenetic one.
            beta_func = functools.partial(api_method, phylogeny=phylogeny)
        else:
            beta_func = scope.ctx.get_action('diversity', 'beta')

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        distance_matrices = _get_multiple_rarefaction(beta_func, rare_func,
                                                      metric, iterations,
                                                      table, sampling_depth)

        # First iteration anchors the visuals; the rest provide support.
        primary = distance_matrices[0]
        support = distance_matrices[1:]

        heatmap_fig, similarity_df = _make_heatmap(distance_matrices, metric,
                                                   correlation_method,
                                                   color_scheme)
        heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
        similarity_df.to_csv(
            os.path.join(output_dir,
                         'rarefaction-iteration-correlation.tsv'),
            sep='\t')

        tree = _cluster_samples(primary, support, clustering_method)
        tree.write(
            os.path.join(output_dir,
                         'sample-clustering-%s.tre' % clustering_method))

        emperor = _jackknifed_emperor(primary, support, metadata)
        emperor_dir = os.path.join(output_dir, 'emperor')
        emperor.copy_support_files(emperor_dir)
        with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
            fh.write(emperor.make_emperor(standalone=True))

        templates = list(
            map(
                lambda page: os.path.join(TEMPLATES,
                                          'beta_rarefaction_assets', page),
                ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

        context = {
            'metric': metric,
            'clustering_method': clustering_method,
            'tabs': [{
                'url': 'emperor.html',
                'title': 'PCoA'
            }, {
                'url': 'heatmap.html',
                'title': 'Heatmap'
            }, {
                'url': 'tree.html',
                'title': 'Clustering'
            }]
        }

        q2templates.render(templates, output_dir, context=context)
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Render the alpha group-significance visualization.

    For each usable categorical metadata column, runs Kruskal-Wallis across
    all groups and pairwise (BH-corrected), writes one JSONP file and one
    pairwise CSV per column, and renders the HTML index.

    Raises ValueError when no metadata column satisfies the requirements.
    """
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # Save out metadata for download in viz.
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # Inner join drops samples missing from either side; record how many
        # survived so the viz can warn about filtering.
        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # URL-escape the column name for use in filenames ('/' included).
        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # Perform Kruskal-Wallis across all groups.
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # Perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons.
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # Record the pair that couldn't be tested so the viz can
                    # report it, rather than failing the whole column.
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg FDR correction across the pairwise tests.
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # JSONP payload: load_data(column, groups, counts, overall KW,
        # escaped pairwise HTML table, csv name, metric name).
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)
            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Render an alpha rarefaction visualization.

    Computes rarefaction curves for each metric between ``min_depth`` and
    ``max_depth``, writes per-metric CSVs and JSONP files (grouped by
    categorical metadata columns when ``metadata`` is given), and renders
    the HTML index.

    Raises ValueError for invalid depth/step parameters, an empty table,
    unavailable phylogenetic metrics, or metadata with no usable columns.
    """
    if metrics is None:
        # Default metric set; Faith's PD is only added when a tree exists.
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        # Fail fast with a clear message when every column was filtered out
        # above, instead of a confusing downstream error.
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Two-level columns so per-metric rarefaction data can be joined in.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    # NOTE: the loop variable re-binds the name `data` to each per-metric
    # DataFrame; the dict is not used again after this point.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the (depth, iteration) column tuples for CSV output.
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    """Visualize correlations between alpha diversity and numeric metadata.

    For every numeric metadata column, correlates the column against the
    alpha diversity values (samples missing from either side are dropped),
    writes one JSONP payload per column, and renders the HTML index.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or if no numeric, non-empty metadata
        columns remain after filtering.
    """
    corr_fn = _alpha_correlation_fns.get(method)
    if corr_fn is None:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Restrict metadata to the alpha diversity sample IDs; this also
    # verifies every alpha diversity ID has a metadata entry.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Keep only numeric, non-empty columns; remember what was dropped so
    # the visualization can report it.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = columns_before - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # Save merged metadata + alpha diversity for download in the viz.
    alpha_diversity.index.name = 'id'
    alpha_md = qiime2.Metadata(alpha_diversity.to_frame())
    merged_md = metadata.merge(alpha_md)
    merged_md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        md_column = metadata.get_column(column).drop_missing_values()

        # Pair the metadata column with the alpha diversity values, keeping
        # only samples that have data on both sides.
        paired = pd.concat([md_column.to_series(), alpha_diversity],
                           axis=1, join='inner')

        corr_result = corr_fn(paired[md_column.name],
                              paired[alpha_diversity.name])

        # Report how many samples were dropped, if any.
        warning = None
        if alpha_diversity.shape[0] != paired.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': paired.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            paired.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({'method': method.title(),
                       'testStat': '%1.4f' % corr_result[0],
                       'pVal': '%1.4f' % corr_result[1],
                       'sampleSize': paired.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Assess beta diversity stability across repeated rarefactions.

    Rarefies the table ``iterations`` times, computes a distance matrix for
    each draw, then emits a correlation heatmap, a jackknifed sample
    clustering, and a jackknifed Emperor PCoA plot into ``output_dir``.

    Raises
    ------
    ValueError
        If a phylogenetic metric is requested without a phylogeny, or the
        input table is empty.
    """
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        # Bind the tree now so every iteration uses the same callable.
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Restrict metadata to the table's samples; also verifies every feature
    # table sample ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = [os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
                 for page in ('index.html', 'heatmap.html', 'tree.html',
                              'emperor.html')]
    q2templates.render(templates, output_dir, context={
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html', 'title': 'PCoA'},
                 {'url': 'heatmap.html', 'title': 'Heatmap'},
                 {'url': 'tree.html', 'title': 'Clustering'}],
    })
def preprocess(
    ctx,
    table,
    metadata,
    sampling_depth,
    min_frequency,
    target_variable,
    discrete,
    phylogeny=None,
    with_replacement=False,
    n_jobs=1,
):
    """Filter, rarefy, and transform a feature table for downstream analysis.

    Pipeline steps: intersect table/metadata sample IDs, clean the target
    column, optionally drop features absent from the phylogeny, filter
    low-abundance features, rarefy, re-align metadata, and compute a set of
    beta diversity distance matrices.

    Returns
    -------
    tuple
        (filtered rarefied table, filtered unrarefied table, target-mapping
        artifact, distance matrices ...) as produced by the invoked actions.

    Raises
    ------
    ValueError
        If no sample IDs are shared between table and metadata, or the
        final table/metadata sample IDs disagree.
    """
    # Resolve the QIIME 2 actions this pipeline delegates to.
    rarefy = ctx.get_action("feature_table", "rarefy")
    filter_min_features = ctx.get_action("feature_table", "filter_features")
    filter_samples = ctx.get_action("feature_table", "filter_samples")
    beta = ctx.get_action("diversity", "beta")
    beta_phylogenetic = ctx.get_action("diversity", "beta_phylogenetic")
    filter_features = ctx.get_action("fragment-insertion", "filter_features")

    results = []

    # FIX: corrected user-facing typo ("Inital" -> "Initial").
    print("Initial sizes")
    print_datasize(table, metadata)

    initial_ids_to_keep = table.view(biom.Table).ids()
    shared_ids = set(initial_ids_to_keep) & set(metadata.ids)
    num_shared_ids = len(shared_ids)
    if num_shared_ids == 0:
        raise ValueError("No sample IDs are shared between Table and Metadata")
    print(
        "# of shared sample IDs between Table and Metadata: ",
        num_shared_ids, "\n"
    )

    # Filter metadata by samples in table
    print("Filtering Metadata by samples in table")
    filteredmetadata = metadata.filter_ids(ids_to_keep=shared_ids)
    print_datasize(table, filteredmetadata)

    # Filter samples from metadata where NaN in target_variable column.
    # Reduce metadata to 1 column mapping of sample-id to target.
    print(
        "Filtering samples from Metadata where NaN in target_variable column"
    )
    print("Reducing Metadata to 1 column mapping of sample-id to target")
    df = filteredmetadata.to_dataframe()
    clean_subset_df = clean_metadata(
        df=df, target_variable=target_variable, discrete=discrete
    )
    target_mapping = Metadata(clean_subset_df)
    print_datasize(table, target_mapping)

    # Filter features that do not exist in phylogeny.
    # FIX: explicit None check instead of relying on artifact truthiness.
    if phylogeny is not None:
        print("Filtering features from Table that do not exist in phylogeny")
        phylo_filtered_results = filter_features(table=table, tree=phylogeny)
        table = phylo_filtered_results.filtered_table
        print_datasize(table, target_mapping)

    # Filter low-abundance features from table
    print(
        f"Filtering low-abundance features (frequency<{min_frequency}) from Table"
    )
    (table,) = filter_min_features(
        table=table, min_frequency=min_frequency
    )
    print_datasize(table, target_mapping)

    # Rarefy Table to sampling_depth
    print(f"Rarefying Table to sampling depth of {sampling_depth}")
    (rarefied_table,) = rarefy(
        table=table,
        sampling_depth=sampling_depth,
        with_replacement=with_replacement,
    )
    print_datasize(rarefied_table, target_mapping)

    print("Filtering Rarefied Table by samples in Metadata")
    filtered_rarefied_table_results = filter_samples(
        table=rarefied_table, metadata=target_mapping
    )
    filtered_rarefied_table = filtered_rarefied_table_results.filtered_table
    print_datasize(filtered_rarefied_table, target_mapping)
    results += filtered_rarefied_table_results

    # Refilter target_mapping by samples in table
    print("Refiltering Metadata by samples in Rarefied Table")
    ids_to_keep = filtered_rarefied_table.view(biom.Table).ids()
    target_mapping = target_mapping.filter_ids(ids_to_keep=ids_to_keep)
    print_datasize(filtered_rarefied_table, target_mapping)

    # Filter the unrarefied table down to the same sample set.
    print("Filtering Unrarefied Table by samples in Metadata to match "
          "Rarefied Table")
    filtered_table_results = filter_samples(
        table=table, metadata=target_mapping
    )
    print_datasize(filtered_table_results.filtered_table, target_mapping)
    results += filtered_table_results

    # Some transformations to get data into correct format for artifact.
    target_mapping_col = target_mapping.get_column(target_variable)
    target_mapping_series = target_mapping_col.to_series()
    print("Reindexing Metadata to match Sample ID order of Table")
    target_mapping_series = target_mapping_series.reindex(
        index=ids_to_keep, copy=False
    )

    print("Validating Table and Metadata Sample ID agreement...")
    if list(target_mapping_series.index) != list(ids_to_keep):
        print(list(target_mapping_series.index))
        print(ids_to_keep)
        raise ValueError(
            "Table and Metadata Sample IDs do not match in "
            "contents and/or order"
        )

    target_mapping_artifact = ctx.make_artifact(
        "SampleData[Target]", target_mapping_series
    )
    results += [target_mapping_artifact]

    # Generate Distance Matrices
    print("Generating Distance Matrices...")
    for metric in ["jaccard", "braycurtis", "jensenshannon", "aitchison"]:
        beta_results = beta(
            table=filtered_rarefied_table, metric=metric, n_jobs=n_jobs
        )
        results += beta_results
    if phylogeny is not None:
        for metric in ["unweighted_unifrac", "weighted_unifrac"]:
            beta_phylo_results = beta_phylogenetic(
                table=filtered_rarefied_table,
                phylogeny=phylogeny,
                metric=metric,
                threads=n_jobs,
            )
            results += beta_phylo_results
    else:
        # No phylogeny: emit empty placeholder distance matrices so the
        # pipeline returns the same number of outputs either way.
        results += 2 * [Artifact.import_data(
            "DistanceMatrix", skbio.DistanceMatrix(data=[])
        )]

    return tuple(results)
print("%s sample pairs matched together" % (len(case_to_control_match.keys()))) for key in case_to_control_match: key_value = case_to_control_match[key] matchDF.at[key, "matched_to"] = str(key_value) matchDF.at[key_value, "matched_to"] = str(key) else: print("%s cases matched" % (len(case_dictionary.keys()))) for case in case_dictionary: for control in case_dictionary[case]: if control in control_dictionary: control_dictionary[control].append(case) else: control_dictionary[control] = [case] matchDF.at[case, "matched_to"] = ", ".join(sorted(case_dictionary[case])) for control in control_dictionary: matchDF.at[control, "matched_to"] = ", ".join( sorted(control_dictionary[control])) matchedMD = Metadata(matchDF) if only_matches: ids = matchedMD.get_ids("matched_to NOT IN ('none')") #shrinks the MD to only have matched samples matchedMD = matchedMD.filter_ids(ids) return matchedMD
def simple_plot(output_dir, table: biom.Table, feature_tree: skbio.TreeNode,
                metadata: q2.Metadata, case_where: str, control_where: str,
                n_transects: int = 10, stratify_by: str = None,
                mode: str = 'max'):
    """Build the transect visualization for case vs. control samples.

    Filters the table to the case/control samples, shears the feature tree
    to the surviving features, collapses the table at evenly spaced tree
    transects, and writes PNG layers, rank files, a collapsed-groups TSV,
    and the rendered index.html into ``output_dir``.
    """
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    def _save_layer(fig, basename):
        # Persist one figure as a transparent PNG layer, then free it.
        fig.savefig(os.path.join(layer_dir, basename), transparent=True)
        plt.close(fig)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(metadata.get_ids(case_where))
    control_samples = sorted(metadata.get_ids(control_where))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    # Zero out missing branch lengths so distances are well-defined.
    for node in feature_tree.traverse():
        if not node.length:
            node.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        # Can't cut more transects than distinct node depths exist.
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)
    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine

    colors = []
    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))
    _save_layer(color_fig, 'original.png')
    _save_layer(highlight_fig, 'original.h.png')
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())
        rank_files.append(
            write_ranks(rank_dir, collapsed_counts, ranks, distance))

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)
        _save_layer(color_fig, 'T_%s.png' % distance)
        _save_layer(highlight_fig, 'T_%s.h.png' % distance)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    _save_layer(figure, 'trajectory.png')
    background = next(figure_gen)
    _save_layer(background, 'bg.png')

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        layer_names = (['original'] + ['T_%s' % d for d in transects]
                       + ['trajectory'])
        layer_colors = list(map(to_hex, colors)) + ['red']
        fh.write(
            template.render({
                'legend': list(zip(layer_names, layer_colors)),
                'filenames': rank_files
            }))