def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')
    # Escape single quotes as unicode escape sequences (which are valid in
    # JSON string values) so the serialized table can be safely embedded in
    # the template.
    table = table.replace("'", r'\u0027')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir, context={
        'table': table,
        'page_size': page_size
    })

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
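# A minimal usage sketch for the tabulate visualizer above, assuming this
# module is importable; the metadata values and output path are invented
# for the example.
import os
import pandas as pd
import qiime2

md = qiime2.Metadata(pd.DataFrame({'body-site': ['gut', 'skin']},
                                  index=pd.Index(['S1', 'S2'], name='id')))
os.makedirs('tabulate-out', exist_ok=True)
tabulate('tabulate-out', md, page_size=50)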
def visualize_stats(output_dir: str, filter_stats: pd.DataFrame) -> None:
    sums = filter_stats.sum()
    sums.name = 'Totals'
    # DataFrame.append was removed in pandas 2.x; concat the totals row on.
    filter_stats = pd.concat([filter_stats, sums.to_frame().T])
    filter_stats.sort_values('total-input-reads', inplace=True,
                             ascending=False)

    total_retained = filter_stats['total-retained-reads']
    total_input = filter_stats['total-input-reads']
    filter_stats['fraction-retained'] = total_retained / total_input

    # reorder so that fraction-retained follows the total-input-reads and
    # total-retained-reads columns
    columns = list(filter_stats.columns)[:-1]
    columns.insert(2, 'fraction-retained')
    filter_stats = filter_stats[columns]

    html = filter_stats.to_html(classes='table table-striped table-hover')
    html = html.replace('border="1"', 'border="0"')
    index = os.path.join(TEMPLATES, 'index.html')
    context = {'result': html}
    q2templates.render(index, output_dir, context=context)
def tabulate(output_dir: str, data: pd.Series) -> None:
    prepped = []
    # Series.iteritems was removed in pandas 2.x; items() is the equivalent.
    for _id, taxa in data.items():
        prepped.append({'id': _id, 'taxa': taxa})
    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir, context={'data': prepped})
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must not be greater '
                         'than max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    if min_fraction == max_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    rounded_fractions = _round_fractions(fractions)

    data = []
    file_links = []
    for fraction, rounded_fraction in zip(fractions, rounded_fractions):
        core_features = _get_core_features(table, fraction)
        core_feature_count = len(core_features)
        data.append([fraction, core_feature_count])

        if core_feature_count > 0:
            core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
            core_feature_fp = os.path.join(output_dir, core_feature_fn)
            file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)
            core_features.to_csv(core_feature_fp, sep='\t',
                                 index_label='Feature ID')
        else:
            file_links.append('No core features')

    df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = file_links

    # newer versions of seaborn don't like dataframes with fewer than two rows
    if len(fractions) > 1:
        ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                         fit_reg=False)
        # matplotlib will issue a UserWarning if attempting to set left and
        # right bounds to the same value.
        ax.set_xbound(min(fractions), max(fractions))
        ax.set_ybound(0, max(df['Feature count']) + 1)
        ax.get_figure().savefig(
            os.path.join(output_dir, 'core-feature-counts.svg'))
        context['show_plot'] = True

    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)
    q2templates.render(index_fp, output_dir, context=context)
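# A minimal, hypothetical sketch of what the `_get_core_features` helper used
# above computes (the real implementation lives elsewhere in this module): a
# feature is "core" at a given fraction if it is observed (count > 0) in at
# least that fraction of samples. The name, columns, and return shape here
# are assumptions for illustration only.
import numpy as np
import pandas as pd

def _get_core_features_sketch(table, fraction):
    data = table.matrix_data.toarray()  # features x samples
    presence = (data > 0).mean(axis=1)  # per-feature fraction of samples
    core = presence >= fraction
    feature_ids = np.asarray(table.ids(axis='observation'))[core]
    return pd.DataFrame({'Fraction of samples': presence[core]},
                        index=feature_ids)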
def _visualize_anova(output_dir, pairwise_tests=False, model_results=False,
                     residuals=False, pairwise_test_name='Pairwise t-tests'):
    # 'display.max_colwidth' of -1 is deprecated; None means "no limit".
    pd.set_option('display.max_colwidth', None)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                         sep='\t')
    model_results = q2templates.df_to_html(model_results)

    residuals.savefig(os.path.join(output_dir, 'residuals.png'),
                      bbox_inches='tight')
    residuals.savefig(os.path.join(output_dir, 'residuals.pdf'),
                      bbox_inches='tight')
    plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'plot_name': 'ANOVA',
        'model_results': model_results,
        'pairwise_tests': pairwise_tests,
        'residuals': residuals,
        'pairwise_test_name': pairwise_test_name,
    })
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None,
                  feature_metadata: qiime2.Metadata = None):
    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the
    # non-numeric columns, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(
        classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
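# A small self-contained illustration (not plugin code) of the
# metadata-filtering steps above: coerce columns to numeric where possible,
# keep only numeric columns without missing values, then drop zero-variance
# columns. Column names and values are invented; note errors='ignore'
# mirrors the function above but is deprecated in newer pandas.
import numpy
import pandas as pd

md = pd.DataFrame({'ph': ['6.8', '7.1', '7.0'],
                   'site': ['A', 'B', 'A'],
                   'depth': [10, 10, 10]},
                  index=['S1', 'S2', 'S3'])
md = md.apply(lambda x: pd.to_numeric(x, errors='ignore'))
numeric = md.select_dtypes([numpy.number]).dropna()  # drops 'site'
numeric = numeric.loc[:, numeric.var() != 0]         # drops 'depth'
print(list(numeric.columns))  # ['ph']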
def heatmap(output_dir: str, ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average', metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix', x_labels: bool = False,
            y_labels: bool = False, level: int = -1) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()
    if metabolite_metadata is not None:
        metabolite_metadata = metabolite_metadata.to_series()

    hotmap = ranks_heatmap(ranks, microbe_metadata, metabolite_metadata,
                           method, metric, color_palette, margin_palette,
                           x_labels, y_labels, level)

    hotmap.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmap.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Rank Heatmap',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png'
    })
def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df_columns = pd.MultiIndex.from_tuples(
        [(n, t.type) for n, t in input.columns.items()],
        names=['column header', 'type'])
    df.columns = df_columns
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir, context={
        'table': table,
        'page_size': page_size
    })

    input.save(os.path.join(output_dir, 'metadata.tsv'))

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
def _visualize(output_dir, estimator, cm, importances=None,
               optimize_feature_selection=True, title='results'):
    # 'display.max_colwidth' of -1 is deprecated; None means "no limit".
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(output_dir, 'predictive_accuracy.tsv'),
                  sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(output_dir, 'feature_importance.tsv'),
                           sep='\t', index=True)
        importances = q2templates.df_to_html(importances, index=True)
    else:
        importances = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'importances': importances,
        'classification': True,
        'optimize_feature_selection': optimize_feature_selection,
        'maturity_index': False})
def summarize(
    output_dir: str,
    problem: zarr.hierarchy.Group,
    taxa: skbio.TreeNode = None,
    maxplot: int = 200,
    predictions: zarr.hierarchy.Group = None,
):
    context = build_context(output_dir, problem, predictions, taxa, maxplot)

    index = os.path.join(assets, "index.html")
    overview_template = os.path.join(assets, "overview.html")
    path_template = os.path.join(assets, "path.html")
    cv_template = os.path.join(assets, "cv.html")
    stabsel_template = os.path.join(assets, "stabsel.html")
    lam_fixed_template = os.path.join(assets, "lam-fixed.html")
    templates = [
        index,
        overview_template,
        path_template,
        cv_template,
        stabsel_template,
        lam_fixed_template,
    ]

    q2templates.render(templates, output_dir, context=context)
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    seq_lengths = []
    with open(os.path.join(output_dir, 'sequences.fasta'), 'w') as fh:
        for sequence in data:
            skbio.io.write(sequence, format='fasta', into=fh)
            str_seq = str(sequence)
            seq_len = len(str_seq)
            sequences.append({
                'id': sequence.metadata['id'],
                'len': seq_len,
                'url': _blast_url_template % str_seq,
                'seq': str_seq
            })
            seq_lengths.append(seq_len)

    seq_len_stats = _compute_descriptive_stats(seq_lengths)
    _write_tsvs_of_descriptive_stats(seq_len_stats, output_dir)

    index = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'data': sequences,
        'stats': seq_len_stats
    })

    js = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'js',
                      'tsorter.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))
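# A minimal usage sketch for the tabulate_seqs visualizer above. It assumes
# this module is importable and that q2_types' DNAIterator wraps any
# iterator of skbio.DNA objects carrying an 'id' in their metadata; the
# sequences and output path are invented for the example.
import os
import skbio
from q2_types.feature_data import DNAIterator

seqs = DNAIterator(iter([
    skbio.DNA('ACGTACGT', metadata={'id': 'seq1'}),
    skbio.DNA('GGGTTTAA', metadata={'id': 'seq2'}),
]))
os.makedirs('tabulate-seqs-out', exist_ok=True)
tabulate_seqs('tabulate-seqs-out', seqs)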
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None):
    mf = metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
def _visualize(output_dir, estimator, cm, roc,
               optimize_feature_selection=True, title='results'):
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(output_dir, 'predictive_accuracy.tsv'),
                  sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if roc is not None:
        roc = True

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'roc': roc,
        'optimize_feature_selection': optimize_feature_selection})
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata) -> None:
    metadata = metadata.to_dataframe()
    filenames = []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Join collapsed table with metadata
        taxa_cols = df.columns.values.tolist()
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move SampleID index into columns
        df = df.fillna('')  # JS sort works best with empty strings vs null
        all_cols = df.columns.values.tolist()

        filename = 'lvl-%d.jsonp' % level
        filenames.append(filename)
        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('Level %d'," % level)
            json.dump(taxa_cols, fh)
            fh.write(",")
            json.dump(all_cols, fh)
            fh.write(",")
            df.to_json(fh, orient='records')
            fh.write(");")

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'filenames': filenames})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dst'),
                    os.path.join(output_dir, 'dist'))
def heatmap(output_dir, table: pd.DataFrame,
            sample_metadata: qiime2.CategoricalMetadataColumn = None,
            feature_metadata: qiime2.CategoricalMetadataColumn = None,
            normalize: bool = True, title: str = None,
            metric: str = 'euclidean', method: str = 'average',
            cluster: str = 'both', color_scheme: str = 'rocket') -> None:
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    if sample_metadata is not None:
        table = _munge_sample_metadata(sample_metadata, table, cluster)

    # relabel feature table feature IDs with feature metadata column values
    if feature_metadata is not None:
        table = _munge_feature_metadata(feature_metadata, table, cluster)

    cbar_label = 'frequency'
    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'

    # Hard-coded values for reasonable plots
    scaletron, labelsize, dpi = 50, 8, 100
    sns.set(rc={'xtick.labelsize': labelsize,
                'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    width, height = table.shape[1] / scaletron, table.shape[0] / scaletron

    heatmap_plot = sns.clustermap(table, method=method, metric=metric,
                                  **_clustering_map[cluster],
                                  cmap=color_scheme, xticklabels=True,
                                  yticklabels=True,
                                  cbar_kws={'label': cbar_label})
    if title is not None:
        heatmap_plot.fig.suptitle(title)

    hm = heatmap_plot.ax_heatmap.get_position()
    cbar = heatmap_plot.cax.get_position()
    row = heatmap_plot.ax_row_dendrogram.get_position()
    col = heatmap_plot.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    heatmap_plot.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    heatmap_plot.cax.set_position([cbar.x0, hm.y0 + height, cbar.width,
                                   cbar.height])
    heatmap_plot.ax_row_dendrogram.set_position([row.x0, row.y0, row.width,
                                                 height])
    heatmap_plot.ax_col_dendrogram.set_position([col.x0, hm.y0 + height,
                                                 width, col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(heatmap_plot.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(heatmap_plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ['png', 'svg']:
        img_fp = os.path.join(output_dir, 'feature-table-heatmap.%s' % ext)
        heatmap_plot.savefig(img_fp)

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
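# A minimal usage sketch for the heatmap visualizer above, assuming this
# module is importable. The toy table (samples as rows, features as columns)
# and output path are invented for the example.
import os
import pandas as pd

toy = pd.DataFrame([[4, 10, 0], [2, 8, 6], [0, 1, 12]],
                   index=['S1', 'S2', 'S3'],
                   columns=['F1', 'F2', 'F3'])
os.makedirs('heatmap-out', exist_ok=True)
heatmap('heatmap-out', toy, normalize=True, cluster='both')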
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               errors=False, model_summary=False, model_results=False,
               raw_data=False, plot_name='Pairwise difference boxplot',
               pairwise_test_name='Pairwise group comparison tests'):
    # 'display.max_colwidth' of -1 is deprecated; None means "no limit".
    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = q2templates.df_to_html(summary.to_frame())

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = q2templates.df_to_html(multiple_group_test)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    if raw_data is not False:
        raw_data.to_csv(os.path.join(output_dir, 'raw-data.tsv'), sep='\t')
        raw_data = True

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(os.path.join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = q2templates.df_to_html(
            paired_difference_tests)

    if model_summary is not False:
        model_summary.to_csv(os.path.join(output_dir, 'model_summary.tsv'),
                             sep='\t')
        model_summary = q2templates.df_to_html(model_summary)

    if model_results is not False:
        model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                             sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if plot is not False:
        plot.savefig(os.path.join(output_dir, 'plot.png'),
                     bbox_inches='tight')
        plot.savefig(os.path.join(output_dir, 'plot.pdf'),
                     bbox_inches='tight')

    plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
        'plot_name': plot_name,
        'raw_data': raw_data,
        'pairwise_test_name': pairwise_test_name,
    })
def _visualize_maturity_index(table, metadata, group_by, column,
                              predicted_column, importances, estimator,
                              accuracy, output_dir, maz_stats=True):
    # 'display.max_colwidth' of -1 is deprecated; None means "no limit".
    pd.set_option('display.max_colwidth', None)

    maturity = '{0} maturity'.format(column)
    maz = '{0} MAZ score'.format(column)

    # save feature importance data and convert to html
    importances = sort_importances(importances)
    importances.to_csv(join(output_dir, 'feature_importance.tsv'),
                       index=True, sep='\t')
    importance = q2templates.df_to_html(importances, index=True)

    # save predicted values, maturity, and MAZ score data
    maz_md = metadata[[group_by, column, predicted_column, maturity, maz]]
    maz_md.to_csv(join(output_dir, 'maz_scores.tsv'), sep='\t')
    if maz_stats:
        maz_aov = _two_way_anova(table, metadata, maz, group_by, column)[0]
        maz_aov.to_csv(join(output_dir, 'maz_aov.tsv'), sep='\t')
        maz_pairwise = _pairwise_stats(table, metadata, maz, group_by, column)
        maz_pairwise.to_csv(join(output_dir, 'maz_pairwise.tsv'), sep='\t')

    # plot control/treatment predicted vs. actual values
    g = _lmplot_from_dataframe(metadata, column, predicted_column, group_by)
    g.savefig(join(output_dir, 'maz_predictions.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_predictions.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot boxplots of MAZ score vs. column (e.g., age)
    g = _boxplot_from_dataframe(metadata, column, maz, group_by)
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.png'), bbox_inches='tight')
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot heatmap of column (e.g., age) vs. abundance of top features
    top = table[list(importances.index)]
    g = _clustermap_from_dataframe(top, metadata, group_by, column)
    g.savefig(join(output_dir, 'maz_heatmaps.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_heatmaps.pdf'), bbox_inches='tight')

    result = _extract_estimator_parameters(estimator)
    # Series.append returned a new object (and was removed in pandas 2.x),
    # so concatenate and reassign to actually keep the accuracy row.
    result = pd.concat(
        [result, pd.Series([accuracy], index=['Accuracy score'])])
    result = q2templates.df_to_html(result.to_frame())

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'maturity index predictions',
        'result': result,
        'predictions': None,
        'importances': importance,
        'classification': False,
        'optimize_feature_selection': True,
        'maturity_index': True})
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    for sequence in data:
        str_seq = str(sequence)
        sequences.append({"id": sequence.metadata["id"],
                          "url": _blast_url_template % str_seq,
                          "seq": str_seq})

    index = os.path.join(TEMPLATES, "tabulate_seqs_assets", "index.html")
    q2templates.render(index, output_dir, context={"data": sequences})
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must not be greater '
                         'than max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    if min_fraction == max_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    rounded_fractions = _round_fractions(fractions)

    data = []
    file_links = []
    for fraction, rounded_fraction in zip(fractions, rounded_fractions):
        core_features = _get_core_features(table, fraction)
        core_feature_count = len(core_features)
        data.append([fraction, core_feature_count])

        if core_feature_count > 0:
            core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
            core_feature_fp = os.path.join(output_dir, core_feature_fn)
            file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)
            core_features.to_csv(core_feature_fp, sep='\t',
                                 index_label='Feature ID')
        else:
            file_links.append('No core features')

    df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = file_links

    ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                     fit_reg=False)
    # matplotlib will issue a UserWarning if attempting to set left and right
    # bounds to the same value.
    if min_fraction != max_fraction:
        ax.set_xbound(min(fractions), max(fractions))
    ax.set_ybound(0, max(df['Feature count']) + 1)

    ax.get_figure().savefig(
        os.path.join(output_dir, 'core-feature-counts.svg'))

    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)
    q2templates.render(index_fp, output_dir, context=context)
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               model_summary=False, model_results=False):
    # 'display.max_colwidth' of -1 is deprecated; None means "no limit".
    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = summary.to_frame().to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = multiple_group_test.to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if pairwise_tests is not False:
        pairwise_tests.to_csv(join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = pairwise_tests.to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = paired_difference_tests.to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if model_summary is not False:
        model_summary.to_csv(join(output_dir, 'model_summary.tsv'), sep='\t')
        model_summary = model_summary.to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if model_results is not False:
        model_results.to_csv(join(output_dir, 'model_results.tsv'), sep='\t')
        model_results = model_results.to_html(classes=(
            "table table-striped table-hover")).replace(
            'border="1"', 'border="0"')

    if plot is not False:
        plot.savefig(join(output_dir, 'plot.png'), bbox_inches='tight')
        plot.savefig(join(output_dir, 'plot.pdf'), bbox_inches='tight')

    plt.close('all')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
    })
def _visualize_knn(output_dir, params: pd.Series):
    result = q2templates.df_to_html(params.to_frame())

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Estimator Summary',
        'result': result,
        'predictions': None,
        'importances': None,
        'classification': True,
        'optimize_feature_selection': False})
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html', 'title': 'PCoA'},
                 {'url': 'heatmap.html', 'title': 'Heatmap'},
                 {'url': 'tree.html', 'title': 'Clustering'}]
    }
    q2templates.render(templates, output_dir, context=context)
def heatmap(output_dir, table: pd.DataFrame,
            metadata: qiime2.CategoricalMetadataColumn = None,
            normalize: bool = True, title: str = None,
            metric: str = 'euclidean', method: str = 'average',
            cluster: str = 'both', color_scheme: str = 'rocket') -> None:
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    if metadata is not None:
        table = _munge_metadata(metadata, table, cluster)

    cbar_label = 'frequency'
    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'

    # Hard-coded values for reasonable plots
    scaletron, labelsize, dpi = 50, 8, 100
    sns.set(rc={'xtick.labelsize': labelsize,
                'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    width, height = table.shape[1] / scaletron, table.shape[0] / scaletron

    heatmap_plot = sns.clustermap(table, method=method, metric=metric,
                                  **_clustering_map[cluster],
                                  cmap=color_scheme, xticklabels=True,
                                  yticklabels=True,
                                  cbar_kws={'label': cbar_label})
    if title is not None:
        heatmap_plot.fig.suptitle(title)

    hm = heatmap_plot.ax_heatmap.get_position()
    cbar = heatmap_plot.cax.get_position()
    row = heatmap_plot.ax_row_dendrogram.get_position()
    col = heatmap_plot.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    heatmap_plot.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    heatmap_plot.cax.set_position([cbar.x0, hm.y0 + height, cbar.width,
                                   cbar.height])
    heatmap_plot.ax_row_dendrogram.set_position([row.x0, row.y0, row.width,
                                                 height])
    heatmap_plot.ax_col_dendrogram.set_position([col.x0, hm.y0 + height,
                                                 width, col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(heatmap_plot.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(heatmap_plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ['png', 'svg']:
        img_fp = os.path.join(output_dir, 'feature-table-heatmap.%s' % ext)
        heatmap_plot.savefig(img_fp)

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
def save_animated_map(output_dir, lat_min, lat_max, data, column):
    # save fig, which is really a legend
    plt.savefig(join(output_dir, 'colorbar.png'), bbox_inches='tight')

    # copy all js/css utilities
    in_path = partial(join, TEMPLATES, 'animated_map')
    copytree(in_path('static'), join(output_dir, 'static'))

    # save template
    q2templates.render(in_path('index.html'), output_dir, context={
        'lat_min': lat_min,
        'lat_max': lat_max,
        'data': data,
        'column': column})
def mapviz(output_dir, results=None, title='Coordinates'):
    if results is not None:
        results.to_csv(join(output_dir, 'results.tsv'), sep='\t', index=True)
        results = q2templates.df_to_html(results)
    else:
        results = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'results': results, 'title': title})
def summarize(output_dir: str, problem: zarr.hierarchy.Group):
    beta = pd.DataFrame(data={'label': problem['label'],
                              'beta': problem['solution/LAMfixed/refit']})
    beta.to_csv(os.path.join(output_dir, 'beta.csv'),
                header=True, index=False)

    # placeholder plot, currently disabled
    show_plot = False
    if show_plot:
        x = np.linspace(0, 1)
        y = x ** 2
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ax.plot(x, y, color='blue')
        fig.savefig(os.path.join(output_dir, 'test-plot.png'))

    html = q2templates.df_to_html(beta, index=False)
    context = {
        'dico': {'un': 1, 'deux': 2},
        'result': html,
        'n_features': len(beta),
        'beta': beta,
        'show_plot': show_plot,
        'tabs': [{'title': 'Overview', 'url': 'overview.html'},
                 {'title': 'LAM fixed', 'url': 'lam-fixed.html'}],
        'dangers': [],
        'warnings': [],
    }

    index = os.path.join(TEMPLATES, 'assets', 'index.html')
    overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
    quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html')
    templates = [index, overview_template, quality_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump({'selected param': 10}, fh)
        fh.write(',')
        beta.to_json(fh)
        fh.write(');')
def seq_depth(output_dir: str, table: pd.DataFrame, metadata: qiime2.Metadata,
              mypar: float = 4) -> None:
    table_path = os.path.join(output_dir, 'table.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')
    table.to_csv(table_path)
    metadata.save(metadata_path)

    # Run the R script against the exported table and metadata
    cmd_path = os.path.join(TEMPLATES, 'seq_depth.R')
    cmd = ['Rscript', cmd_path, output_dir, table_path, metadata_path]
    subprocess.run(cmd, check=True)

    index = os.path.join(TEMPLATES, 'index.html')

    # Errors filepath, load in as list
    errors_fp = os.path.join(output_dir, 'warnings.txt')
    with open(errors_fp, 'r') as errors_f:
        errors = list(errors_f)

    # Load in depths as a pandas data frame, then transfer to html
    depths = pd.read_csv(os.path.join(output_dir, 'mytable.tsv'), sep="\t")
    depths = q2templates.df_to_html(depths)

    # Load in plot
    plot_fp = os.path.join(output_dir, 'myplot.png')

    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': None,
        'model_summary': None,
        'model_results': depths,
        'multiple_group_test': None,
        'pairwise_tests': None,
        'paired_difference_tests': None,
        'plot': True,
        'plot_name': "My Plot",
        'raw_data': None,
        'pairwise_test_name': None,
    })
def report(output_dir: str, pcoa: skbio.OrdinationResults, metadata: Metadata,
           alpha: pd.Series, table: biom.Table, taxonomy: pd.Series,
           samples: list) -> None:
    metadata = metadata.to_dataframe()
    _insanity_checker(samples, metadata, table, alpha, pcoa)

    index = os.path.join(TEMPLATES, 'report', 'index.html')
    q2templates.render(index, output_dir, context={'name': 'foo'})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'report', 'resources'),
                    os.path.join(output_dir, 'resources'))
def paired_heatmap(output_dir: str, ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table, features: str = None,
                   top_k_microbes: int = 2, keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10', color_palette: str = 'magma',
                   top_k_metabolites: int = 50, level: int = -1,
                   row_center: bool = True) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()

    ranks = ranks.T
    if row_center:
        ranks = ranks - ranks.mean(axis=0)

    select_microbes, select_metabolites, hotmaps = paired_heatmaps(
        ranks, microbes_table, metabolites_table, microbe_metadata, features,
        top_k_microbes, top_k_metabolites, keep_top_samples, level, normalize,
        color_palette)

    hotmaps.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmaps.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')
    select_microbes.to_csv(join(output_dir, 'select_microbes.tsv'), sep='\t')
    select_metabolites.to_csv(join(output_dir, 'select_metabolites.tsv'),
                              sep='\t')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Paired Feature Abundance Heatmaps',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png',
        'table1_fp': 'select_microbes.tsv',
        'download1_text': 'Download microbe abundances as TSV',
        'table2_fp': 'select_metabolites.tsv',
        'download2_text': 'Download top k metabolite abundances as TSV'
    })
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result
    })
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata = None) -> None:
    if metadata is None:
        metadata = Metadata(
            pd.DataFrame({'id': table.index}).set_index('id'))

    ids_not_in_metadata = set(table.index) - set(metadata.ids)
    if ids_not_in_metadata:
        raise ValueError('Sample IDs found in the table are missing in the '
                         f'metadata: {ids_not_in_metadata!r}.')

    metadata = metadata.to_dataframe()
    jsonp_files, csv_files = [], []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Stash column labels before manipulating dataframe
        taxa_cols = df.columns.values.tolist()

        # Join collapsed table with metadata
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move index into columns
        # Our JS sort works best with empty strings vs nulls
        df = df.fillna('')
        all_cols = df.columns.values.tolist()

        jsonp_file = 'level-%d.jsonp' % level
        csv_file = 'level-%d.csv' % level
        jsonp_files.append(jsonp_file)
        csv_files.append(csv_file)

        df.to_csv(os.path.join(output_dir, csv_file), index=False)

        with open(os.path.join(output_dir, jsonp_file), 'w') as fh:
            fh.write('load_data(%d,' % level)
            json.dump(taxa_cols, fh)
            fh.write(',')
            json.dump(all_cols, fh)
            fh.write(',')
            df.to_json(fh, orient='records')
            fh.write(');')

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir,
                       context={'jsonp_files': jsonp_files})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dist'),
                    os.path.join(output_dir, 'dist'))
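# A minimal usage sketch for the barplot above, assuming this module is
# importable. The feature table has samples as rows and feature IDs as
# columns, and the taxonomy Series maps feature IDs to semicolon-delimited
# lineage strings; all values are invented for the example.
import os
import pandas as pd

table = pd.DataFrame([[3, 1], [0, 4]], index=['S1', 'S2'],
                     columns=['feat1', 'feat2'])
taxonomy = pd.Series({'feat1': 'k__Bacteria;p__Firmicutes',
                      'feat2': 'k__Bacteria;p__Bacteroidetes'})
os.makedirs('barplot-out', exist_ok=True)
barplot('barplot-out', table, taxonomy)  # metadata defaults to sample IDs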
def summarize(output_dir: str, problem: zarr.hierarchy.Group):
    context = build_context(output_dir, problem)

    index = os.path.join(assets, 'index.html')
    overview_template = os.path.join(assets, 'overview.html')
    path_template = os.path.join(assets, 'path.html')
    cv_template = os.path.join(assets, 'cv.html')
    stabsel_template = os.path.join(assets, 'stabsel.html')
    lam_fixed_template = os.path.join(assets, 'lam-fixed.html')
    templates = [index, overview_template, path_template, cv_template,
                 stabsel_template, lam_fixed_template]

    q2templates.render(templates, output_dir, context=context)
def kNN_LOOCV_F_measures(output_dir: str, nearest_neighbors: dict,
                         class_weight: DataFrame):
    y = nearest_neighbors['taxonomies']
    indices = nearest_neighbors['neighbors']
    weights = class_weight.T['Weight'].to_dict()
    uniform = _loocv(y, indices, weights, True)
    bespoke = _loocv(y, indices, weights)

    index = os.path.join(TEMPLATES, 'index.html')
    f_measures = DataFrame(
        {'F-measure': [bespoke, uniform, bespoke - uniform]},
        index=['Weighted', 'Uniform', 'Difference'])
    f_measures = q2templates.df_to_html(f_measures)
    q2templates.render(index, output_dir, context={
        'title': 'Indicators of Taxonomic Weight Importance',
        'f_measures': f_measures,
    })
def save_viz(viz, output_dir):
    """Saves an Empress visualization to a filepath.

    Parameters
    ----------
    viz : empress.Empress
    output_dir : str
    """
    with open(os.path.join(output_dir, 'empress.html'), 'w') as htmlfile:
        htmlfile.write(str(viz))

    viz.copy_support_files(output_dir)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    with open(os.path.join(output_dir, 'sequences.fasta'), 'w') as fh:
        for sequence in data:
            skbio.io.write(sequence, format='fasta', into=fh)
            str_seq = str(sequence)
            sequences.append({'id': sequence.metadata['id'],
                              'url': _blast_url_template % str_seq,
                              'seq': str_seq})

    index = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'index.html')
    q2templates.render(index, output_dir, context={'data': sequences})

    js = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'js',
                      'tsorter.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str = None) -> None:
    mf = metadata.to_dataframe()
    viz = Emperor(pcoa, mf, remote='.')

    if custom_axis is not None:
        # put custom_axis inside a list to workaround the type system not
        # supporting lists of types
        html = viz.make_emperor(standalone=True, custom_axes=[custom_axis])
    else:
        html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
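# A minimal usage sketch for the Emperor plot above, assuming this module is
# importable and Emperor's dependencies are installed. The ordination is
# computed from an invented six-sample distance matrix so enough axes exist
# for Emperor's defaults; all values and paths are made up for the example.
import os
import numpy as np
import pandas as pd
import qiime2
import skbio
from scipy.spatial.distance import pdist, squareform
from skbio.stats.ordination import pcoa

ids = ['S%d' % i for i in range(1, 7)]
rng = np.random.default_rng(0)
dm = skbio.DistanceMatrix(squareform(pdist(rng.random((6, 3)))), ids=ids)
ordination = pcoa(dm)
md = qiime2.Metadata(pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b']},
                                  index=pd.Index(ids, name='id')))
os.makedirs('emperor-out', exist_ok=True)
plot('emperor-out', ordination, md)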
def summarize(output_dir: str, table: biom.Table) -> None:
    number_of_samples = table.shape[1]
    number_of_features = table.shape[0]

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis="sample")
    if number_of_samples > 1:
        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True)
        sample_frequencies_ax.set_xlabel("Frequency per sample")
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, "sample-frequencies.pdf"))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, "sample-frequencies.png"))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis="observation")
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=True)
        feature_frequencies_ax.set_xlabel("Frequency per feature")
        feature_frequencies_ax.set_xscale("log")
        feature_frequencies_ax.set_yscale("log")
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, "feature-frequencies.pdf"))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, "feature-frequencies.png"))

    sample_summary_table = _format_html_table(
        sample_summary.to_frame("Frequency"))
    feature_summary_table = _format_html_table(
        feature_summary.to_frame("Frequency"))

    index = os.path.join(TEMPLATES, "summarize_assets", "index.html")
    context = {
        "number_of_samples": number_of_samples,
        "number_of_features": number_of_features,
        "total_frequencies": int(np.sum(sample_frequencies)),
        "sample_summary_table": sample_summary_table,
        "feature_summary_table": feature_summary_table,
    }

    sample_frequencies.sort_values(inplace=True)
    sample_frequencies.to_csv(
        os.path.join(output_dir, "sample-frequency-detail.csv"))
    sample_frequencies_table = _format_html_table(
        sample_frequencies.to_frame("Frequency"))
    sample_frequency_template = os.path.join(
        TEMPLATES, "summarize_assets", "sample-frequency-detail.html")
    context.update({"sample_frequencies_table": sample_frequencies_table})

    templates = [index, sample_frequency_template]
    q2templates.render(templates, output_dir, context=context)
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
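# A tiny self-contained illustration (not plugin code) of the
# skbio.stats.distance.bioenv call that both bioenv visualizers above are
# built around; the distance matrix and metadata values are invented.
import pandas as pd
import skbio
from skbio.stats.distance import bioenv as skbio_bioenv

dm = skbio.DistanceMatrix([[0.0, 0.5, 0.9],
                           [0.5, 0.0, 0.4],
                           [0.9, 0.4, 0.0]], ids=['S1', 'S2', 'S3'])
md = pd.DataFrame({'ph': [6.8, 7.1, 7.9], 'temp': [12.0, 14.5, 21.0]},
                  index=['S1', 'S2', 'S3'])
print(skbio_bioenv(dm, md))  # best variable subset by Spearman correlation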
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: str = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
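# A short standalone illustration (not plugin code) of the patsy formula
# validation used above: ModelDesc.from_formula parses the right-hand side
# into terms, and each factor's name should correspond to a metadata column.
# The formula string is invented for the example.
from patsy import ModelDesc

desc = ModelDesc.from_formula('group + ph:temp')
for term in desc.rhs_termlist:
    for factor in term.factors:
        print(factor.name())  # -> 'group', 'ph', 'temp'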
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))
            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(
                lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})

    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category,
                                        counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
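# A minimal usage sketch for alpha_rarefaction above, assuming this module
# is importable; the toy biom.Table values and output path are invented.
# max_depth is chosen so that max_depth - min_depth >= steps (default 10)
# and no sample total frequency is exceeded.
import os
import biom
import numpy as np

table = biom.Table(np.array([[5, 20, 3], [12, 0, 9]]),
                   observation_ids=['F1', 'F2'],
                   sample_ids=['S1', 'S2', 'S3'])
os.makedirs('rarefaction-out', exist_ok=True)
alpha_rarefaction('rarefaction-out', table, max_depth=12)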
def ancom(output_dir: str, table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table, metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             'signal': "{{'title': datum['index'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}"
                                       .format(transform_function_name)
                         }}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
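# A small self-contained sketch (assumptions, not plugin code) of the kind
# of transform and difference functions the ancom visualizer above looks up
# via _transform_functions and _difference_functions: a centered log-ratio
# transform and a simple two-group mean difference.
import numpy as np

def clr_sketch(x):
    # centered log-ratio: log of each part minus the mean of the logs
    logs = np.log(x)
    return logs - logs.mean()

def mean_difference_sketch(a, b):
    return np.mean(a) - np.mean(b)

counts = np.array([10.0, 5.0, 1.0, 4.0])
print(clr_sketch(counts))  # sums to ~0 by construction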
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The '
                         'available options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance
    # matrix, and drop samples that have no data for this metadata category,
    # including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the
    # filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in
    # that group. The order is used both on the x-axis, and in the layout
    # of the boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })

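# Standalone sketch of the multiple-testing correction used above: the
# `multipletests(..., method='fdr_bh')[1]` call takes the raw pairwise
# p-values and returns Benjamini-Hochberg-adjusted p-values (q-values) as
# the second element of its result tuple. The toy p-values below are
# illustrative only.

from statsmodels.stats.multitest import multipletests

pvals = [0.001, 0.02, 0.04, 0.30]
reject, qvals, _, _ = multipletests(pvals, method='fdr_bh')
print(list(zip(pvals, qvals.round(4), reject)))
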
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel '
                         'test. Non-numeric data was encountered in the '
                         'sample metadata. Original error message follows:'
                         '\n%s' % str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name,
         r, p],
        index=['Method', 'Sample size', 'Permutations',
               'Alternative hypothesis', 'Metadata category',
               '%s %s' % (method.title(), test_statistics[method]),
               'p-value'],
        name='Mantel test results')

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append((distance_matrix[id1, id2],
                             metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    fig = sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False).get_figure()
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })

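# `_metadata_distance` is a private helper defined elsewhere; a plausible
# minimal sketch is shown here for orientation only (an assumption, not the
# canonical implementation). It builds a DistanceMatrix of absolute
# differences between the numeric metadata values, keyed by sample ID, which
# is consistent with the 'Euclidean distance' axis label in the scatter plot
# above (Euclidean distance in one dimension is the absolute difference).

import pandas as pd
import skbio


def metadata_distance_sketch(metadata: pd.Series) -> skbio.DistanceMatrix:
    # Pairwise |a - b| over the series values, with sample IDs as keys.
    return skbio.DistanceMatrix.from_iterable(
        metadata, metric=lambda a, b: abs(a - b), keys=metadata.index)


print(metadata_distance_sketch(
    pd.Series([1.0, 3.0, 6.0], index=['S1', 'S2', 'S3'])))
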
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The '
                         'available options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the
    # user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in
    # that group. The order is used both on the x-axis, and in the layout
    # of the boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })

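# Quick standalone demonstration (illustrative only) of the
# grouping-repartition logic above: starting from three plots per row, the
# row width shrinks until it divides the number of groups evenly, and the
# Bootstrap column size is the 12-column grid divided by that width.

def partition_rows_sketch(group_ids):
    row_count, group_count = 3, len(group_ids)
    while group_count % row_count != 0:
        row_count = row_count - 1
    rows = [group_ids[g:g + row_count]
            for g in range(0, group_count, row_count)]
    return rows, int(12 / row_count)


# Four groups pack two per row (column size 6); five groups fall through to
# one per row (column size 12) because neither 3 nor 2 divides 5 evenly.
print(partition_rows_sketch(['a', 'b', 'c', 'd']))
print(partition_rows_sketch(['a', 'b', 'c', 'd', 'e']))
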
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter
    # the distance matrices, is not technically necessary because skbio's
    # mantel function will raise an error on mismatches with `strict=True`,
    # and will handle intersection if `strict=False`. However, we need to
    # handle the ID matching explicitly to find *which* IDs are mismatched
    # -- the error message coming from scikit-bio doesn't describe those. We
    # also need to have the mismatched IDs to display as a warning in the
    # viz if `intersect_ids=True`. Finally, the distance matrices are
    # explicitly filtered to matching IDs only because their data are used
    # elsewhere in this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance '
            'matrices. This sometimes occurs when mismatched files are '
            'passed. If this is not the case, you can use `intersect_ids` '
            'to discard these mismatches and apply the Mantel test to only '
            'those IDs that are found in both distance matrices.\n\n%s'
            % ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in
        # both matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), sample_size, permutations,
                        alt_hypothesis, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so
    # we can safely generate all pairs of IDs using one of the matrices' ID
    # sets (it doesn't matter which one).
    scatter_data = []
    for id1, id2 in itertools.combinations(dm1.ids, 2):
        scatter_data.append((dm1[id1, id2], dm2[id1, id2]))

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(
        TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)

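# Hedged usage sketch for the `mantel` visualizer above, on two tiny
# matching distance matrices. The IDs and labels are made up for
# illustration, and the call assumes this module (and its template assets
# under `mantel_assets`) is importable so `q2templates.render` can succeed.

import tempfile

import skbio

dm_a = skbio.DistanceMatrix([[0, 1, 2], [1, 0, 1], [2, 1, 0]],
                            ids=['S1', 'S2', 'S3'])
dm_b = skbio.DistanceMatrix([[0, 2, 4], [2, 0, 2], [4, 2, 0]],
                            ids=['S1', 'S2', 'S3'])

with tempfile.TemporaryDirectory() as out:
    # With identical ID sets, intersect_ids=False passes the mismatch check.
    mantel(out, dm_a, dm_b, method='spearman', permutations=99,
           intersect_ids=False, label1='Matrix A', label2='Matrix B')
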
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the
    # metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this
    # visualizer displays separate warnings for non-categorical columns,
    # and categorical columns that didn't satisfy the requirements of the
    # statistics being computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))

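# Small standalone sketch of the statistics driving the visualizer above: an
# omnibus Kruskal-Wallis test across all groups, then the same test on each
# pair of groups (the q-value correction is demonstrated earlier in this
# document). The toy groups are illustrative only.

import itertools

import scipy.stats

groups = {'gut': [2.1, 2.4, 2.2], 'skin': [3.0, 3.3, 3.1],
          'tongue': [2.6, 2.8, 2.7]}

H, p = scipy.stats.mstats.kruskalwallis(*groups.values())
print('omnibus: H=%.3f p=%.4f' % (H, p))
for (n1, g1), (n2, g2) in itertools.combinations(groups.items(), 2):
    H, p = scipy.stats.mstats.kruskalwallis(g1, g2)
    print('%s vs %s: H=%.3f p=%.4f' % (n1, n2, H, p))
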
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in
        # the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely
        # of missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))

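# Hedged usage sketch for `alpha_rarefaction` above. The toy table is
# constructed so the default constraints are satisfied: steps (10) must not
# exceed max_depth - min_depth, and max_depth must not exceed the largest
# sample total. Assumes this module and its template assets are importable.

import tempfile

import biom
import numpy as np

table = biom.Table(np.array([[30, 40], [25, 35], [20, 30]]),
                   observation_ids=['F1', 'F2', 'F3'],
                   sample_ids=['S1', 'S2'])
# Sample totals are 75 and 105, so max_depth=50 is valid, and
# max_depth - min_depth = 49 >= steps = 10.
with tempfile.TemporaryDirectory() as out:
    alpha_rarefaction(out, table, max_depth=50)
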
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The '
                         'available options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the
    # metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1, join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))

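# `_alpha_correlation_fns` is defined elsewhere in this module; a plausible
# sketch (an assumption, not the canonical definition) maps the method names
# accepted above onto their scipy implementations, each returning a
# (test statistic, p-value) pair as consumed by the jsonp writer.

import scipy.stats

_alpha_correlation_fns_sketch = {'spearman': scipy.stats.spearmanr,
                                 'pearson': scipy.stats.pearsonr}

stat, pval = _alpha_correlation_fns_sketch['spearman']([1, 2, 3, 4],
                                                       [1.2, 2.9, 3.1, 4.5])
print('%1.4f %1.4f' % (stat, pval))
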
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The '
                         'available options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns

    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata '
                         'file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))

def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups and
            # correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise,
                columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)

                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                                      "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                             metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories':
            ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))