def _visualize(output_dir, estimator, cm, importances=None,
               optimize_feature_selection=True, title='results'):
    """Save result tables under ``output_dir`` and render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files and the rendered template.
    estimator : object or None
        Passed to ``_extract_estimator_parameters``; when ``None`` no
        parameter summary is built and ``result`` is ``False``.
    cm : DataFrame-like or None
        When given, written to ``predictive_accuracy.tsv`` and converted to
        HTML; when ``None`` it is passed to the template unchanged.
    importances : DataFrame-like or None
        When given, sorted with ``sort_importances``, written to
        ``feature_importance.tsv`` and converted to HTML; otherwise the
        template receives ``False``.
    optimize_feature_selection : bool
        Forwarded to the template context as-is.
    title : str
        Forwarded to the template context as-is.
    """
    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(output_dir, 'predictive_accuracy.tsv'),
                  sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        # NOTE: this changes the process-wide pandas float format.
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(output_dir, 'feature_importance.tsv'),
                           sep='\t', index=True)
        importances = q2templates.df_to_html(importances, index=True)
    else:
        importances = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'importances': importances,
        'classification': True,
        'optimize_feature_selection': optimize_feature_selection,
        'maturity_index': False})
def _visualize_anova(output_dir, pairwise_tests=False, model_results=False,
                     residuals=False, pairwise_test_name='Pairwise t-tests'):
    """Save ANOVA outputs under ``output_dir`` and render the index template.

    Every optional input uses ``False`` as its "not provided" sentinel. An
    input left at ``False`` is skipped and forwarded to the template
    unchanged.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and the rendered page.
    pairwise_tests : DataFrame-like or False
        Written to ``pairwise_tests.tsv`` and converted to HTML when given.
    model_results : DataFrame-like or False
        Written to ``model_results.tsv`` and converted to HTML when given.
    residuals : figure-like or False
        Object exposing ``savefig``; saved as ``residuals.png`` and
        ``residuals.pdf`` when given.
    pairwise_test_name : str
        Forwarded to the template context as-is.
    """
    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(
            os.path.join(output_dir, 'pairwise_tests.tsv'), sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    # The defaults are False, so guard these the same way as pairwise_tests
    # instead of failing with AttributeError on ``False.to_csv``.
    if model_results is not False:
        model_results.to_csv(
            os.path.join(output_dir, 'model_results.tsv'), sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if residuals is not False:
        residuals.savefig(
            os.path.join(output_dir, 'residuals.png'), bbox_inches='tight')
        residuals.savefig(
            os.path.join(output_dir, 'residuals.pdf'), bbox_inches='tight')
        plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'plot_name': 'ANOVA',
        'model_results': model_results,
        'pairwise_tests': pairwise_tests,
        'residuals': residuals,
        'pairwise_test_name': pairwise_test_name,
    })
def _visualize(output_dir, estimator, cm, roc,
               optimize_feature_selection=True, title='results'):
    """Render the results page for an estimator, its accuracy table and ROC.

    ``estimator`` and ``cm`` may each be ``None``. A missing estimator yields
    ``result=False`` and a missing ``cm`` is forwarded as ``None``. ``roc`` is
    reduced to a flag: ``True`` when something was supplied, ``None``
    otherwise.
    """
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    result = False
    if estimator is not None:
        params = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(params.to_frame())

    predictions = cm
    if cm is not None:
        accuracy_fp = join(output_dir, 'predictive_accuracy.tsv')
        cm.to_csv(accuracy_fp, sep='\t', index=True)
        predictions = q2templates.df_to_html(cm)

    # The template only needs to know whether ROC output exists.
    roc_flag = None if roc is None else True

    context = {
        'title': title,
        'result': result,
        'predictions': predictions,
        'roc': roc_flag,
        'optimize_feature_selection': optimize_feature_selection,
    }
    q2templates.render(join(TEMPLATES, 'index.html'), output_dir,
                       context=context)
def _visualize_maturity_index(table, metadata, group_by, column,
                              predicted_column, importances, estimator,
                              accuracy, output_dir, maz_stats=True):
    """Write maturity-index tables and plots, then render the index template.

    Parameters
    ----------
    table : DataFrame-like
        Indexed by column with the feature IDs found in ``importances.index``
        to build the heatmap input.
    metadata : DataFrame-like
        Must contain ``group_by``, ``column``, ``predicted_column`` and the
        derived ``'<column> maturity'`` / ``'<column> MAZ score'`` columns.
    group_by, column, predicted_column : str
        Metadata column names used for the tables and plots.
    importances : DataFrame-like
        Sorted with ``sort_importances`` and saved as
        ``feature_importance.tsv``.
    estimator : object
        Passed to ``_extract_estimator_parameters``.
    accuracy : object
        Value appended to the parameter summary as ``'Accuracy score'``.
    output_dir : str
        Directory that receives every output file.
    maz_stats : bool
        When true, also writes ``maz_aov.tsv`` and ``maz_pairwise.tsv``.
    """
    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    maturity = '{0} maturity'.format(column)
    maz = '{0} MAZ score'.format(column)

    # save feature importance data and convert to html
    importances = sort_importances(importances)
    importances.to_csv(
        join(output_dir, 'feature_importance.tsv'), index=True, sep='\t')
    importance = q2templates.df_to_html(importances, index=True)

    # save predicted values, maturity, and MAZ score data
    maz_md = metadata[[group_by, column, predicted_column, maturity, maz]]
    maz_md.to_csv(join(output_dir, 'maz_scores.tsv'), sep='\t')
    if maz_stats:
        maz_aov = _two_way_anova(table, metadata, maz, group_by, column)[0]
        maz_aov.to_csv(join(output_dir, 'maz_aov.tsv'), sep='\t')
        maz_pairwise = _pairwise_stats(
            table, metadata, maz, group_by, column)
        maz_pairwise.to_csv(join(output_dir, 'maz_pairwise.tsv'), sep='\t')

    # plot control/treatment predicted vs. actual values
    g = _lmplot_from_dataframe(
        metadata, column, predicted_column, group_by)
    g.savefig(join(output_dir, 'maz_predictions.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_predictions.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot barplots of MAZ score vs. column (e.g., age)
    g = _boxplot_from_dataframe(metadata, column, maz, group_by)
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.png'), bbox_inches='tight')
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot heatmap of column (e.g., age) vs. abundance of top features
    top = table[list(importances.index)]
    g = _clustermap_from_dataframe(top, metadata, group_by, column)
    g.savefig(join(output_dir, 'maz_heatmaps.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_heatmaps.pdf'), bbox_inches='tight')

    result = _extract_estimator_parameters(estimator)
    # The original called ``result.append(...)`` and dropped the return
    # value, so the accuracy row never reached the report. ``Series.append``
    # was also removed in pandas 2. Concatenate and keep the result; reusing
    # the series name keeps the column header from ``to_frame()`` unchanged.
    accuracy_row = pd.Series(
        [accuracy], index=['Accuracy score'], name=result.name)
    result = pd.concat([result, accuracy_row])
    result = q2templates.df_to_html(result.to_frame())

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'maturity index predictions',
        'result': result,
        'predictions': None,
        'importances': importance,
        'classification': False,
        'optimize_feature_selection': True,
        'maturity_index': True})
def test_defaults(self):
    """With no overrides the table has border 0 and the default classes."""
    frame = pd.DataFrame({
        'col1': ['foo', 'bar', 'baz'],
        'col2': [1, 2, 4.2],
    })

    rendered = df_to_html(frame)

    for fragment in ('border="0"', 'table table-striped table-hover'):
        self.assertIn(fragment, rendered)
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    """Tabulate core features over a range of sample fractions.

    For each fraction between ``min_fraction`` and ``max_fraction``
    (``steps`` evenly spaced values, or the single value when both are equal)
    the core features are computed with ``_get_core_features``. Non-empty
    results are written to ``core-features-<rounded fraction>.tsv``. A
    summary table is always rendered; a scatter plot is saved only when more
    than one fraction was evaluated.

    Raises
    ------
    ValueError
        If ``max_fraction`` is smaller than ``min_fraction``.
    """
    if min_fraction > max_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than '
                         'max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }
    index_fp = os.path.join(TEMPLATES, 'index.html')

    if min_fraction != max_fraction:
        fractions = np.linspace(min_fraction, max_fraction, steps)
    else:
        fractions = [min_fraction]
    rounded_fractions = _round_fractions(fractions)

    rows = []
    links = []
    for frac, label in zip(fractions, rounded_fractions):
        core = _get_core_features(table, frac)
        n_core = len(core)
        rows.append([frac, n_core])

        if n_core == 0:
            links.append('No core features')
            continue

        tsv_name = 'core-features-%s.tsv' % label
        tsv_path = os.path.join(output_dir, tsv_name)
        links.append("<a href='./%s'>TSV</a>" % tsv_name)
        core.to_csv(tsv_path, sep='\t', index_label='Feature ID')

    df = pd.DataFrame(rows, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = links

    # newer versions of seaborn don't like dataframes with fewer than two rows
    if len(fractions) > 1:
        ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                         fit_reg=False)
        ax.set_xbound(min(fractions), max(fractions))
        ax.set_ybound(0, max(df['Feature count']) + 1)
        plot_fp = os.path.join(output_dir, 'core-feature-counts.svg')
        ax.get_figure().savefig(plot_fp)
        context['show_plot'] = True

    # escape=False keeps the <a> download links as live markup.
    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)
    q2templates.render(index_fp, output_dir, context=context)
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run scikit-bio's ``bioenv`` on the numeric metadata and render it.

    Metadata columns are coerced to numeric where possible. Non-numeric
    columns, rows containing NaN and zero-variance columns are dropped, and
    the distance matrix is filtered to the surviving sample IDs. The names of
    the dropped columns and the before/after matrix sizes are passed to the
    template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distance matrix to analyse.
    metadata : qiime2.Metadata
        Sample metadata.
    """
    def _numeric_or_unchanged(col):
        # Same result as ``pd.to_numeric(col, errors='ignore')``, which
        # pandas has deprecated: columns that cannot be parsed are returned
        # untouched.
        try:
            return pd.to_numeric(col)
        except (ValueError, TypeError):
            return col

    # convert metadata to numeric values where applicable, drop the
    # non-numeric values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(_numeric_or_unchanged)

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        # Sets have no stable order; sort so the report is deterministic.
        'filtered_categorical_cols':
            ', '.join(sorted(filtered_categorical_cols)),
        'filtered_zero_variance_cols':
            ', '.join(sorted(filtered_zero_variance_cols)),
        'result': result})
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    """Tabulate core features over a range of sample fractions.

    For each fraction between ``min_fraction`` and ``max_fraction``
    (``steps`` evenly spaced values, or the single value when both are equal)
    the core features are computed with ``_get_core_features``. Non-empty
    results are written to ``core-features-<rounded fraction>.tsv`` and a
    summary table is rendered.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files, the plot and the rendered page.
    table : biom.Table
        Feature table; ``shape[0]`` is reported as the feature count and
        ``shape[1]`` as the sample count.
    min_fraction, max_fraction : float
        Inclusive bounds of the fractions to evaluate.
    steps : int
        Number of fractions generated when the bounds differ.

    Raises
    ------
    ValueError
        If ``max_fraction`` is smaller than ``min_fraction``.
    """
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than '
                         'max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    if min_fraction == max_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    rounded_fractions = _round_fractions(fractions)

    data = []
    file_links = []
    for fraction, rounded_fraction in zip(fractions, rounded_fractions):
        core = _get_core_features(table, fraction)
        core_feature_count = len(core)
        data.append([fraction, core_feature_count])

        if core_feature_count > 0:
            core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
            core_feature_fp = os.path.join(output_dir, core_feature_fn)

            file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)

            core.to_csv(core_feature_fp, sep='\t', index_label='Feature ID')
        else:
            file_links.append('No core features')

    df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = file_links

    # Newer seaborn releases are reported to fail on frames with fewer than
    # two rows, and matplotlib warns when both x-bounds are equal, so the
    # plot is only drawn when several fractions were evaluated.
    # NOTE(review): the template should hide the image when ``show_plot`` is
    # false -- confirm the bundled index.html honours this key.
    show_plot = len(fractions) > 1
    if show_plot:
        ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                         fit_reg=False)
        ax.set_xbound(min(fractions), max(fractions))
        ax.set_ybound(0, max(df['Feature count']) + 1)
        ax.get_figure().savefig(
            os.path.join(output_dir, 'core-feature-counts.svg'))
    context['show_plot'] = show_plot

    # escape=False keeps the <a> download links as live markup.
    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)

    q2templates.render(index_fp, output_dir, context=context)
def _visualize(output_dir, estimator, cm, accuracy, importances=None,
               optimize_feature_selection=True, title='results'):
    """Save accuracy/importance tables and render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files and the rendered template.
    estimator : object
        Must expose ``get_params()``; the parameters are shown as a table.
    cm : DataFrame-like
        Written to ``predictive_accuracy.tsv`` and converted to HTML.
    accuracy : object
        Accepted for interface compatibility; not used in this function.
    importances : DataFrame-like or None
        When given, sorted with ``sort_importances``, written to
        ``feature_importance.tsv`` (without index) and converted to HTML.
    optimize_feature_selection : bool
        Forwarded to the template context as-is.
    title : str
        Forwarded to the template context as-is.
    """
    # Need to sort out how to save estimator as sklearn.pipeline
    # This will be possible once qiime2 support pipeline actions

    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    result = pd.Series(estimator.get_params(), name='Parameter setting')
    result = q2templates.df_to_html(result.to_frame())

    cm.to_csv(join(output_dir, 'predictive_accuracy.tsv'),
              sep='\t', index=True)
    cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        # NOTE: this changes the process-wide pandas float format.
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(output_dir, 'feature_importance.tsv'),
                           sep='\t', index=False)
        importances = q2templates.df_to_html(importances, index=False)

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'importances': importances,
        'classification': True,
        'optimize_feature_selection': optimize_feature_selection,
        'maturity_index': False
    })
def _visualize_knn(output_dir, params: pd.Series):
    """Render the estimator-summary page from a series of parameters.

    Only the parameter table is populated; predictions and importances are
    passed to the template as ``None``.
    """
    params_html = q2templates.df_to_html(params.to_frame())
    context = {
        'title': 'Estimator Summary',
        'result': params_html,
        'predictions': None,
        'importances': None,
        'classification': True,
        'optimize_feature_selection': False,
    }
    template_fp = join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
def test_no_truncation(self):
    """A 300-character cell must appear in full in the rendered HTML."""
    long_cell = 'baz' * 100
    frame = pd.DataFrame({'col': ['foo', 'bar', long_cell]})

    rendered = df_to_html(frame)

    for expected in ('col', 'foo', 'bar', long_cell):
        self.assertIn(expected, rendered)
def _get_html(output_dir, datafiles):
    """Load each listed TSV from ``output_dir`` and convert it to HTML.

    ``datafiles`` is a two-level mapping ``{direction: {stats_type:
    filename}}``. The returned mapping has the same shape with each filename
    replaced by the HTML table built from that file.
    """
    html = {}
    for direction, files_by_type in datafiles.items():
        tables = html[direction] = {}
        for stats_type, filename in files_by_type.items():
            frame = pd.read_csv(os.path.join(output_dir, filename), sep='\t')
            tables[stats_type] = q2templates.df_to_html(frame, index=False)
    return html
def summarize_Qiita_metadata_category_and_contexts(
        output_dir: str = None, category: str = 'sample_type'):
    """Render per-category counts and contexts from ``_fetch_Qiita_summaries``.

    The counts are reshaped into a two-column frame (``category`` and
    ``'count'``) before rendering. The second value returned by the fetch
    helper is rendered as the contexts table.
    """
    raw_counts, caches = _fetch_Qiita_summaries(category=category)

    # Promote the index to a regular column so it renders as table data.
    counts_frame = raw_counts.to_frame()
    tidy_counts = DataFrame(
        {category: counts_frame.index,
         'count': counts_frame.values.T[0]},
        columns=[category, 'count'])

    sample_types = q2templates.df_to_html(tidy_counts, bold_rows=False,
                                          index=False)
    contexts = q2templates.df_to_html(caches, index=False)

    template_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context={
        'title': 'Available in Qiita',
        'sample_types': sample_types,
        'contexts': contexts
    })
def mapviz(output_dir, results=None, title='Coordinates'):
    """Render the index template, saving ``results`` as a TSV when given.

    When ``results`` is ``None`` nothing is written and the template receives
    ``False`` in its place.
    """
    results_html = False
    if results is not None:
        tsv_fp = join(output_dir, 'results.tsv')
        results.to_csv(tsv_fp, sep='\t', index=True)
        results_html = q2templates.df_to_html(results)

    template_fp = join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context={
        'results': results_html,
        'title': title})
def _build_seq_len_table(qscores: pd.DataFrame) -> str:
    """Return an HTML table summarising per-row sequence lengths.

    A row's length is its number of non-null entries in ``qscores``. Every
    statistic returned by ``_compute_stats_of_df`` except ``'count'`` is
    truncated to an integer and suffixed with ``' nts'``.
    """
    sequence_lengths = qscores.notnull().sum(axis=1).copy()
    stats = _compute_stats_of_df(sequence_lengths)

    # 'count' is a number of sequences, not a length, so it stays unformatted.
    is_length_stat = stats.index != 'count'
    formatted = stats[is_length_stat].astype(int).apply('{} nts'.format)
    stats[is_length_stat] = formatted

    relabel = {'50%': '50% (Median)', 'count': 'Total Sequences Sampled'}
    stats.rename(index=relabel, inplace=True)

    return q2templates.df_to_html(stats.to_frame(name=""))
def seq_depth(output_dir: str, table: pd.DataFrame,
              metadata: qiime2.Metadata, mypar: float = 4) -> None:
    """Run the bundled ``seq_depth.R`` script and render its outputs.

    The table and metadata are written into ``output_dir`` and handed to
    ``Rscript``. Afterwards ``warnings.txt`` and ``mytable.tsv`` are read
    back from ``output_dir`` and passed to the index template.

    Parameters
    ----------
    output_dir : str
        Working and output directory shared with the R script.
    table : pd.DataFrame
        Written with ``DataFrame.to_csv`` defaults to ``table.tsv``.
    metadata : qiime2.Metadata
        Saved to ``metadata.tsv``.
    mypar : float
        Accepted for interface compatibility; not used in this function.

    Raises
    ------
    subprocess.CalledProcessError
        If the R script exits with a non-zero status.
    """
    table_path = os.path.join(output_dir, 'table.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')

    table.to_csv(table_path)
    metadata.save(metadata_path)

    # List form (no shell) keeps paths with spaces or metacharacters intact.
    cmd = ['Rscript',
           os.path.join(TEMPLATES, 'seq_depth.R'),
           str(output_dir),
           str(table_path),
           str(metadata_path)]
    subprocess.run(cmd, check=True)

    index = os.path.join(TEMPLATES, 'index.html')

    # Errors filepath, load in as list (one entry per line, newline kept)
    errors_fp = os.path.join(output_dir, 'warnings.txt')
    with open(errors_fp, 'r') as errors_f:
        errors = errors_f.readlines()

    # Load in depths as a pandas data frame, then transfer to html
    depths = pd.read_csv(os.path.join(output_dir, 'mytable.tsv'), sep="\t")
    depths = q2templates.df_to_html(depths)

    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': None,
        'model_summary': None,
        'model_results': depths,
        'multiple_group_test': None,
        'pairwise_tests': None,
        'paired_difference_tests': None,
        'plot': True,
        'plot_name': "My Plot",
        'raw_data': None,
        'pairwise_test_name': None,
    })
def summarize(output_dir: str, problem: zarr.hierarchy.Group):
    """Write the fixed-lambda refit coefficients and render the report pages.

    Reads ``'label'`` and ``'solution/LAMfixed/refit'`` from ``problem``,
    saves them to ``beta.csv``, renders the three asset templates, copies the
    ``dist`` asset directory and writes ``data.jsonp`` for the front end.
    """
    print(TEMPLATES)

    beta = pd.DataFrame(data={
        'label': problem['label'],
        'beta': problem['solution/LAMfixed/refit'],
    })
    beta.to_csv(os.path.join(output_dir, 'beta.csv'),
                header=True, index=False)

    # Plotting is switched off; the branch below is a placeholder example.
    show_plot = False
    if show_plot:
        xs = np.linspace(0, 1)
        figure = plt.figure()
        axes = figure.add_subplot(1, 1, 1)
        axes.plot(xs, xs**2, color='blue')
        figure.savefig(os.path.join(output_dir, 'test-plot.png'))

    beta_html = q2templates.df_to_html(beta, index=False)

    context = {
        'dico': {'un': 1, 'deux': 2},
        'result': beta_html,
        'n_features': len(beta),
        'beta': beta,
        'show_plot': show_plot,
        'tabs': [{'title': 'Overview', 'url': 'overview.html'},
                 {'title': 'LAM fixed', 'url': 'lam-fixed.html'}],
        'dangers': [],
        'warnings': [],
    }

    templates = [
        os.path.join(TEMPLATES, 'assets', template_name)
        for template_name in ('index.html', 'overview.html',
                              'quality-plot.html')
    ]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # JSONP payload: app.init(<settings>,<beta as JSON>);
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump({'selected param': 10}, fh)
        fh.write(',')
        beta.to_json(fh)
        fh.write(');')
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               errors=False, model_summary=False, model_results=False,
               raw_data=False, plot_name='Pairwise difference boxplot',
               pairwise_test_name='Pairwise group comparison tests'):
    """Save whichever results were supplied and render the index template.

    Every optional input uses ``False`` as its "not provided" sentinel. An
    input left at ``False`` is skipped and forwarded to the template
    unchanged.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and the rendered page.
    multiple_group_test : Series-like or False
        Transposed into a single-row table and converted to HTML.
    pairwise_tests, paired_difference_tests, model_summary, model_results :
            DataFrame-like or False
        Each is written to a same-named ``.tsv`` file and converted to HTML.
    plot : figure-like or False
        Object exposing ``savefig``; saved as ``plot.png`` and ``plot.pdf``.
    summary : Series-like or False
        Converted to HTML via ``to_frame()``.
    errors : object
        Forwarded to the template context as-is.
    raw_data : DataFrame-like or False
        Written to ``raw-data.tsv``; the template only receives ``True``.
    plot_name, pairwise_test_name : str
        Forwarded to the template context as-is.
    """
    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = q2templates.df_to_html(summary.to_frame())

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = q2templates.df_to_html(multiple_group_test)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    if raw_data is not False:
        raw_data.to_csv(os.path.join(output_dir, 'raw-data.tsv'), sep='\t')
        # Only a flag is passed on; the data itself is offered as a file.
        raw_data = True

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(os.path.join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = q2templates.df_to_html(
            paired_difference_tests)

    if model_summary is not False:
        model_summary.to_csv(os.path.join(output_dir, 'model_summary.tsv'),
                             sep='\t')
        model_summary = q2templates.df_to_html(model_summary)

    if model_results is not False:
        model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                             sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if plot is not False:
        plot.savefig(os.path.join(output_dir, 'plot.png'),
                     bbox_inches='tight')
        plot.savefig(os.path.join(output_dir, 'plot.pdf'),
                     bbox_inches='tight')
        plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
        'plot_name': plot_name,
        'raw_data': raw_data,
        'pairwise_test_name': pairwise_test_name,
    })
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run scikit-bio's ``bioenv`` on the numeric metadata and render it.

    Filtering order: restrict the metadata to the distance-matrix IDs, keep
    numeric columns, drop samples with any missing value, then drop
    zero-variance and all-missing columns. The distance matrix is filtered to
    the surviving samples before the analysis runs.
    """
    # Restrict to the IDs in the distance matrix; this also ensures every
    # distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # Drop non-numeric columns and record which ones were removed.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = columns_before - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    complete_samples = metadata.to_dataframe().dropna()
    metadata = qiime2.Metadata(complete_samples)

    # Drop zero-variance and empty columns and record which ones were removed.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = columns_before - set(metadata.columns)

    df = metadata.to_dataframe()

    # Remember the matrix size before and after filtering so the report can
    # tell the user how many samples survived.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    bioenv_table = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(bioenv_table)

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result
    }
    template_fp = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
def visualize_in_qzv(core, cfg, output_dir):
    """Visualize in qzv.

    Renders the user inputs and the core-microbiome results into the
    ``coremic_assets`` index template, and writes both tables next to it as
    downloadable TSV files named from ``cfg['outputfile']``.

    Parameters
    ----------
    core : object
        Passed to ``format_results_qzv`` to obtain the results table.
    cfg : mapping
        Must contain ``'outputfile'``; also passed to ``format_inputs_qzv``.
    output_dir : str
        Directory that receives the TSV files and the rendered page.

    Returns
    -------
    None
    """
    TEMPLATES = pkg_resources.resource_filename('q2_coremicrobiome',
                                                'coremic_assets')
    outputfile = cfg['outputfile']

    # data
    inppstr = format_inputs_qzv(cfg)
    usr_inputs = q2templates.df_to_html(inppstr, index=False)
    # downloadable file
    inputs_path = os.path.join(output_dir, outputfile + 'InputParams.tsv')
    inppstr.to_csv(inputs_path, sep='\t', index=False)

    # results
    outpstr = format_results_qzv(core)
    if len(outpstr.index) == 0:
        # Adjacent literals replace the backslash-continued string, which
        # embedded the source file's indentation in the emitted HTML.
        results = (
            "<b>No core microbes found.</b> <br> "
            "<i>Please try relaxing the following:</i> <br> "
            "--p-min-frac Minimum fractional presence in the interest group"
            " <br> "
            "--p-max-p pvalue cutoff <br> "
            "<i>or changing the:</i> <br> "
            "normalization using --p-make-relative and/or"
            " --p-quantile-normalize <br> "
            "method for multiple testing correction using --p-p-val-adj"
        )
    else:
        results = q2templates.df_to_html(outpstr, index=False)
    # downloadable file
    # NOTE(review): the results TSV is written in both branches (an empty
    # table gives a header-only file) -- confirm this matches the intent.
    results_path = os.path.join(output_dir, outputfile + 'Results.tsv')
    outpstr.to_csv(results_path, sep='\t', index=False)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'outputfile': outputfile,
        'usr_inputs': usr_inputs,
        'results': results
    })
    return None
def kNN_LOOCV_F_measures(output_dir: str, nearest_neighbors: dict,
                         class_weight: DataFrame):
    """Render the ``_loocv`` scores obtained with and without class weights.

    ``_loocv`` is called once with its fourth argument set to ``True`` (the
    "Uniform" row) and once without it (the "Weighted" row). The table also
    reports weighted minus uniform.
    """
    taxonomies = nearest_neighbors['taxonomies']
    neighbors = nearest_neighbors['neighbors']
    weights = class_weight.T['Weight'].to_dict()

    uniform = _loocv(taxonomies, neighbors, weights, True)
    bespoke = _loocv(taxonomies, neighbors, weights)

    scores = DataFrame(
        {'F-measure': [bespoke, uniform, bespoke - uniform]},
        index=['Weighted', 'Uniform', 'Difference'])
    scores_html = q2templates.df_to_html(scores)

    template_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context={
        'title': 'Indicators of Taxonomic Weight Importance',
        'f_measures': scores_html,
    })
def summarize_selections(output_dir: str, selections: IDSelection):
    """Render the selection summary table and save it as ``table.tsv``."""
    summary = _build_summary_table(selections)
    summary_html = q2templates.df_to_html(summary, index=False)

    tsv_name = 'table.tsv'
    # Not using qiime2.Metadata b/c we don't have a meaningful ID col
    tsv_path = os.path.join(output_dir, tsv_name)
    summary.to_csv(tsv_path, sep='\t', encoding='utf-8')

    q2templates.render(
        SUMMARY_TEMPLATE,
        output_dir,
        context={'table': summary_html, 'table_fn': tsv_name},
    )
def test_defaults_override(self):
    """Explicit border/classes/index arguments replace the defaults."""
    row_ids = ['id1', 'id2', 'id3']
    frame = pd.DataFrame(
        {'col1': ['foo', 'bar', 'baz'], 'col2': [1, 2, 4.2]},
        index=row_ids)

    rendered = df_to_html(frame, border=1, classes=('class1', 'class2'),
                          index=False)

    self.assertIn('border="1"', rendered)
    self.assertIn('class1 class2', rendered)
    # index=False must keep the row labels out of the markup.
    for row_id in row_ids:
        self.assertNotIn(row_id, rendered)
def _visualize(output_dir, results, plot):
    """Save the evaluation table and plot, then render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and the rendered page.
    results : DataFrame-like
        Written to ``evaluate_seqs_results.tsv`` and converted to HTML.
    plot : figure-like
        Object exposing ``savefig``; saved as ``evaluate_seqs.png`` and
        ``evaluate_seqs.pdf``.
    """
    # None disables column truncation. The former sentinel -1 was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)

    # save results
    results.to_csv(join(output_dir, 'evaluate_seqs_results.tsv'), sep='\t')
    results = q2templates.df_to_html(results, index=True)

    plot.savefig(join(output_dir, 'evaluate_seqs.png'), bbox_inches='tight')
    plot.savefig(join(output_dir, 'evaluate_seqs.pdf'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Sequence Evaluation Results',
        'running_title': 'evaluate_seqs',
        'results': results,
    })
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str, permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run ``run_adonis.R`` on the distance matrix and render the results.

    The metadata is reindexed to the distance-matrix ID order. Every column
    named in the right-hand side of ``formula`` must exist and contain no
    missing values. The R script writes ``adonis.tsv`` into ``output_dir``,
    which is then read back and rendered.

    Raises
    ------
    ValueError
        If a column used in the formula has missing values.
    """
    # Validate sample metadata is superset et cetera
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(set(metadata.ids), set(dm_ids))

    # filter ids. ids must be in same order as dm
    ordered_md = metadata.to_dataframe().reindex(dm_ids)
    ordered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(ordered_md)

    # Validate formula
    model = ModelDesc.from_formula(formula)
    for term in model.rhs_termlist:
        for factor in term.factors:
            column = metadata.get_column(factor.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)' %
                    (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as scratch_dir:
        dm_fp = os.path.join(scratch_dir, 'dm.tsv')
        md_fp = os.path.join(scratch_dir, 'md.tsv')
        distance_matrix.write(dm_fp)
        metadata.save(md_fp)

        # The script must run while the temporary inputs still exist.
        _run_command(['run_adonis.R', dm_fp, md_fp, formula,
                      str(permutations), str(n_jobs), results_fp])

    # Visualize results
    results_html = q2templates.df_to_html(pd.read_csv(results_fp, sep='\t'))
    template_fp = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(template_fp, output_dir,
                       context={'results': results_html})
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run scikit-bio's ``bioenv`` on the numeric metadata and render it.

    Filtering order: restrict the metadata to the distance-matrix IDs, keep
    numeric columns, drop zero-variance and all-missing columns, then drop
    samples with any missing value. The distance matrix is filtered to the
    surviving samples before the analysis runs.
    """
    # Restrict to the IDs in the distance matrix; this also ensures every
    # distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # Drop non-numeric columns and record which ones were removed.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = columns_before - set(metadata.columns)

    # Drop zero-variance and empty columns and record which ones were removed.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = columns_before - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe().dropna(axis='index', how='any')

    # Remember the matrix size before and after filtering so the report can
    # tell the user how many samples survived.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    bioenv_table = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(bioenv_table)

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result}
    template_fp = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str, permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run ``run_adonis.R`` on the distance matrix and render the results.

    Parameters
    ----------
    output_dir : str
        Directory that receives ``adonis.tsv`` and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distance matrix handed to the R script.
    metadata : qiime2.Metadata
        Sample metadata; reindexed to the distance-matrix ID order.
    formula : str
        Model formula; every right-hand-side factor must name a metadata
        column.
    permutations : int
        Forwarded to the R script as a string.
    n_jobs : int
        Forwarded to the R script as a string. (The annotation previously
        read ``str`` although the default is the integer ``1``.)

    Raises
    ------
    ValueError
        If a column used in the formula has missing values.
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            # get_column is the existence check for the referenced column.
            column = metadata.get_column(i.name())
            # Reject missing values up front with a clear message instead of
            # leaving them for the R script to deal with.
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)' %
                    (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        # The script must run while the temporary inputs still exist.
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
    """Summarise per-sample read counts and subsampled quality scores.

    Counts the sequences in each forward FASTQ listed in the manifest
    (reverse files are used only when no forward files exist), writes
    ``per-sample-fastq-counts.csv``, subsamples up to ``n`` reads for the
    quality statistics, renders the report templates and writes
    ``data.jsonp`` for the interactive plot.
    """
    paired = data.paired
    data = data.directory_format
    dangers = []
    warnings = []

    data_dir = str(data)
    manifest = pd.read_csv(os.path.join(data_dir, data.manifest.pathspec),
                           header=0, comment='#')
    # Turn the manifest's filenames into paths under the data directory.
    manifest.filename = manifest.filename.apply(
        lambda name: os.path.join(data_dir, name))

    fwd = manifest[manifest.direction == 'forward'].filename.tolist()
    rev = manifest[manifest.direction == 'reverse'].filename.tolist()

    # Fall back to the reverse reads only when there are no forward reads.
    reads = rev if not fwd and rev else fwd

    per_sample_fastq_counts = {}
    file_records = []
    for fastq_fp in reads:
        seq_count = sum(1 for _ in _read_fastq_seqs(fastq_fp))
        matches = manifest.loc[manifest.filename == fastq_fp, 'sample-id']
        sample_id = matches.iloc[0]
        per_sample_fastq_counts[sample_id] = seq_count
        file_records.append((fastq_fp, sample_id))

    result = pd.Series(per_sample_fastq_counts)
    result.name = 'Sequence count'
    result.index.name = 'Sample name'
    result.sort_values(inplace=True, ascending=False)
    result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
                  header=True, index=True)

    sequence_count = result.sum()
    if n > sequence_count:
        n = sequence_count
        warnings.append('A subsample value was provided that is greater than '
                        'the amount of sequences across all samples. The plot '
                        'was generated using all available sequences.')

    subsample_ns = sorted(random.sample(range(sequence_count), n))
    link = _link_sample_n_to_file(file_records,
                                  per_sample_fastq_counts,
                                  subsample_ns)

    if paired:
        # Pair each forward file with the reverse file at the same position.
        sample_map = [(fp, rev[fwd.index(fp)], link[fp]) for fp in link]
        quality_scores, min_seq_len = _subsample_paired(sample_map)
    else:
        sample_map = [(fp, link[fp]) for fp in link]
        quality_scores, min_seq_len = _subsample_single(sample_map)

    forward_scores = pd.DataFrame(quality_scores['forward'])
    forward_stats = _compute_stats_of_df(forward_scores)

    median_out_of_range = (forward_stats.loc['50%'] > 45).any()
    if median_out_of_range:
        dangers.append('Some of the PHRED quality values are out of range. '
                       'This is likely because an incorrect PHRED offset '
                       'was chosen on import of your raw data. You can learn '
                       'how to choose your PHRED offset during import in the '
                       'importing tutorial.')

    if paired:
        reverse_scores = pd.DataFrame(quality_scores['reverse'])
        reverse_stats = _compute_stats_of_df(reverse_scores)

    show_plot = len(fwd) > 1
    if show_plot:
        ax = sns.distplot(result, kde=False)
        ax.set_xlabel('Number of sequences')
        ax.set_ylabel('Frequency')
        fig = ax.get_figure()
        for extension in ('png', 'pdf'):
            fig.savefig(os.path.join(
                output_dir, 'demultiplex-summary.' + extension))

    html = q2templates.df_to_html(result.to_frame())

    assets_dir = os.path.join(TEMPLATES, 'assets')
    index = os.path.join(assets_dir, 'index.html')
    overview_template = os.path.join(assets_dir, 'overview.html')
    quality_template = os.path.join(assets_dir, 'quality-plot.html')

    result_data = {
        'min': result.min(),
        'median': result.median(),
        'mean': result.mean(),
        'max': result.max(),
        'sum': sequence_count
    }
    tabs = [
        {'title': 'Overview', 'url': 'overview.html'},
        {'title': 'Interactive Quality Plot', 'url': 'quality-plot.html'},
    ]
    context = {
        'result_data': result_data,
        'result': html,
        'show_plot': show_plot,
        'paired': paired,
        'tabs': tabs,
        'dangers': dangers,
        'warnings': warnings,
    }
    q2templates.render([index, overview_template, quality_template],
                       output_dir, context=context)

    shutil.copytree(os.path.join(assets_dir, 'dist'),
                    os.path.join(output_dir, 'dist'))

    # JSONP payload: app.init(<settings>,<forward stats>[,<reverse stats>]);
    settings = {
        'n': int(n),
        'totalSeqCount': int(sequence_count),
        'minSeqLen': min_seq_len
    }
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump(settings, fh)
        fh.write(',')
        forward_stats.to_json(fh)
        if paired:
            fh.write(',')
            reverse_stats.to_json(fh)
        fh.write(');')
def build_context(output_dir, problem):
    """Export the problem's data/solutions and build the template context.

    Writes the feature, sample and constraint tables as CSV files and, for
    every model-selection mode enabled in ``problem['model_selection']``
    (``PATH``, ``CV``, ``StabSel``, ``LAMfixed``), writes the matching CSV
    files and plots and adds a tab plus a parameter dictionary to the
    returned context.

    Parameters
    ----------
    output_dir : str
        Directory receiving the CSV files and plots.
    problem : mapping
        Hierarchical store indexed by keys such as ``'data/X'`` or
        ``'solution/CV/refit'`` whose groups expose ``.attrs``.
        NOTE(review): looks like a zarr/h5py-style group -- confirm.

    Returns
    -------
    dict
        Context for the templates. The ``path``, ``cv``, ``stabsel`` and
        ``lam`` flags tell which optional sections were produced.
    """
    labels = problem['label']

    features = pd.DataFrame(problem['data/X'], columns=labels)
    y = pd.DataFrame({'y': problem['data/y']})
    c = pd.DataFrame(problem['data/C'], columns=labels)
    features.to_csv(os.path.join(output_dir, 'features.csv'),
                    header=True, index=False)
    y.to_csv(os.path.join(output_dir, 'samples.csv'),
             header=True, index=False)
    c.to_csv(os.path.join(output_dir, 'constraints.csv'),
             header=True, index=False)

    context = {
        'path': False,
        'cv': False,
        'stabsel': False,
        'lam': False,
        'labels': labels,
        'tabs': [{
            'title': 'Overview',
            'url': 'overview.html'
        }]
    }

    dico = {
        'formulation': name_formulation(
            problem['formulation'].attrs.asdict(), output_dir),
        'concomitant': problem['formulation'].attrs['concomitant'],
        'n': len(problem['data/X']),
        'd': len(problem['data/X'][0]),
        'k': len(problem['data/C'])
    }
    context['dico'] = dico

    dico_ms = problem['model_selection'].attrs.asdict()

    if dico_ms['PATH']:
        context['path'] = True
        context['tabs'].append({'title': 'Lambda-path', 'url': 'path.html'})
        dico_path = {
            **problem['model_selection/PATHparameters'].attrs.asdict(),
            **problem['solution/PATH'].attrs.asdict()
        }
        dico_path['lambdas'] = problem['solution/PATH/LAMBDAS']
        dico_path['lamin'] = min(dico_path['lambdas'])
        dico_path['Nlam'] = len(dico_path['lambdas'])

        # Read and convert the coefficient path once; it feeds both the CSV
        # export and the plot.
        betas = np.array(problem['solution/PATH/BETAS'])
        data = pd.DataFrame(betas,
                            index=dico_path['lambdas'],
                            columns=labels)
        data.to_csv(os.path.join(output_dir, 'path.csv'),
                    header=True, index=True)

        # SIGMAS is only read for concomitant formulations.
        SIGMAS = None
        if dico['concomitant']:
            SIGMAS = problem['solution/PATH/SIGMAS']
        context['dicopath'] = dico_path
        plot_path(betas, SIGMAS, problem['solution/PATH/LAMBDAS'],
                  output_dir, labels)

    if dico_ms['CV']:
        context['cv'] = True
        xGraph = problem['solution/CV/xGraph']
        yGraph = problem['solution/CV/yGraph']
        standard_error = problem['solution/CV/standard_error']
        context['tabs'].append({'title': 'Cross-Validation',
                                'url': 'cv.html'})
        dico_cv = {
            **problem['model_selection/CVparameters'].attrs.asdict(),
            **problem['solution/CV'].attrs.asdict()
        }
        dico_cv['lamin'] = min(xGraph)
        dico_cv['Nlam'] = len(xGraph)

        beta = pd.DataFrame(data={
            'label': labels,
            'beta': problem['solution/CV/refit']
        })
        beta.to_csv(os.path.join(output_dir, 'CV-beta.csv'),
                    header=True, index=False)

        # selected_param is used as a row mask: only the selected
        # coefficients are shown in the HTML table.
        selected_param = np.array(problem['solution/CV/selected_param'])
        beta_support = beta[selected_param]
        dico_cv['htmlbeta'] = q2templates.df_to_html(beta_support,
                                                     index=False)
        context['dicocv'] = dico_cv

        # Report whichever lambda the CV rule actually selected.
        lam = dico_cv['lambda_1SE'] if dico_cv['oneSE'] \
            else dico_cv['lambda_min']
        plot_beta(
            np.array(problem['solution/CV/refit']), selected_param,
            output_dir, labels, 'cv-refit.png',
            r"Refitted coefficients of $\beta$ after CV model selection finds $\lambda$ = "
            + str(lam))
        plot_cv(xGraph, yGraph, dico_cv['index_1SE'], dico_cv['index_min'],
                standard_error, output_dir, 'cv-graph.png')

    if dico_ms['StabSel']:
        context['stabsel'] = True
        context['tabs'].append({
            'title': 'Stability Selection',
            'url': 'stabsel.html'
        })
        dico_stabsel = {
            **problem['model_selection/StabSelparameters'].attrs.asdict(),
            **problem['solution/StabSel'].attrs.asdict()
        }

        stability = pd.DataFrame(
            data={
                'label': labels,
                'stability-probability':
                    problem['solution/StabSel/distribution']
            })
        stability.to_csv(os.path.join(output_dir, 'StabSel-prob.csv'),
                         header=True, index=False)

        selected_param = np.array(
            problem['solution/StabSel/selected_param'])
        stability_support = stability[selected_param]
        dico_stabsel['nsel'] = len(stability_support)
        dico_stabsel['htmlstab'] = q2templates.df_to_html(
            stability_support, index=False)
        context['dicostabsel'] = dico_stabsel

        plot_beta(
            np.array(problem['solution/StabSel/refit']), selected_param,
            output_dir, labels, 'stabsel-refit.png',
            r"Refitted coefficients of $\beta$ after stability selection")
        plot_stability(problem['solution/StabSel/distribution'],
                       selected_param, dico_stabsel['threshold'],
                       dico_stabsel['method'], labels, output_dir,
                       'stabsel-graph.png')

    if dico_ms['LAMfixed']:
        context['lam'] = True
        context['tabs'].append({'title': 'LAM fixed',
                                'url': 'lam-fixed.html'})
        dico_lam = {
            **problem['model_selection/LAMfixedparameters'].attrs.asdict(),
            **problem['solution/LAMfixed'].attrs.asdict()
        }
        dico_lam['lamtype'] = problem[
            'model_selection/LAMfixedparameters'].attrs['lam']

        beta = pd.DataFrame(data={
            'label': labels,
            'beta': problem['solution/LAMfixed/refit']
        })
        beta.to_csv(os.path.join(output_dir, 'LAM-beta.csv'),
                    header=True, index=False)

        selected_param = np.array(
            problem['solution/LAMfixed/selected_param'])
        beta_support = beta[selected_param]
        dico_lam['htmlbeta'] = q2templates.df_to_html(beta_support,
                                                      index=False)
        context['dicolam'] = dico_lam

        plot_beta(
            np.array(problem['solution/LAMfixed/beta']), None, output_dir,
            labels, 'lam-beta.png',
            r"Coefficients of $\beta$ at $\lambda$ = "
            + str(dico_lam['lam']))
        # Title previously read "Reffited"; spelling corrected to match the
        # CV and stability-selection plot titles.
        plot_beta(
            np.array(problem['solution/LAMfixed/refit']), selected_param,
            output_dir, labels, 'lam-refit.png',
            r"Refitted coefficients of $\beta$ at $\lambda$ = "
            + str(dico_lam['lam']))

    return context
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    """Write the feature-table summary visualization into ``output_dir``.

    Produces the sample and feature frequency histograms (only when there is
    more than one sample / feature), the frequency detail CSV files, the
    rendered HTML templates, the ``dist`` assets and the ``data.jsonp``
    payload consumed by the interactive sample-detail page.

    Parameters
    ----------
    output_dir : str
        Directory receiving every generated file.
    table : biom.Table
        Table to summarize.
    sample_metadata : qiime2.Metadata, optional
        When given, it is filtered to the table's samples and embedded in
        ``data.jsonp``; otherwise an empty object is embedded.
    """
    number_of_features, number_of_samples = table.shape
    assets = os.path.join(TEMPLATES, 'summarize_assets')

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Pick the histogram bin count, never going below 5 bins.
        iqr = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if iqr != 0.0:
            # Freedman–Diaconis rule
            bin_width = (2 * iqr) / (number_of_samples ** (1/3))
            frequency_range = (sample_summary['Maximum frequency'] -
                               sample_summary['Minimum frequency'])
            bins = max(frequency_range / bin_width, 5)
        else:
            bins = 5

        sample_ax = sns.distplot(sample_frequencies, kde=False, rug=True,
                                 bins=int(round(bins)))
        thousands = matplotlib.ticker.FuncFormatter(
            lambda x, p: format(int(x), ','))
        sample_ax.get_xaxis().set_major_formatter(thousands)
        sample_ax.set_xlabel('Frequency per sample')
        sample_ax.set_ylabel('Number of samples')
        sample_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        # Start the feature histogram from an empty figure.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_ax = sns.distplot(feature_frequencies, kde=False, rug=False)
        feature_ax.set_xlabel('Frequency per feature')
        feature_ax.set_ylabel('Number of features')
        feature_ax.set_xscale('log')
        feature_ax.set_yscale('log')
        feature_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_frame = sample_summary.apply(
        '{:,}'.format).to_frame('Frequency')
    sample_summary_table = q2templates.df_to_html(sample_summary_frame)
    feature_summary_frame = feature_summary.apply(
        '{:,}'.format).to_frame('Frequency')
    feature_summary_table = q2templates.df_to_html(feature_summary_frame)

    # Total is taken before the series are re-ordered below.
    total_frequencies = int(np.sum(sample_frequencies))

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # From here on feature_frequencies is a display table of formatted
    # strings, not the numeric series.
    feature_frequencies = feature_frequencies.astype(int)
    feature_frequencies = feature_frequencies.apply('{:,}'.format)
    feature_frequencies = feature_frequencies.to_frame('Frequency')
    observed_in = pd.Series(feature_qualitative_data).astype(int)
    feature_frequencies['# of Samples Observed In'] = observed_in.apply(
        '{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    index = os.path.join(assets, 'index.html')
    overview_template = os.path.join(assets, 'overview.html')
    sample_frequency_template = os.path.join(
        assets, 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        assets, 'feature-frequency-detail.html')

    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': total_frequencies,
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
        'max_count': sample_frequencies.max(),
        'feature_frequencies_table': feature_frequencies_table,
        'feature_qualitative_data': feature_qualitative_data,
        'tabs': [{'url': 'overview.html',
                  'title': 'Overview'},
                 {'url': 'sample-frequency-detail.html',
                  'title': 'Interactive Sample Detail'},
                 {'url': 'feature-frequency-detail.html',
                  'title': 'Feature Detail'}],
    }
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(assets, 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a call to app.init(<metadata>, <sample frequencies>);
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if not sample_metadata:
            fh.write('{}')
        else:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the
            # future.
            sample_metadata.to_dataframe().to_json(fh)
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Run a two-sided Mantel test and render the result visualization.

    Writes ``mantel-scatter.svg`` (pairwise distances of ``dm1`` against
    ``dm2``) and the rendered index template into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving the scatter plot and rendered template.
    dm1, dm2 : skbio.DistanceMatrix
        Distance matrices to compare.
    method : str
        Correlation method; ``'spearman'`` or ``'pearson'``.
    permutations : int
        Number of permutations passed to the Mantel test.
    intersect_ids : bool
        When true, IDs missing from either matrix are discarded (and
        reported in the visualization) instead of raising.
    label1, label2 : str
        Axis labels for ``dm1`` and ``dm2`` in the scatter plot.

    Raises
    ------
    ValueError
        If the ID sets differ and ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s'
            % ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), sample_size, permutations,
                        alt_hypothesis, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = [(dm1[id1, id2], dm2[id1, id2])
                    for id1, id2 in itertools.combinations(dm1.ids, 2)]

    fig = plt.figure()
    try:
        x = 'Pairwise Distance (%s)' % label1
        y = 'Pairwise Distance (%s)' % label2
        scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
        plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))
    finally:
        # Release the figure so repeated calls in one process do not pile up
        # open figures.
        plt.close(fig)

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(
        TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
def mantel(output_dir: str,
           dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix,
           method: str = 'spearman',
           permutations: int = 999,
           intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Compare two distance matrices with a two-sided Mantel test.

    The test result table and a scatter plot of the paired distances
    (``mantel-scatter.svg``) are written to ``output_dir`` together with the
    rendered index template.

    Parameters
    ----------
    output_dir : str
        Destination directory for the plot and the rendered template.
    dm1, dm2 : skbio.DistanceMatrix
        The matrices being compared.
    method : str
        ``'spearman'`` or ``'pearson'``.
    permutations : int
        Permutation count forwarded to the Mantel test.
    intersect_ids : bool
        If false, any ID present in only one matrix is an error; if true,
        such IDs are dropped and passed to the template for display.
    label1, label2 : str
        Names used in the scatter plot axis labels.

    Raises
    ------
    ValueError
        When the matrices' ID sets differ and ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s' %
            ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(dm1,
                                                    dm2,
                                                    method=method,
                                                    permutations=permutations,
                                                    alternative=alt_hypothesis,
                                                    strict=True)

    result = pd.Series(
        [method.title(), sample_size, permutations, alt_hypothesis, r, p],
        index=[
            'Method', 'Sample size', 'Permutations', 'Alternative hypothesis',
            '%s %s' % (method.title(), test_statistics[method]), 'p-value'
        ],
        name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = [
        (dm1[id1, id2], dm2[id1, id2])
        for id1, id2 in itertools.combinations(dm1.ids, 2)
    ]

    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])

    fig = plt.figure()
    try:
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
        plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))
    finally:
        # The figure is only needed for the file written above; close it so
        # it is not kept open after this call.
        plt.close(fig)

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test whether groups of samples differ in their pairwise distances.

    Runs the selected significance test on ``distance_matrix`` grouped by
    ``metadata``, writes one distance boxplot (PNG and PDF) per group, the
    raw pairwise distances (``raw_data.tsv``), optionally the pairwise test
    results (``<method>-pairwise.csv``), and renders the index template.

    Parameters
    ----------
    output_dir : str
        Directory receiving every generated file.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.CategoricalMetadataColumn
        Group label for each sample. Samples with missing values are
        dropped from both the metadata and the distance matrix.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        Also run the test for every pair of groups, with Benjamini-Hochberg
        corrected q-values.
    permutations : int
        Number of permutations passed to the test.

    Raises
    ------
    ValueError
        If ``method`` is not a known significance method.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix,
                                        metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(group_name, list(series.index))
         for group_name, series in natsorted(metadata.groupby(metadata))])

    pairs_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                     'Distance']
    # Collect one frame per group and concatenate once after the loop;
    # concatenating inside the loop re-copies all earlier rows every time.
    # The empty seed frame keeps the column layout when there are no groups.
    pairs_frames = [pd.DataFrame(columns=pairs_columns)]
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        pairs_frames.append(
            pd.DataFrame(group_pairs_summary, columns=pairs_columns))

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Group ids become file names, so they are URL-quoted first.
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary = pd.concat(pairs_frames)
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    # Shrink the row width until the groups fill every row completely.
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    """Test whether groups of samples differ in their pairwise distances.

    Runs the chosen significance test, saves one distance boxplot (PNG and
    PDF) per group, optionally saves the pairwise test table
    (``<method>-pairwise.csv``) and renders the index template into
    ``output_dir``.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        known_methods = ', '.join(_beta_group_significance_fns)
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, known_methods))

    # Numeric-looking labels are converted to numbers so the boxplots sort
    # naturally. The labels are then restricted to samples in the distance
    # matrix, and samples with no value for this category (including empty
    # strings) are discarded.
    group_labels = pd.to_numeric(metadata.to_series(), errors='ignore')
    group_labels = group_labels.loc[list(distance_matrix.ids)]
    group_labels = group_labels.replace(r'', numpy.nan).dropna()

    # Drop the same samples from the distance matrix and remember the sizes
    # before and after so the template can report them.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(group_labels.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Overall significance test.
    result = beta_group_significance_fn(distance_matrix,
                                        group_labels,
                                        permutations=permutations)

    sns.set_style("white")

    # Ordered mapping of group id -> sample ids in that group. The order
    # drives both the x-axis and the layout of the boxplots.
    groupings = collections.OrderedDict()
    for label, members in sorted(group_labels.groupby(group_labels)):
        groupings[label] = list(members.index)

    flier_style = {'marker': 'o', 'markeredgecolor': 'black',
                   'markeredgewidth': 0.5, 'alpha': 0.5}
    for group_id in groupings:
        group_distances, x_ticklabels = _get_distance_boxplot_data(
            distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops=flier_style)
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # Boxes are drawn white.
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()

        fig = ax.get_figure()
        # Group ids become file names, so they are URL-quoted first.
        safe_id = urllib.parse.quote_plus(str(group_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % safe_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % safe_id))
        fig.clear()

    result_html = q2templates.df_to_html(result.to_frame())

    if not pairwise:
        pairwise_results_html = None
    else:
        rows = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            stats = _get_pairwise_group_significance_stats(
                distance_matrix=distance_matrix,
                group1_id=group1_id,
                group2_id=group2_id,
                groupings=groupings,
                metadata=group_labels,
                beta_group_significance_fn=beta_group_significance_fn,
                permutations=permutations)
            rows.append([group1_id,
                         group2_id,
                         stats['sample size'],
                         permutations,
                         stats['test statistic'],
                         stats['p-value']])

        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(rows, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # Second element of the multipletests result holds the corrected
        # p-values.
        corrected = multipletests(pairwise_results['p-value'],
                                  method='fdr_bh')
        pairwise_results['q-value'] = corrected[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_results.to_csv(
            os.path.join(output_dir, '%s-pairwise.csv' % method))

        pairwise_results_html = q2templates.df_to_html(pairwise_results)

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    }
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
def visualize_stats(output_dir: str, deblur_stats: pd.DataFrame) -> None:
    """Render the Deblur per-sample statistics as a sortable HTML table.

    Adds three derived fraction columns, orders the samples by
    ``fraction-artifact-with-minsize`` (descending), attaches a tooltip to
    each column header, copies the table-sorter script and renders the
    index template into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving ``js/tsorter.min.js`` and the rendered template.
    deblur_stats : pd.DataFrame
        Per-sample Deblur statistics. The frame is not modified.
    """
    # Work on a copy: adding the derived columns to the caller's frame would
    # make a second call on the same frame strip real columns in the
    # ``[:-3]`` reordering below.
    deblur_stats = deblur_stats.copy()

    total_artifact = deblur_stats['reads-hit-artifact']
    total_input = deblur_stats['reads-raw']
    deblur_stats['fraction-artifact'] = total_artifact / total_input

    minsize_drop = deblur_stats['reads-raw'] - deblur_stats['reads-derep']
    deblur_stats['fraction-artifact-with-minsize'] = \
        (total_artifact + minsize_drop) / total_input

    total_not_ref = deblur_stats['reads-missed-reference']
    total_deblur = deblur_stats['reads-deblur']
    total_chim = deblur_stats['reads-chimeric']
    deblur_stats['fraction-missed-reference'] = \
        total_not_ref / (total_deblur - total_chim)

    # Reorder: the three derived columns were appended last; move them to
    # directly follow the first original column.
    columns = list(deblur_stats.columns)[:-3]
    columns.insert(1, 'fraction-missed-reference')
    columns.insert(1, 'fraction-artifact')
    columns.insert(1, 'fraction-artifact-with-minsize')

    # Select and sort in one expression so no in-place operation runs on a
    # column-selected slice.
    deblur_stats = deblur_stats[columns].sort_values(
        'fraction-artifact-with-minsize', ascending=False)

    deblur_stats = deblur_stats.reset_index()

    html = q2templates.df_to_html(deblur_stats)
    html = html.replace('table-hover"', 'table-hover" id="stats"')

    # Inject tooltips by rewriting the header cells of the generated HTML.
    description_sources = STATS_DESCRIPTIONS.copy()
    description_sources.update(COMPUTED_DESCRIPTIONS)

    htmlparts = html.splitlines()
    headstart = None
    headend = None
    for idx, line in enumerate(htmlparts):
        if '<thead>' in line:
            headstart = idx
        elif '</thead>' in line:
            headend = idx

    regex = re.compile("<th>(.*?)</th>")
    new_header = []
    for entry in htmlparts[headstart:headend]:
        # Empty header cells are left alone.
        if '<th>' in entry and entry.strip() != '<th></th>':
            label = regex.findall(entry)[0]
            desc = description_sources[label]
            entry = ('<th data-toggle="tooltip" '
                     'title="%s" '
                     'data-tsorter="numeric">%s</th>' % (desc, label))
        new_header.append(entry)

    htmlparts[headstart:headend] = new_header
    html = '\n'.join(htmlparts)

    index = os.path.join(TEMPLATES, 'index.html')
    context = {
        'result': html
    }

    js = os.path.join(TEMPLATES, 'js', 'tsorter.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))

    q2templates.render(index, output_dir, context=context)
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Compare alpha diversity between the groups of each metadata column.

    For every usable categorical metadata column this runs a Kruskal-Wallis
    test across all groups and across every pair of groups (with
    Benjamini-Hochberg corrected q-values), writes the pairwise table as CSV
    and a ``column-<name>.jsonp`` payload, then renders the index template
    and copies the ``dist`` assets into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving every generated file.
    alpha_diversity : pd.Series
        Alpha diversity value per sample. Its index name is set to ``'id'``.
    metadata : qiime2.Metadata
        Sample metadata; filtered to the samples in ``alpha_diversity``.

    Raises
    ------
    ValueError
        If no metadata column satisfies the visualizer's requirements.
    """
    def _js_single_quoted(value):
        # Escape text placed inside a single-quoted JavaScript literal.
        # Backslashes go first so the quote escapes are not doubled.
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # quote() leaves '/' untouched by default; encode it too so the
        # column name cannot introduce a path separator into the file name.
        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                except ValueError:
                    # The comparison could not be computed; report the pair
                    # in the visualization instead of failing.
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
                else:
                    kw_H_pairwise.append([names[j], names[i], H, p])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # The payload is a call to load_data(name, groups, counts, overall
        # test, pairwise table HTML, pairwise file name, metric name);
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % _js_single_quoted(column))
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _js_single_quoted(metric_name)))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index, output_dir, context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
                ', '.join(sorted(non_categorical_columns)),
            'filtered_columns': ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
                '; '.join([' vs '.join(e)
                           for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write a Kruskal-Wallis group-significance report into ``output_dir``.

    Numeric metadata categories are set aside; each remaining category is
    used to group the alpha diversity values.  A category is analysed when it
    yields more than one group and the number of groups differs from the
    number of rows; otherwise it is reported as filtered.  For every analysed
    category a ``category-*.jsonp`` data file and a
    ``kruskal-wallis-pairwise-*.csv`` table are written, after which the index
    template is rendered and the ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    alpha_diversity : pd.Series
        Alpha diversity values; ``alpha_diversity.name`` is the metric name
        and ``alpha_diversity.index`` is used to select metadata rows.
    metadata : qiime2.Metadata
        Metadata providing the grouping categories.

    Raises
    ------
    ValueError
        If no non-numeric category is left after filtering.
    """
    def _js_escape(value):
        # The value ends up inside a single-quoted JavaScript literal.
        # Backslashes go first so the quote escapes are not doubled.
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category.loc[alpha_diversity.index]
        # Empty strings count as missing values.
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if len(groups) <= 1 or len(groups) == len(data.index):
            filtered_categories.append(category)
            continue

        # quote() leaves '/' untouched by default; escape it as well so the
        # category name can never be read as a sub-directory of output_dir.
        escaped_category = quote(category).replace('/', '%2F')
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # Pairs the test rejects are reported, not fatal.
                    filtered_group_comparisons.append(
                        ['%s:%s' % (category, names[i]),
                         '%s:%s' % (category, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            df = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % _js_escape(category))
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _js_escape(metric_name)))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        # Sorted so the warning text does not depend on set ordering.
        'filtered_numeric_categories':
            ', '.join(sorted(filtered_numeric_categories)),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def _visualize(output_dir, title, running_title, results,
               false_negative_features=None, misclassifications=None,
               underclassifications=None, composition_regression=None,
               score_plot=None, mismatch_histogram=None, alignments=None):
    """Save the supplied tables and figures and render the index template.

    ``results`` is always written to ``results.tsv``.  Each optional argument
    that is not ``None`` is saved under ``output_dir`` using its own name:
    tables as ``.tsv`` files, figures as ``.png`` and ``.pdf``.  Tables are
    converted to HTML before being handed to the template; figures are
    passed through as-is.  ``alignments`` is first written as a table and
    then replaced by the figure returned from
    ``_plot_alignments_as_heatmap``.
    """
    # ``None`` disables column-width truncation.  The former ``-1`` sentinel
    # is deprecated and is rejected by newer pandas releases.
    pd.set_option('display.max_colwidth', None)

    def _save_table(frame, stem):
        # Write the TSV, then return the HTML rendering for the template.
        frame.to_csv(join(output_dir, '%s.tsv' % stem), sep='\t')
        return q2templates.df_to_html(frame, index=True)

    def _save_figure(figure, stem):
        for extension in ('png', 'pdf'):
            figure.savefig(join(output_dir, '%s.%s' % (stem, extension)),
                           bbox_inches='tight')

    # save results
    results.to_csv(join(output_dir, 'results.tsv'), sep='\t')
    results = q2templates.df_to_html(results, index=False)

    if false_negative_features is not None:
        false_negative_features = _save_table(
            false_negative_features, 'false_negative_features')

    if misclassifications is not None:
        misclassifications = _save_table(
            misclassifications, 'misclassifications')

    if underclassifications is not None:
        underclassifications = _save_table(
            underclassifications, 'underclassifications')

    if composition_regression is not None:
        _save_figure(composition_regression, 'composition_regression')

    if score_plot is not None:
        _save_figure(score_plot, 'score_plot')

    if mismatch_histogram is not None:
        _save_figure(mismatch_histogram, 'mismatch_histogram')

    if alignments is not None:
        alignments.to_csv(join(output_dir, 'alignments.tsv'), sep='\t')
        alignments = _plot_alignments_as_heatmap(alignments)
        _save_figure(alignments, 'alignments')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'running_title': running_title,
        'results': results,
        'false_negative_features': false_negative_features,
        'misclassifications': misclassifications,
        'underclassifications': underclassifications,
        'composition_regression': composition_regression,
        'score_plot': score_plot,
        'mismatch_histogram': mismatch_histogram,
        'alignments': alignments,
    })
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write a Kruskal-Wallis group-significance report into ``output_dir``.

    The metadata is restricted to the IDs in ``alpha_diversity`` and to
    categorical columns that are not all-unique, zero-variance or
    all-missing.  For each remaining column this writes a ``column-*.jsonp``
    data file and a ``kruskal-wallis-pairwise-*.csv`` table.  The merged
    metadata is saved as ``metadata.tsv``, the index template is rendered,
    and the ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    alpha_diversity : pd.Series
        Alpha diversity values; ``alpha_diversity.name`` is the metric name.
        NOTE: its index name is set to ``'id'`` in place.
    metadata : qiime2.Metadata
        Metadata providing the grouping columns.

    Raises
    ------
    ValueError
        If no metadata column satisfies the requirements above.
    """
    def _js_escape(value):
        # The value ends up inside a single-quoted JavaScript literal.
        # Backslashes go first so the quote escapes are not doubled.
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # '/' is additionally escaped so the column name cannot be read as a
        # sub-directory of output_dir.
        escaped_column = quote(column).replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # Pairs the test rejects are reported, not fatal.
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % _js_escape(column))
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _js_escape(metric_name)))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    """Write frequency plots, detail tables and HTML pages for ``table``.

    Per-sample and per-feature summaries come from ``_frequency_summary``.
    A histogram is saved (PDF and PNG) for samples when the table has more
    than one sample, and for features when it has more than one feature.
    The sorted frequencies are written as CSV, the summarize templates are
    rendered, the ``dist`` assets are copied, and ``data.jsonp`` is written
    with the optional sample metadata and the sample frequencies.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    table : biom.Table
        Feature table to summarize; ``table.shape`` is read as
        ``(number_of_features, number_of_samples)``.
    sample_metadata : qiime2.Metadata, optional
        When given, it is filtered to the table's sample IDs and embedded in
        ``data.jsonp``; otherwise an empty object is written in its place.
    """
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples**(1 / 3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_figure = sample_frequencies_ax.get_figure()
        sample_figure.savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_figure.savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_figure = feature_frequencies_ax.get_figure()
        feature_figure.savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_figure.savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))
        # Clear this figure as well; otherwise the log-scaled axes stay
        # current and a later plot in the same process draws onto them.
        plt.gcf().clear()

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # From here on the feature frequencies are display strings only.
    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({
        'max_count': sample_frequencies.max(),
        'feature_frequencies_table': feature_frequencies_table,
        'feature_qualitative_data': feature_qualitative_data,
        'tabs': [{'url': 'overview.html',
                  'title': 'Overview'},
                 {'url': 'sample-frequency-detail.html',
                  'title': 'Interactive Sample Detail'},
                 {'url': 'feature-frequency-detail.html',
                  'title': 'Feature Detail'}]
    })
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata is not None:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def ancom(output_dir: str, table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on ``table`` and write the results into ``output_dir``.

    The metadata column is restricted to the IDs in ``table.index`` and must
    have no missing values.  ``skbio_ancom`` is run with ``f_oneway`` as the
    significance test; both result tables are written as TSV, significant
    features are converted to HTML, and a Vega volcano plot spec
    (per-feature difference against ``W``) is built unless every difference
    is null.  The ``ancom`` template directory is copied and the index
    template rendered.

    Parameters
    ----------
    output_dir : str
        Directory that receives every generated file.
    table : pd.DataFrame
        Feature table whose index is matched against the metadata IDs; the
        transform is applied per row, the difference per column.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping column.
    transform_function : str
        Key into ``_transform_functions``; also used as the plot's x-axis
        field name and title.
    difference_function : str, optional
        Key into ``_difference_functions``.  When ``None``,
        ``'mean_difference'`` is used for exactly two groups and
        ``'f_statistic'`` otherwise.

    Raises
    ------
    ValueError
        If the metadata column has missing values for any retained sample.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    results_table = ancom_results[0]
    percent_abundances = ancom_results[1]

    results_table.sort_values(by='W', ascending=False, inplace=True)
    results_table.rename(columns={'reject': 'Reject null hypothesis'},
                         inplace=True)
    significant_features = results_table[
        results_table['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            percent_abundances.loc[significant_features.index])

    metadata = metadata.to_series()
    # Sorted so the group order, and with it the sign of a two-group
    # difference, does not depend on set iteration order.
    cats = sorted(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        # Some difference functions return a tuple; only the first element
        # is used.
        if isinstance(args, tuple):
            return args[0]
        else:
            return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': results_table.W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             'signal': "{{'title': datum['index'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}".format(
                                           transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    results_table.to_csv(os.path.join(output_dir, 'ancom.tsv'),
                         header=True, index=True, sep='\t')
    percent_abundances.to_csv(os.path.join(output_dir,
                                           'percent-abundances.tsv'),
                              header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)