def anova(output_dir: str, metadata: qiime2.Metadata, formula: str,
          sstype: str = 'II') -> None:
    # Grab metric and covariate names from formula
    metric, group_columns = _parse_formula(formula)
    columns = [metric] + list(group_columns)

    # Validate formula (columns are in metadata, etc.)
    for col in columns:
        metadata.get_column(col)

    # Store categorical column names for later use
    cats = metadata.filter_columns(column_type='categorical').columns.keys()
    metadata = metadata.to_dataframe()[columns].dropna()

    # Run ANOVA
    lm = ols(formula, metadata).fit()
    results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
    results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

    # Run pairwise t-tests with multiple test correction
    pairwise_tests = pd.DataFrame()
    for group in group_columns:
        # Only run on categorical columns; numeric columns raise an error
        if group in cats:
            ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
            pairwise_tests = pd.concat([pairwise_tests,
                                        pd.DataFrame(ttests)])
    if pairwise_tests.empty:
        pairwise_tests = False

    # Plot fit vs. residuals
    metadata['residual'] = lm.resid
    metadata['fitted_values'] = lm.fittedvalues
    res = _regplot_subplots_from_dataframe(
        'fitted_values', 'residual', metadata, group_columns,
        lowess=False, ci=95, palette='Set1', fit_reg=False)

    # Visualize results
    _visualize_anova(output_dir, pairwise_tests=pairwise_tests,
                     model_results=results, residuals=res,
                     pairwise_test_name='Pairwise t-tests')
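# `_parse_formula` is defined elsewhere in the module. A minimal sketch of
# the behavior `anova` relies on, assuming it splits a patsy-style formula
# into the response metric and the set of right-hand-side column names. The
# helper name below is hypothetical and only illustrates the contract.
from patsy import ModelDesc


def _parse_formula_sketch(formula):
    desc = ModelDesc.from_formula(formula)
    # The left-hand side is the metric being modeled
    metric = desc.lhs_termlist[0].name()
    # Collect every factor name appearing on the right-hand side
    group_columns = {f.name() for t in desc.rhs_termlist for f in t.factors}
    return metric, group_columns


# _parse_formula_sketch('shannon ~ group * treatment')
# -> ('shannon', {'group', 'treatment'})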
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # Filter metadata IDs; they must be in the same order as the distance
    # matrix IDs
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
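# Why the formula validation above works: patsy's ModelDesc decomposes the
# right-hand side into terms, and each term into factors whose names should
# correspond to metadata columns. A small, self-contained illustration (the
# example formula is made up):
from patsy import ModelDesc

desc = ModelDesc.from_formula('BodySite + Subject:Month')
factor_names = [f.name()
                for t in desc.rhs_termlist
                for f in t.factors]
# ['BodySite', 'Subject', 'Month'] -- each of these is looked up with
# metadata.get_column(), so a typo fails fast, before R is invoked.
print(factor_names)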
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the
    # metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this
    # visualizer displays separate warnings for non-categorical columns,
    # and categorical columns that didn't satisfy the requirements of the
    # statistics being computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # Save out metadata for download in the visualization
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # Perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # Perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)
            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
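# A small, self-contained illustration of the pairwise Kruskal-Wallis plus
# Benjamini-Hochberg FDR pattern used above, on made-up data:
import itertools

import scipy.stats
from statsmodels.stats.multitest import multipletests

groups = {'gut': [2.1, 2.4, 2.2], 'skin': [3.3, 3.1, 3.6],
          'oral': [2.8, 2.9, 2.7]}
rows = []
for (n1, g1), (n2, g2) in itertools.combinations(groups.items(), 2):
    # Same test the visualizer runs per pair of groups
    H, p = scipy.stats.mstats.kruskalwallis(g1, g2)
    rows.append((n1, n2, H, p))
# Correct all pairwise p-values in one pass
qvals = multipletests([r[3] for r in rows], method='fdr_bh')[1]
for (n1, n2, H, p), q in zip(rows, qvals):
    print('%s vs %s: H=%.3f p=%.3f q=%.3f' % (n1, n2, H, p, q))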
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the
    # metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # Save out metadata for download in the visualization
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # Create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1, join='inner')

        # Compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
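# `_alpha_correlation_fns` is defined elsewhere in the module. Based on the
# supported methods and on how the result is indexed above
# (`correlation_result[0]` for the statistic, `[1]` for the p-value), it is
# presumably a mapping like this sketch; both scipy functions return a
# (statistic, p-value) tuple:
import scipy.stats

_alpha_correlation_fns_sketch = {
    'spearman': scipy.stats.spearmanr,
    'pearson': scipy.stats.pearsonr,
}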
def gtr_single_partition(alignment: qiime2.Metadata,
                         time: qiime2.NumericMetadataColumn,
                         n_generations: int,
                         sample_every: int,
                         time_uncertainty:
                             qiime2.NumericMetadataColumn = None,
                         base_freq: str = "estimated",
                         site_gamma: int = 4,
                         site_invariant: bool = True,
                         clock: str = 'ucln',
                         coalescent_model: str = 'skygrid',
                         skygrid_intervals: int = None,
                         skygrid_duration: float = None,
                         print_every: int = None,
                         use_gpu: bool = False,
                         n_threads: int = 1) -> BEASTPosteriorDirFmt:
    if coalescent_model == 'skygrid':
        if skygrid_duration is None or skygrid_intervals is None:
            raise ValueError(
                "The 'skygrid' coalescent model requires both "
                "skygrid_duration and skygrid_intervals to be set.")

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError('use_gpu requires n_threads=1.')
        beast_call += ['-beagle_GPU', '-beagle_cuda',
                       '-beagle_instances', '1']
    else:
        beast_call += ['-beagle_CPU', '-beagle_SSE',
                       '-beagle_instances', str(n_threads)]

    # Set up the directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())
    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into the control file
    seq_series = alignment.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat([seq_series, time_series, uncertainty_series],
                           axis='columns', join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq', 'time', 'time_uncertainty']
    # pd.np was removed in pandas 1.0; assumes numpy is imported as np
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate the control file for BEAST
    template_kwargs = dict(
        trees_file=trees_file, ops_file=ops_file, log_file=log_file,
        sample_every=sample_every, print_every=print_every,
        n_generations=n_generations, time_unit='years', samples=samples,
        base_freq=base_freq, site_gamma=site_gamma,
        site_invariant=site_invariant, clock=clock,
        coalescent_model=coalescent_model,
        skygrid_duration=skygrid_duration,
        skygrid_intervals=skygrid_intervals)

    template = _get_template("gtr_single_partition.xml")
    template.stream(**template_kwargs).dump(control_file)
    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result
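# `_get_template` is not shown in this section. The `.stream(...).dump(...)`
# calls above are the jinja2 Template API, so a minimal sketch of what it
# presumably does is below; the `templates` package directory is an
# assumption:
import jinja2


def _get_template_sketch(filename):
    env = jinja2.Environment(
        loader=jinja2.PackageLoader(__name__, 'templates'),
        # The templates are BEAST XML control files, not HTML, so no
        # autoescaping is wanted here
        autoescape=False)
    return env.get_template(filename)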
def site_heterogeneous_hky(
        coding_regions: qiime2.Metadata,
        noncoding_regions: qiime2.Metadata,
        time: qiime2.NumericMetadataColumn,
        n_generations: int,
        sample_every: int,
        print_every: int = None,
        time_uncertainty: qiime2.NumericMetadataColumn = None,
        use_gpu: bool = False,
        n_threads: int = 1) -> BEASTPosteriorDirFmt:
    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError('use_gpu requires n_threads=1.')
        beast_call += ['-beagle_GPU', '-beagle_cuda',
                       '-beagle_instances', '1']
    else:
        beast_call += ['-beagle_CPU', '-beagle_SSE',
                       '-beagle_instances', str(n_threads)]

    # Set up the directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())
    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into the control file
    orf_series = coding_regions.get_column('Sequence').to_series()
    nc_series = noncoding_regions.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        # time_uncertainty defaults to None; mirror gtr_single_partition
        # and fall back to an all-missing series instead of crashing
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat(
        [orf_series, nc_series, time_series, uncertainty_series],
        axis='columns', join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq_orf', 'seq_nc', 'time', 'time_uncertainty']
    # pd.np was removed in pandas 1.0; assumes numpy is imported as np
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate the control file for BEAST
    template_kwargs = dict(
        trees_file=trees_file, ops_file=ops_file, log_file=log_file,
        sample_every=sample_every, print_every=print_every,
        n_generations=n_generations, time_unit='years', samples=samples)

    template = _get_template("orf_and_nc.xml")
    template.stream(**template_kwargs).dump(control_file)
    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result
def preprocess(
    ctx,
    table,
    metadata,
    sampling_depth,
    min_frequency,
    target_variable,
    discrete,
    phylogeny=None,
    with_replacement=False,
    n_jobs=1,
):
    # Define the QIIME 2 actions to call
    rarefy = ctx.get_action("feature_table", "rarefy")
    filter_min_features = ctx.get_action("feature_table", "filter_features")
    filter_samples = ctx.get_action("feature_table", "filter_samples")
    beta = ctx.get_action("diversity", "beta")
    beta_phylogenetic = ctx.get_action("diversity", "beta_phylogenetic")
    filter_features = ctx.get_action("fragment-insertion", "filter_features")

    results = []

    print("Initial sizes")
    print_datasize(table, metadata)

    initial_ids_to_keep = table.view(biom.Table).ids()
    table_id_set = set(initial_ids_to_keep)
    metadata_id_set = set(metadata.ids)
    shared_ids = table_id_set.intersection(metadata_id_set)
    num_shared_ids = len(shared_ids)
    if num_shared_ids == 0:
        raise ValueError(
            "No sample IDs are shared between Table and Metadata")
    print("# of shared sample IDs between Table and Metadata: ",
          num_shared_ids, "\n")

    # Filter metadata by samples in the table
    print("Filtering Metadata by samples in Table")
    filtered_metadata = metadata.filter_ids(ids_to_keep=shared_ids)
    print_datasize(table, filtered_metadata)

    # Drop samples with NaN in the target_variable column and reduce the
    # metadata to a one-column mapping of sample ID to target
    print("Filtering samples from Metadata where NaN in target_variable "
          "column")
    print("Reducing Metadata to 1 column mapping of sample-id to target")
    df = filtered_metadata.to_dataframe()
    clean_subset_df = clean_metadata(
        df=df, target_variable=target_variable, discrete=discrete)
    target_mapping = Metadata(clean_subset_df)
    print_datasize(table, target_mapping)

    # Filter features that do not exist in the phylogeny
    if phylogeny:
        print("Filtering features from Table that do not exist in phylogeny")
        phylo_filtered_results = filter_features(table=table, tree=phylogeny)
        table = phylo_filtered_results.filtered_table
        print_datasize(table, target_mapping)

    # Filter low-abundance features from the table
    print(f"Filtering low-abundance features (frequency<{min_frequency}) "
          "from Table")
    (table,) = filter_min_features(table=table, min_frequency=min_frequency)
    print_datasize(table, target_mapping)

    # Rarefy the table to sampling_depth
    print(f"Rarefying Table to sampling depth of {sampling_depth}")
    (rarefied_table,) = rarefy(
        table=table,
        sampling_depth=sampling_depth,
        with_replacement=with_replacement,
    )
    print_datasize(rarefied_table, target_mapping)

    print("Filtering Rarefied Table by samples in Metadata")
    filtered_rarefied_table_results = filter_samples(
        table=rarefied_table, metadata=target_mapping)
    filtered_rarefied_table = filtered_rarefied_table_results.filtered_table
    print_datasize(filtered_rarefied_table, target_mapping)
    results += filtered_rarefied_table_results

    # Refilter target_mapping by samples in the rarefied table
    print("Refiltering Metadata by samples in Rarefied Table")
    ids_to_keep = filtered_rarefied_table.view(biom.Table).ids()
    target_mapping = target_mapping.filter_ids(ids_to_keep=ids_to_keep)
    print_datasize(filtered_rarefied_table, target_mapping)

    # Filter the unrarefied table by samples in the metadata so that it
    # matches the rarefied table
    print("Filtering Unrarefied Table by samples in Metadata to match "
          "Rarefied Table")
    filtered_table_results = filter_samples(
        table=table, metadata=target_mapping)
    print_datasize(filtered_table_results.filtered_table, target_mapping)
    results += filtered_table_results

    # Some transformations to get the data into the correct format for the
    # artifact
    target_mapping_col = target_mapping.get_column(target_variable)
    target_mapping_series = target_mapping_col.to_series()

    print("Reindexing Metadata to match Sample ID order of Table")
    target_mapping_series = target_mapping_series.reindex(
        index=ids_to_keep, copy=False)

    print("Validating Table and Metadata Sample ID agreement...")
    if list(target_mapping_series.index) != list(ids_to_keep):
        print(list(target_mapping_series.index))
        print(ids_to_keep)
        raise ValueError(
            "Table and Metadata Sample IDs do not match in contents and/or "
            "order")

    target_mapping_artifact = ctx.make_artifact(
        "SampleData[Target]", target_mapping_series)
    results += [target_mapping_artifact]

    # Generate distance matrices
    print("Generating Distance Matrices...")
    for metric in ["jaccard", "braycurtis", "jensenshannon", "aitchison"]:
        beta_results = beta(
            table=filtered_rarefied_table, metric=metric, n_jobs=n_jobs)
        results += beta_results
    if phylogeny:
        for metric in ["unweighted_unifrac", "weighted_unifrac"]:
            beta_phylo_results = beta_phylogenetic(
                table=filtered_rarefied_table,
                phylogeny=phylogeny,
                metric=metric,
                threads=n_jobs,
            )
            results += beta_phylo_results
    else:
        # No phylogeny, so return empty placeholder matrices for the two
        # phylogenetic metrics.
        results += 2 * [Artifact.import_data(
            "DistanceMatrix", skbio.DistanceMatrix(data=[]))]

    return tuple(results)
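# `clean_metadata` is a helper defined elsewhere. A minimal sketch of what
# it presumably does, given how it is used above (keep only the target
# column, drop samples with a missing target, and use `discrete` to decide
# whether the target is treated as categorical or numeric); the dtype
# coercion is an assumption:
import pandas as pd


def clean_metadata_sketch(df, target_variable, discrete):
    # Keep just the target column and drop samples with no target value
    subset = df[[target_variable]].dropna()
    # Coerce the dtype so downstream actions see a consistent column type
    if discrete:
        subset[target_variable] = subset[target_variable].astype(str)
    else:
        subset[target_variable] = pd.to_numeric(subset[target_variable])
    return subset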