def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None,
                  feature_metadata: qiime2.Metadata = None):
    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the
    # non-numeric values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the
    # filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
def test_view_as_metadata(self):
    A = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})

    obs_md = A.view(Metadata)

    exp_df = pd.DataFrame({'a': '1', 'b': '3'},
                          index=pd.Index(['0'], name='id', dtype=object),
                          dtype=object)
    exp_md = Metadata(exp_df)
    exp_md._add_artifacts([A])
    self.assertEqual(obs_md, exp_md)

    # This check is redundant because `Metadata.__eq__` being used above
    # takes source artifacts into account. Doesn't hurt to have an explicit
    # check though, since this API didn't always track source artifacts
    # (this check also future-proofs the test in case `Metadata.__eq__`
    # changes in the future).
    self.assertEqual(obs_md.artifacts, (A,))
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the
    # filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
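# Hedged usage sketch for the bioenv visualizer above (not part of the
# original source): the IDs, metadata values, and the 'bioenv-viz' output
# directory are all hypothetical. Assumes the package defining bioenv and
# its dependencies are installed.
import pandas as pd
import qiime2
import skbio

dm = skbio.DistanceMatrix([[0.0, 0.5, 0.7],
                           [0.5, 0.0, 0.3],
                           [0.7, 0.3, 0.0]], ids=['S1', 'S2', 'S3'])
md = qiime2.Metadata(pd.DataFrame(
    {'ph': [6.8, 7.1, 6.4], 'elevation': [100.0, 210.0, 150.0]},
    index=pd.Index(['S1', 'S2', 'S3'], name='id')))
# bioenv('bioenv-viz', dm, md)  # writes index.html into 'bioenv-viz/'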
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: int = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs.
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate the formula: every term must name a metadata column.
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
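# A minimal sketch (not from the original source) of the patsy-based
# formula validation used in adonis above: ModelDesc exposes the
# right-hand-side terms, so each factor name can be checked against the
# metadata columns. The column names here are hypothetical.
from patsy import ModelDesc

terms = ModelDesc.from_formula('ph + depth * site')
factor_names = {f.name() for t in terms.rhs_termlist for f in t.factors}
# factor_names == {'ph', 'depth', 'site'}; adonis calls
# metadata.get_column(...) on each one to fail fast on unknown columns.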
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str = None,
                           exclude_ids: bool = False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix
    # because `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its
    # iteration order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
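# Hypothetical usage of filter_distance_matrix above; the metadata column
# and the `where` clause are illustrative. `where` takes the SQLite WHERE
# syntax that qiime2.Metadata ID selection supports.
import pandas as pd
import qiime2
import skbio

dm = skbio.DistanceMatrix([[0.0, 1.0], [1.0, 0.0]], ids=['S1', 'S2'])
md = qiime2.Metadata(pd.DataFrame(
    {'body_site': ['gut', 'palm']},
    index=pd.Index(['S1', 'S2'], name='id')))
# Keep only gut samples:
# gut_dm = filter_distance_matrix(dm, md, where="body_site='gut'")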
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str = None) -> None:
    mf = metadata.to_dataframe()
    viz = Emperor(pcoa, mf, remote='.')

    if custom_axis is not None:
        # put custom_axis inside a list to workaround the type system not
        # supporting lists of types
        html = viz.make_emperor(standalone=True, custom_axes=[custom_axis])
    else:
        html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a '
                         'table are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty:
        raise ValueError('All features were filtered out of the data.')
    return filtered
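# Hypothetical usage of filter_seqs above, keeping only the sequences whose
# feature IDs appear in a metadata file; the IDs and sequences are made up.
import pandas as pd
import qiime2

seqs = pd.Series(['ACGT', 'TTGA', 'GGCC'],
                 index=['feat1', 'feat2', 'feat3'])
feature_md = qiime2.Metadata(pd.DataFrame(
    {'length': ['4', '4']},
    index=pd.Index(['feat1', 'feat3'], name='id')))
# kept = filter_seqs(seqs, metadata=feature_md)    # keeps feat1 and feat3
# dropped = filter_seqs(seqs, metadata=feature_md, exclude_ids=True)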
def add_dietary_phase(
    host_subject_id,
    phase_name,
    key_dates_spreadsheet,
    input_metadata_file,
    output_metadata_file,
) -> None:
    """Encodes dietary phase information into a sample metadata file.

    The main information needed for this are the phase name (-p) and the
    key dates spreadsheet (-k). This program looks for rows in the key
    dates spreadsheet where the "Event" column contains the text
    "Started PHASENAME" or "Stopped PHASENAME", where PHASENAME is just the
    string you specified in the -p option.

    This program will then use the dates associated with these rows to
    determine ranges of dates for which the given dietary phase was being
    followed -- this is useful if the subject went on and off a diet
    multiple times. The start date of a phase is counted as being in that
    range; the end date is NOT counted as being in that range.

    Finally, this will add a PHASENAME column to the metadata file. Samples
    will be assigned one of three possible values in this column:

    Samples where host_subject_id is equal to the -hsid parameter AND the
    collection_timestamp falls within a dietary phase range will be
    labelled "TRUE".

    Samples where host_subject_id is equal to the -hsid parameter AND the
    collection_timestamp DOES NOT fall within a dietary phase range will be
    labelled "FALSE".

    Samples where host_subject_id is NOT EQUAL to the -hsid parameter will
    be labelled "not applicable".

    This only treats dates as down to the day. So if the subject started a
    diet at 12pm on a day and then ended that diet at 5pm that same day,
    this code will treat both of these dates as occurring on the same day
    and thus raise an error.
    """
    m = Metadata.load(input_metadata_file)
    m_df = m.to_dataframe()

    # Validate the input metadata file, somewhat
    required_cols = {"host_subject_id", "collection_timestamp"}
    if len(required_cols & set(m_df.columns)) < len(required_cols):
        raise ValueError(
            "Input metadata file must include the following columns: "
            "{}".format(required_cols))
    if phase_name in m_df.columns:
        raise ValueError(
            "A {} column already exists in the input metadata!".format(
                phase_name))

    # Validate the key dates spreadsheet, somewhat
    kd = pd.read_excel(key_dates_spreadsheet, index_col=0)
    # I didn't actually know this functionality existed until I saw this SO
    # answer: https://stackoverflow.com/a/57187654/10730311
    if not pd.api.types.is_datetime64_any_dtype(kd.index):
        raise ValueError(
            "First column of the key dates spreadsheet must contain "
            "dates/timestamps")
    if "Event" not in kd.columns:
        raise ValueError(
            'Key dates spreadsheet must contain an "Event" column')

    # Determine ranges for starting/stopping a given diet (this requires a
    # decent amount of validation)
    starting_dates = kd.loc[
        kd["Event"].str.find("Started {}".format(phase_name)) >= 0]
    if len(starting_dates.index) < 1:
        raise ValueError("No starting dates for the specified phase given")

    stopping_dates = kd.loc[
        kd["Event"].str.find("Stopped {}".format(phase_name)) >= 0]
    if len(stopping_dates.index) < 1:
        raise ValueError("No stopping dates for the specified phase given")

    if len(starting_dates.index) != len(stopping_dates.index):
        raise ValueError(
            "Number of starting/stopping dates must be consistent (if the "
            "phase continues to the final sample, then you'll need to add "
            "a stopping row for the day of or after that sample)")

    print('Found {} ranges for the "{}" dietary phase.'.format(
        len(starting_dates.index), phase_name))

    # We now know that we have an equal (and >= 1) number of starting and
    # stopping dates, but we'd like to know if the dates actually make
    # sense.
    #
    # This necessitates checking that every stopping date occurs later than
    # its corresponding starting date, *and* ensuring that every starting
    # date occurs later than the previous stopping date (i.e. the ranges
    # are in chronological order)
    #
    # You can think of this graphically as something like:
    #
    # A1---B1 A2--B2 A3B3 A4-----B4 A5-B5 A6--B6
    #
    # where each A is a starting date and each B is a stopping date. Notice
    # how these ranges are not overlapping, so they can just be represented
    # as a single line -- this is what we're checking for here.
    for i in range(len(starting_dates.index)):
        # NOTE: we use .date() to just get the date, not the timestamp, of
        # datetimes. This lets us do comparisons only down to the day
        # level. Thanks to https://stackoverflow.com/a/13227661/10730311.
        da = starting_dates.iloc[i].name.date()
        db = stopping_dates.iloc[i].name.date()
        if da >= db:
            raise ValueError("Starting date {} occurs later or on same "
                             "day as corresponding stopping date "
                             "{}.".format(da, db))
        if i > 0:
            prev_db = stopping_dates.iloc[i - 1].name.date()
            if da <= prev_db:
                raise ValueError(
                    "Starting date {} occurs earlier or on same day as "
                    "previous stopping date {}.".format(da, prev_db))

    # OK, now we know the ranges are good! We're done validating the inputs
    # at this point.
    m_df[phase_name] = "not applicable"
    for sample_id in m_df.index:
        if m_df.loc[sample_id, "host_subject_id"] == host_subject_id:
            # Parse sample timestamp
            sample_date = parse(m_df["collection_timestamp"][sample_id]).date()
            # If the sample was collected before any of the ranges, then
            # we'll never get into the first "if" statement in the for loop
            # below. That's fine; in this case, the sample doesn't fall in
            # any of the ranges, so we can safely leave its value as FALSE.
            phase_value = "FALSE"
            # Iterate backwards through ranges
            for ii in range(len(starting_dates.index))[::-1]:
                if sample_date >= starting_dates.iloc[ii].name.date():
                    if sample_date < stopping_dates.iloc[ii].name.date():
                        phase_value = "TRUE"
                        break
                    else:
                        # We know that this sample occurred after the
                        # current range. Furthermore, we're looking at the
                        # ranges in descending order, so we know that the
                        # sample wasn't in any ranges after this one.
                        # Therefore, we can conclusively say that this
                        # sample is not present in any ranges.
                        #
                        # ...However, the fact that this sample was
                        # collected *after* the diet was started for the
                        # first time could be interesting, esp. if the
                        # effects of the diet were residual. So we assign a
                        # special value for these samples; depending on how
                        # you want to interpret this data, this can be
                        # handled in a few different ways. (For stuff like
                        # plotting sample ordinations, making this
                        # distinction clear is useful.)
                        phase_value = "FALSE BUT TAKEN AFTER DIET START"
                        break
            m_df.loc[sample_id, phase_name] = phase_value
        # For samples where the host subject ID *does not* match the one
        # specified, the phase_name value will be left as "not applicable"

    # Cool, we're done!
    Metadata(m_df).save(output_metadata_file)
'stdv-kmer-per-region': np.std([1, 2], ddof=1), 'mapped-asvs': 'asv02|asv03|asv08' }, 'seq4': { 'num-regions': 1, 'total-kmers-mapped': 1, 'mean-kmer-per-region': 1, 'stdv-kmer-per-region': 0, 'mapped-asvs': 'asv09' }, 'seq5': { 'num-regions': 2, 'total-kmers-mapped': 2, 'mean-kmer-per-region': 1, 'stdv-kmer-per-region': 0, 'mapped-asvs': 'asv04|asv05|asv10', }, 'seq6': { 'num-regions': 2, 'total-kmers-mapped': 2, 'mean-kmer-per-region': 1, 'stdv-kmer-per-region': 0, 'mapped-asvs': 'asv04|asv05|asv11', }, }) db_summary.index.set_names('feature-id', inplace=True) db_summary = Artifact.import_data('FeatureData[ReconstructionSummary]', Metadata(db_summary))
def _14(obj: Metadata) -> ReconSummaryFormat:
    ff = ReconSummaryFormat()
    obj.save(str(ff))
    return ff
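# These transformers form a round trip: _14 above serializes a
# qiime2.Metadata into a ReconSummaryFormat on disk, and the _13
# transformer later in this file loads it back with Metadata.load(). A
# hedged sketch (the summary values are hypothetical):
import pandas as pd
from qiime2 import Metadata

df = pd.DataFrame({'num-regions': [2]},
                  index=pd.Index(['seq1'], name='feature-id'))
# ff = _14(Metadata(df))       # write to the format's file path
# md = Metadata.load(str(ff))  # what _13 does to read it back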
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman-Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1 / 3))
            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(TEMPLATES, 'summarize_assets',
                                     'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({
        'max_count': sample_frequencies.max(),
        'feature_frequencies_table': feature_frequencies_table,
        'feature_qualitative_data': feature_qualitative_data,
        'tabs': [{'url': 'overview.html',
                  'title': 'Overview'},
                 {'url': 'sample-frequency-detail.html',
                  'title': 'Interactive Sample Detail'},
                 {'url': 'feature-frequency-detail.html',
                  'title': 'Feature Detail'}]})

    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the
            # future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
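# Worked example (illustrative numbers, not from a real table) of the
# Freedman-Diaconis bin count used in summarize above: the bin width is
# twice the interquartile range divided by the cube root of the sample
# count, and the resulting bin count is floored at 5.
iqr = 150.0                                       # 3rd quartile - 1st quartile
n_samples = 64
bin_width = (2 * iqr) / (n_samples ** (1 / 3))    # ~75.0
bins = max((2000 - 100) / bin_width, 5)           # (max - min frequency) / width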
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the
    # metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this
    # visualizer displays separate warnings for non-categorical columns,
    # and categorical columns that didn't satisfy the requirements of the
    # statistics being computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)
            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str, alpha_diversity: pd.Series, metadata: qiime2.Metadata, method: str = 'spearman') -> None: try: alpha_correlation_fn = _alpha_correlation_fns[method] except KeyError: raise ValueError('Unknown alpha correlation method %s. The available ' 'options are %s.' % (method, ', '.join(_alpha_correlation_fns.keys()))) # Filter metadata to only include IDs present in the alpha diversity data. # Also ensures every alpha diversity ID is present in the metadata. metadata = metadata.filter_ids(alpha_diversity.index) pre_filtered_cols = set(metadata.columns) metadata = metadata.filter_columns(column_type='numeric', drop_all_missing=True) filtered_columns = pre_filtered_cols - set(metadata.columns) if len(metadata.columns) == 0: raise ValueError( "Metadata contains only non-numeric or empty columns. This " "visualizer requires at least one numeric metadata column to " "execute.") filenames = [] for column in metadata.columns: metadata_column = metadata.get_column(column) metadata_column = metadata_column.drop_missing_values() # create a dataframe containing the data to be correlated, and drop # any samples that have no data in either column df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1, join='inner') # compute correlation correlation_result = alpha_correlation_fn(df[metadata_column.name], df[alpha_diversity.name]) warning = None if alpha_diversity.shape[0] != df.shape[0]: warning = { 'initial': alpha_diversity.shape[0], 'method': method.title(), 'filtered': df.shape[0] } escaped_column = quote(column) filename = 'column-%s.jsonp' % escaped_column filenames.append(filename) with open(os.path.join(output_dir, filename), 'w') as fh: fh.write("load_data('%s'," % column) df.to_json(fh, orient='split') fh.write(",") json.dump(warning, fh) fh.write(",") json.dump( { 'method': method.title(), 'testStat': '%1.4f' % correlation_result[0], 'pVal': '%1.4f' % correlation_result[1], 'sampleSize': df.shape[0] }, fh) fh.write(");") index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html') q2templates.render(index, output_dir, context={ 'columns': [quote(fn) for fn in filenames], 'filtered_columns': ', '.join(sorted(filtered_columns)) }) shutil.copytree( os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'), os.path.join(output_dir, 'dist'))
def cast_metadata(paths, cast, output_file, ignore_extra, error_on_missing): import tempfile from qiime2 import Metadata, metadata md = _merge_metadata(paths) cast_dict = {} try: for casting in cast: if ':' not in casting: raise click.BadParameter( message=f'Missing `:` in --cast {casting}', param_hint='cast') splitter = casting.split(':') if len(splitter) != 2: raise click.BadParameter( message=f'Incorrect number of fields in --cast {casting}.' f' Observed {len(splitter)}' f' {tuple(splitter)}, expected 2.', param_hint='cast') col, type_ = splitter if col in cast_dict: raise click.BadParameter( message=(f'Column name "{col}" appears in cast more than' ' once.'), param_hint='cast') cast_dict[col] = type_ except Exception as err: header = \ ('Could not parse provided cast arguments into unique COLUMN:TYPE' ' pairs. Please make sure all cast flags are of the format --cast' ' COLUMN:TYPE') q2cli.util.exit_with_error(err, header=header) types = set(cast_dict.values()) if not types.issubset(_COLUMN_TYPES): raise click.BadParameter( message=('Unknown column type provided. Please make sure all' ' columns included in your cast contain a valid column' ' type. Valid types: %s' % (', '.join(_COLUMN_TYPES))), param_hint='cast') column_names = set(md.columns.keys()) cast_names = set(cast_dict.keys()) if not ignore_extra: if not cast_names.issubset(column_names): cast = cast_names.difference(column_names) raise click.BadParameter( message=('The following cast columns were not found' ' within the metadata: %s' % (', '.join(cast))), param_hint='cast') if error_on_missing: if not column_names.issubset(cast_names): cols = column_names.difference(cast_names) raise click.BadParameter( message='The following columns within the metadata' ' were not provided in the cast: %s' % (', '.join(cols)), param_hint='cast') # Remove entries from the cast dict that are not in the metadata to avoid # errors further down the road for cast in cast_names: if cast not in column_names: cast_dict.pop(cast) with tempfile.NamedTemporaryFile() as temp: md.save(temp.name) try: cast_md = Metadata.load(temp.name, cast_dict) except metadata.io.MetadataFileError as e: raise click.BadParameter(message=e, param_hint='cast') from e if output_file: cast_md.save(output_file) else: with tempfile.NamedTemporaryFile(mode='w+') as stdout_temp: cast_md.save(stdout_temp.name) stdout_str = stdout_temp.read() click.echo(stdout_str)
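# Sketch of the cast mechanism in cast_metadata above (the file name and
# columns are hypothetical): qiime2.Metadata.load accepts a column_types
# mapping, which is what the parsed COLUMN:TYPE pairs are ultimately
# passed to.
from qiime2 import Metadata

column_types = {'ph': 'numeric', 'body_site': 'categorical'}
# md = Metadata.load('sample-metadata.tsv', column_types)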
def _4(data: qiime2.Metadata) -> QiitaMetadataFormat:
    ff = QiitaMetadataFormat()
    md_df = data.to_dataframe()
    with ff.open() as fh:
        md_df.to_csv(fh, sep='\t', header=True)
    return ff
def draw_interactive_map(output_dir: str, metadata: qiime2.Metadata, column: str = None, latitude: str = 'Latitude', longitude: str = 'Longitude', color_palette: str = 'rainbow', discrete: bool = False, missing_data: str = 'error'): metadata = _load_and_validate( metadata, [column, latitude, longitude], ['column', 'latitude', 'longitude'], missing_data) lat_0, lat_1, lon_0, lon_1 = get_max_extent( metadata[latitude], metadata[longitude]) loc_min, loc_max = [lon_0, lat_0], [lon_1, lat_1] cmap = plt.get_cmap(color_palette) data = [] # If column is numeric, color points by column if np.issubdtype(metadata[column].dtype, np.number) and not discrete: metadata[column] = metadata[column].astype(float) normalize = mcolors.Normalize( vmin=metadata[column].min(), vmax=metadata[column].max()) scalarmappaple = cm.ScalarMappable( norm=normalize, cmap=cmap) scalarmappaple.set_array(metadata[column]) fig, ax = plt.subplots() plt.colorbar(scalarmappaple).set_label(column) ax.remove() metadata.sort_values(by=column, ascending=False, inplace=True) for i, row in metadata.iterrows(): data.append({ 'sample_id': i, column: row[column], 'latitude': row[latitude], 'longitude': row[longitude], 'color': mcolors.to_hex(scalarmappaple.to_rgba(row[column])) }) # if column is not numeric, color discretely else: groups = metadata[column].unique() len_groups = len(groups) colors = {g: mcolors.to_hex(c) for g, c in zip( groups, cmap(np.linspace(0, 1, len(groups))))} for i, row in metadata.iterrows(): data.append({ 'sample_id': i, column: row[column], 'latitude': row[latitude], 'longitude': row[longitude], 'color': colors[row[column]] }) fig = plt.figure(figsize=[len_groups * 0.05, len_groups/2]) ax = fig.add_axes([0, 0, 1, 1]) for idx, (g, color) in enumerate(colors.items()): r = mpatch.Rectangle((0, idx), 1, 1, color=color) _ = ax.text(2, idx+.5, ' %s' % g, va='center', fontsize=10) ax.add_patch(r) ax.axhline(idx, color='k') ax.set_xlim(0, 3) ax.set_ylim(0, idx + 2) ax.axis('off') save_animated_map(output_dir, loc_min, loc_max, data, column)
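# Minimal sketch of the numeric-column coloring used in
# draw_interactive_map above: a Normalize maps values into [0, 1] and a
# ScalarMappable turns them into RGBA, which is then hex-encoded per
# sample. The values here are arbitrary.
import matplotlib.cm as cm
import matplotlib.colors as mcolors

norm = mcolors.Normalize(vmin=0.0, vmax=10.0)
mappable = cm.ScalarMappable(norm=norm, cmap='rainbow')
hex_color = mcolors.to_hex(mappable.to_rgba(7.5))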
def import_dada2_stats_df_to_q2(df):
    combined_artifact = Artifact.import_data("SampleData[DADA2Stats]",
                                             Metadata(df))
    return combined_artifact
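# Hypothetical usage of import_dada2_stats_df_to_q2 above: the index must
# carry a recognized sample-ID header (e.g. 'sample-id') for
# qiime2.Metadata to accept the frame. The counts are made up.
import pandas as pd

stats_df = pd.DataFrame({'input': [1000, 1200], 'filtered': [900, 1100]},
                        index=pd.Index(['S1', 'S2'], name='sample-id'))
# stats_artifact = import_dada2_stats_df_to_q2(stats_df)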
def ancombc(
    table: pd.DataFrame,
    metadata: qiime2.Metadata,
    formula: str,
    p_adj_method: str = "holm",
    zero_cut: float = 0.90,
    lib_cut: int = 1000,
    group: str = None,
    struc_zero: bool = True,
    neg_lb: bool = True,
    tol: float = 1e-5,
    max_iter: int = 100,
    conserve: bool = True,
    alpha: float = 0.05,
    # global_test : bool = True
) -> pd.DataFrame:
    # create a dataframe from the metadata
    meta = metadata.to_dataframe()

    # checks for variable lengths and warns if there's only one value per
    # group. ANCOM will fail silently later because of it and that's harder
    # to debug
    variables = np.unique(np.hstack([x.split("*")
                                     for x in formula.split("+")]))
    variables = np.array([x.strip() for x in variables])
    var_counts = pd.DataFrame.from_dict(orient='index', data={
        var: {'n_groups': len(meta[var].dropna().unique())}
        for var in variables})
    if (var_counts['n_groups'] < 2).all():
        raise ValueError("None of the columns in the metadata satisfy "
                         "ANCOM-BC's requirements. All columns in the "
                         "formula should have more than one value.")

    # filter the metadata so only the samples present in the table are used
    # this also reorders it for the correct condition selection
    # it has to be reordered for ancombc to correctly input the conditions
    meta = meta.loc[list(table.index)]

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.biom.tsv')
        meta_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        # Need to manually specify header=True for Series (i.e. "meta").
        # It's already the default for DataFrames (i.e. "table"), but we
        # manually specify it here anyway to alleviate any potential
        # confusion.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(meta_fp, sep='\t', header=True)

        if group is None:
            group = formula

        cmd = [
            'run_ancombc.R',
            biom_fp,                   # inp.abundances.path
            meta_fp,                   # inp.metadata.path
            formula,                   # formula
            p_adj_method,              # p_adj_method
            zero_cut,                  # zero_cut
            lib_cut,                   # lib_cut
            group,                     # group
            str(struc_zero).upper(),   # struc_zero
            str(neg_lb).upper(),       # neg_lb
            tol,                       # tol
            max_iter,                  # max_iter
            str(conserve).upper(),     # conserve
            alpha,                     # alpha
            'FALSE',                   # global -- temporary until better
                                       # understood
            # str(global_test).upper(),  # global
            summary_fp                 # output
        ]
        cmd = list(map(str, cmd))

        try:
            global_test = run_commands([cmd])
            # TODO: not sure what to do about the `global_test` statistic;
            # may need another custom q2 type for this...
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ANCOMBC"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)
        # del summary['diff_abn']  # remove this field for now ...
        # summary.index.name = "featureid"
        return summary
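# Worked example (standalone, illustrative) of the formula parsing at the
# top of ancombc above: splitting on '+' and then '*' recovers the unique
# variable names from an R-style formula.
import numpy as np

formula = 'diet + sex * age'
variables = np.unique(np.hstack([x.split('*') for x in formula.split('+')]))
variables = np.array([x.strip() for x in variables])
# variables now holds the stripped names 'age', 'sex', and 'diet'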
def ordinate(table, metadata=None, metric='jaccard', sampling_depth=-1, phylogeny=None, number_of_dimensions=None, biplot=False): """Perform ordination using principal coordinate analysis (PCoA). This method wraps multiple QIIME 2 methods to perform ordination and returns Artifact object containing PCoA results. Under the hood, this method filters the samples (if requested), performs rarefying of the feature table (if requested), computes distance matrix, and then runs PCoA. By default, the method returns PCoAResults. For creating a biplot, use `biplot=True` which returns PCoAResults % Properties('biplot'). Parameters ---------- table : str or qiime2.Artifact Artifact file or object corresponding to FeatureTable[Frequency]. metadata : str or qiime2.Metadata, optional Metadata file or object. All samples in 'metadata' that are also in the feature table will be retained. metric : str, default: 'jaccard' Metric used for distance matrix computation ('jaccard', 'bray_curtis', 'unweighted_unifrac', or 'weighted_unifrac'). sampling_depth : int, default: -1 If negative, skip rarefying. If 0, rarefy to the sample with minimum depth. Otherwise, rarefy to the provided sampling depth. phylogeny : str, optional Rooted tree file. Required if using 'unweighted_unifrac', or 'weighted_unifrac' as metric. number_of_dimensions : int, optional Dimensions to reduce the distance matrix to. biplot : bool, default: False If true, return PCoAResults % Properties('biplot'). Returns ------- qiime2.Artifact Artifact object corresponding to PCoAResults or PCoAResults % Properties('biplot'). See Also -------- beta_2d_plot beta_3d_plot beta_scree_plot beta_parallel_plot Notes ----- The resulting Artifact object can be directly used for plotting. Examples -------- Below is a simple example. Note that the default distance metric used is ``jaccard``. The resulting object ``pcoa`` can be directly used for plotting by the ``dokdo.beta_2d_plot`` method as shown below. >>> table_file = f'{data_dir}/moving-pictures-tutorial/table.qza' >>> metadata_file = f'{data_dir}/moving-pictures-tutorial/sample-metadata.tsv' >>> pcoa_results = dokdo.ordinate(table_file) >>> dokdo.beta_2d_plot(pcoa_results, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True)) >>> plt.tight_layout() .. image:: images/ordinate-1.png You can choose a subset of samples. >>> from qiime2 import Metadata >>> mf = dokdo.get_mf(metadata_file) >>> mf = mf[mf['body-site'].isin(['gut', 'left palm'])] >>> pcoa_results = dokdo.ordinate(table_file, metadata=Metadata(mf)) >>> dokdo.beta_2d_plot(pcoa_results, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True)) >>> plt.tight_layout() .. image:: images/ordinate-2.png You can also generate a biplot. >>> pcoa_results = dokdo.ordinate(table_file, biplot=True, number_of_dimensions=10) >>> fig, ax = plt.subplots(1, 1, figsize=(8, 8)) >>> dokdo.beta_2d_plot(pcoa_results, ax=ax, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True)) >>> dokdo.addbiplot(pcoa_results, ax=ax, count=7) >>> plt.tight_layout() .. image:: images/ordinate-3.png """ if isinstance(table, Artifact): table = table elif isinstance(table, str): table = Artifact.load(table) else: raise TypeError(f"Incorrect feature table type: {type(table)}") # If metadata is provided, perform sample filtration. 
if metadata is not None: if isinstance(metadata, Metadata): _metadata = metadata else: _metadata = Metadata.load(metadata) _table = feature_table.methods.filter_samples( table=table, metadata=_metadata).filtered_table else: _table = table # Perform rarefying. if sampling_depth < 0: rarefied_table = _table else: if sampling_depth == 0: sampling_depth = int(_table.view(pd.DataFrame).sum(axis=1).min()) rarefy_result = feature_table.methods.rarefy( table=_table, sampling_depth=sampling_depth) rarefied_table = rarefy_result.rarefied_table if metric == 'jaccard': distance_matrix_result = diversity_lib.methods.jaccard( table=rarefied_table) elif metric == 'bray_curtis': distance_matrix_result = diversity_lib.methods.bray_curtis( table=rarefied_table) elif metric == 'unweighted_unifrac': distance_matrix_result = diversity_lib.methods.unweighted_unifrac( table=rarefied_table, phylogeny=Artifact.load(phylogeny)) elif metric == 'weighted_unifrac': distance_matrix_result = diversity_lib.methods.weighted_unifrac( table=rarefied_table, phylogeny=Artifact.load(phylogeny)) else: raise ValueError(f"Incorrect metric detected: {metric}") distance_matrix = distance_matrix_result.distance_matrix result_obj = diversity.methods.pcoa( distance_matrix=distance_matrix, number_of_dimensions=number_of_dimensions) pcoa_results = result_obj.pcoa if biplot: rf_result = feature_table.methods.relative_frequency(table=_table) rf_table = rf_result.relative_frequency_table result_obj = diversity.methods.pcoa_biplot(pcoa=pcoa_results, features=rf_table) pcoa_results = result_obj.biplot return pcoa_results
def _15(df: MappingDirectoryFormat) -> Metadata:
    d = df.mapping.view(dict)
    return Metadata(pd.DataFrame(d, index=pd.Index(["0"], name='id')))
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups
            # and correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise,
                columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)
                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                                      "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories': ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def qarcoal( table: biom.Table, taxonomy: pd.DataFrame, num_string: str, denom_string: str, samples_to_use: Metadata = None, allow_shared_features: bool = False, ) -> pd.DataFrame: """Calculate sample-wise log-ratios of features based on taxonomy. Parameters: ----------- table: biom file with which to calculate log ratios taxonomy: pd.DataFrame with taxonomy information (should have Taxon column in which features will be searched) num_string: numerator string to search for in taxonomy denom_string: denominator string to search for in taxonomy samples_to_use: Q2 Metadata file with samples to use. If provided, feature table will be filtered to only consider samples present in this file. (optional) allow_shared_features: bool denoting handling of shared features between numerator and denominator. If False, an error is raised if features are shared between numerator and denominator. If True, will allow shared features without throwing an error. Returns: -------- comparison_df: pd DataFrame in the form: Sample-ID Num_Sum Denom_Sum log_ratio S1 7 15 -0.762140 """ # biom table is features x samples if samples_to_use is not None: filt_samples = set(samples_to_use.to_dataframe().index) feat_table = table.filter(filt_samples, axis="sample", inplace=False) feat_table = feat_table.to_dataframe() else: feat_table = table.to_dataframe() # raise error if there are any negative counts in the feature table if feat_table.lt(0).any().any(): raise ValueError("Feature table has negative counts!") tax_num_df, tax_denom_df = filter_and_join_taxonomy( feat_table, taxonomy, num_string, denom_string, ) # if shared features are disallowed, check to make sure they don't occur # if allowed, can skip this step at user's risk if not allow_shared_features: shared_features = set(tax_num_df.index) & set(tax_denom_df.index) if shared_features: raise ValueError("Shared features between num and denom!") tax_num_sample_sum = tax_num_df.sum(axis=0) tax_denom_sample_sum = tax_denom_df.sum(axis=0) comparison_df = pd.DataFrame.from_records( [tax_num_sample_sum, tax_denom_sample_sum], index=["Num_Sum", "Denom_Sum"], ).T comparison_df["log_ratio"] = comparison_df.apply( lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1 ) comparison_df.index.name = "Sample-ID" return comparison_df
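# Hedged usage sketch for qarcoal above; the table, taxonomy strings, and
# search terms are all made up.
import biom
import numpy as np
import pandas as pd

table = biom.Table(np.array([[5.0, 2.0], [3.0, 8.0]]),
                   observation_ids=['F1', 'F2'],
                   sample_ids=['S1', 'S2'])
taxonomy = pd.DataFrame({'Taxon': ['g__Alpha', 'g__Beta']},
                        index=pd.Index(['F1', 'F2'], name='Feature ID'))
# log_ratios = qarcoal(table, taxonomy, num_string='g__Alpha',
#                      denom_string='g__Beta')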
def md1_factory():
    return Metadata(pd.DataFrame({'a': ['1', '2', '3']},
                                 index=pd.Index(['0', '1', '2'], name='id')))
ZebraFilter(.9999, "../zebra.csv"), ]) return MultiFactory([ woltka_levels, zebra, ]) if __name__ == "__main__": # Pretend all scripts are run from root of repo for file paths. import os os.chdir("..") factory = configure() # SavePreprocessedTables().run(factory) metadata = Metadata.load(metadata_filepath) print(metadata) for config in factory.gen_configurations(): print(config.analysis_name) path = "tables/"+config.analysis_name+"_train.qza" table = Artifact.load(path) alpha_result = alpha(table=table, metric='shannon') alpha_diversity = alpha_result.alpha_diversity series = alpha_diversity.view(pd.Series) print(series) # Argh, how do I tell it what I care about? # vis_alpha = alpha_group_significance(alpha_diversity, metadata)
def md2_factory():
    return Metadata(pd.DataFrame({'b': ['4', '5', '6']},
                                 index=pd.Index(['0', '1', '2'], name='id')))
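# The two factories above produce Metadata with identical IDs and disjoint
# columns, so they can be merged; a brief sketch of how such fixtures are
# typically combined (merge is part of the qiime2.Metadata API):
merged = md1_factory().merge(md2_factory())
# merged now has columns 'a' and 'b' for ids '0', '1', '2'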
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int, phylogeny: skbio.TreeNode = None, metrics: set = None, metadata: qiime2.Metadata = None, min_depth: int = 1, steps: int = 10, iterations: int = 10) -> None: if metrics is None: metrics = {'observed_otus', 'shannon'} if phylogeny is not None: metrics.add('faith_pd') elif not metrics: raise ValueError('`metrics` was given an empty set.') else: phylo_overlap = phylogenetic_metrics() & metrics if phylo_overlap and phylogeny is None: raise ValueError('Phylogenetic metric %s was requested but ' 'phylogeny was not provided.' % phylo_overlap) if max_depth <= min_depth: raise ValueError('Provided max_depth of %d must be greater than ' 'provided min_depth of %d.' % (max_depth, min_depth)) possible_steps = max_depth - min_depth if possible_steps < steps: raise ValueError('Provided number of steps (%d) is greater than the ' 'steps possible between min_depth and ' 'max_depth (%d).' % (steps, possible_steps)) if table.is_empty(): raise ValueError('Provided table is empty.') max_frequency = max(table.sum(axis='sample')) if max_frequency < max_depth: raise ValueError('Provided max_depth of %d is greater than ' 'the maximum sample total frequency of the ' 'feature_table (%d).' % (max_depth, max_frequency)) if metadata is None: columns, filtered_columns = set(), set() else: # Filter metadata to only include sample IDs present in the feature # table. Also ensures every feature table sample ID is present in the # metadata. metadata = metadata.filter_ids(table.ids(axis='sample')) # Drop metadata columns that aren't categorical, or consist solely of # missing values. pre_filtered_cols = set(metadata.columns) metadata = metadata.filter_columns(column_type='categorical', drop_all_missing=True) filtered_columns = pre_filtered_cols - set(metadata.columns) metadata_df = metadata.to_dataframe() metadata_df.columns = pd.MultiIndex.from_tuples([ (c, '') for c in metadata_df.columns ]) columns = metadata_df.columns.get_level_values(0) data = _compute_rarefaction_data(table, min_depth, max_depth, steps, iterations, phylogeny, metrics) filenames = [] for m, data in data.items(): metric_name = quote(m) filename = '%s.csv' % metric_name if metadata is None: n_df = _compute_summary(data, 'sample-id') jsonp_filename = '%s.jsonp' % metric_name _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name, n_df, '') filenames.append(jsonp_filename) else: merged = data.join(metadata_df, how='left') for column in columns: column_name = quote(column) reindexed_df, counts = _reindex_with_metadata( column, columns, merged) c_df = _compute_summary(reindexed_df, column, counts=counts) jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name) _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name, c_df, column) filenames.append(jsonp_filename) with open(os.path.join(output_dir, filename), 'w') as fh: data.columns = [ 'depth-%d_iter-%d' % (t[0], t[1]) for t in data.columns.values ] if metadata is not None: data = data.join(metadata.to_dataframe(), how='left') data.to_csv(fh, index_label=['sample-id']) index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html') q2templates.render(index, output_dir, context={ 'metrics': list(metrics), 'filenames': [quote(f) for f in filenames], 'columns': list(columns), 'steps': steps, 'filtered_columns': sorted(filtered_columns) }) shutil.copytree( os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'), os.path.join(output_dir, 'dist'))
print("%s sample pairs matched together" % (len(case_to_control_match.keys()))) for key in case_to_control_match: key_value = case_to_control_match[key] matchDF.at[key, "matched_to"] = str(key_value) matchDF.at[key_value, "matched_to"] = str(key) else: print("%s cases matched" % (len(case_dictionary.keys()))) for case in case_dictionary: for control in case_dictionary[case]: if control in control_dictionary: control_dictionary[control].append(case) else: control_dictionary[control] = [case] matchDF.at[case, "matched_to"] = ", ".join(sorted(case_dictionary[case])) for control in control_dictionary: matchDF.at[control, "matched_to"] = ", ".join( sorted(control_dictionary[control])) matchedMD = Metadata(matchDF) if only_matches: ids = matchedMD.get_ids("matched_to NOT IN ('none')") #shrinks the MD to only have matched samples matchedMD = matchedMD.filter_ids(ids) return matchedMD
def setUp(self): super().setUp() # setup taxonomy to be edited tax_fp = self.get_data_path('escherichia_shigella_taxonomy.txt') self.taxonomy = TSVTaxonomyFormat(tax_fp, mode='r').view(pd.Series) # setup full string replacement replc = self.get_data_path('taxonomy-replacement-full-strings.txt') md_replc = Metadata.load(replc) self.md_replc_col = md_replc.get_column('replacements') # setup substring replacement ss_replc = self.get_data_path('taxonomy-replacement-pass.txt') md_ss_replc = Metadata.load(ss_replc) self.md_ss_replc_col = md_ss_replc.get_column('replacements') # setup substring regex replacement ssr_replc = self.get_data_path('taxonomy-replacement-regex.txt') md_ssr_replc = Metadata.load(ssr_replc) self.md_ssr_replc_col = md_ssr_replc.get_column('replacements') # setup reusable dicts self.exp_dict_00 = { 'Sal01': ('d__SUPER_DUPER_BACTERIA; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'Sal02': ('d__SUPER_DUPER_BACTERIA; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'UncultSal': ('d__SUPER_DUPER_BACTERIA; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__uncultured_Salmonella'), 'Esch01': ('d__SUPER_DUPER_BACTERIA; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; s__'), 'Shig01': ('d__SUPER_DUPER_BACTERIA; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; s__') } self.exp_dict_01 = { 'Sal01': ('d__Bacteria; p__LAME-PYHLA; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'Sal02': ('d__Bacteria; p__LAME-PYHLA; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'UncultSal': ('d__Bacteria; ' 'p__LAME-PYHLA; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__UNCIVILIZED_Salmonella'), 'Esch01': ('d__Bacteria; p__LAME-PYHLA; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; s__'), 'Shig01': ('d__Bacteria; p__LAME-PYHLA; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; s__') } self.exp_dict_02 = { 'Sal01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'Sal02': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'UncultSal': ('d__Bacteria; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__uncultured_Salmonella'), 'Esch01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__unknown_Escherichia-Shigella'), 'Shig01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__unknown_Escherichia-Shigella') } 
self.exp_dict_03 = { 'Sal01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'Sal02': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__Salmonella_enterica'), 'UncultSal': ('d__Bacteria; ' 'p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__UNCIVIL_Salmonella'), 'Esch01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__unknown_Escherichia-Shigella'), 'Shig01': ('d__Bacteria; p__Proteobacteria; ' 'c__Gammaproteobacteria; ' 'o__Enterobacterales; ' 'f__Enterobacteriaceae; ' 'g__Escherichia-Shigella; ' 's__unknown_Escherichia-Shigella') }
def determine_cases_and_controls(afterExclusion_MD, query_line_dict, extra):
    '''
    Determines what samples are cases or controls using the queries in
    query_line_dict. The labels of each sample are stored in case_controlDF

    Parameters
    ----------
    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out

    query_line_dict : dict of arrays of strings
        there are two sub arrays
        the first array is made of queries to determine controls
        the second array is made of queries to determine cases

    extra : boolean
        Tells function whether to shrink metadata in one step or in
        multiple steps with extra print statements that show how many
        potential case or control samples are left after each query

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a
        case_control column that reflects if the index is a case, control,
        or Undefined

    Raises
    ------
    ValueError
        If the input file of sql commands for determining case and controls
        is empty
    '''
    ids = afterExclusion_MD.get_ids()
    case_control_Series = pd.Series(["Unspecified"] * len(ids), ids)
    case_control_Series.index.name = afterExclusion_MD.id_header
    case_controlDF = case_control_Series.to_frame("case_control")
    print("Metadata Object has %s samples" % (afterExclusion_MD.id_count))

    for key in query_line_dict:
        if key != "case" and key != "control":
            print("Wrong key used for query. Must be 'case' or 'control'.")
            continue

        # resets shrunk_MD so that filtering down to control samples does
        # not influence filtering down to case
        shrunk_MD = afterExclusion_MD
        # get query and filter down to control or case samples based on key
        query_lines = query_line_dict[key]
        if len(query_lines) < 1:
            raise ValueError("The %s query file is empty" % (key))
        if extra:
            for line in query_lines:
                initial_size = shrunk_MD.id_count
                ids = shrunk_MD.get_ids(line)
                shrunk_MD = shrunk_MD.filter_ids(ids)
                print(line)
                print("\tFilters down number of potential %s samples left "
                      "to %s" % (key, shrunk_MD.id_count))
        else:
            ids = shrunk_MD.get_ids(' AND '.join(query_lines))
            shrunk_MD = shrunk_MD.filter_ids(ids)
        print("Final number of %s samples is %s"
              % (key, shrunk_MD.id_count))

        # replaces the "Unspecified" values for the samples found above
        # with case or control
        ids = shrunk_MD.ids
        case_controlDF.loc[ids, "case_control"] = key

    # turns case_controlDF into a metadata object
    case_controlMD = Metadata(case_controlDF)
    # merges afterExclusion_MD and case_controlMD into one new metadata
    # object
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)
    return mergedMD
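# Hypothetical query_line_dict for determine_cases_and_controls above; the
# queries use the SQLite WHERE syntax accepted by Metadata.get_ids(), and
# the column/values are made up.
query_line_dict = {
    'case': ["diagnosis IN ('IBD')"],
    'control': ["diagnosis IN ('healthy')"],
}
# merged_md = determine_cases_and_controls(after_exclusion_md,
#                                          query_line_dict, extra=False)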
def _13(ff: ReconSummaryFormat) -> Metadata:
    return Metadata.load(str(ff))
def matcher(prepped_for_match_MD, conditions_for_match_lines, one_to_one,
            only_matches):
    '''
    Matches case samples to controls and records, in a matched_to column,
    the IDs of the samples each case or control was matched with.

    Parameters
    ----------
    prepped_for_match_MD : Metadata object
        Samples that do not have valid entries for columns that determine
        matching are removed. Everything else is the same as merged_MD.
    conditions_for_match_lines : array of strings
        Contains information on what conditions must be met to constitute a
        match
    one_to_one : boolean
        Determines if matching is done one-to-one using stable marriage
    only_matches : boolean
        Determines if the returned metadata object should only include
        samples with matches

    Returns
    -------
    matchedMD : Metadata object
        Metadata based on the samples in prepped_for_match_MD, with matches
        recorded in a column called matched_to. Values in matched_to are the
        sample IDs of the sample(s) each sample was matched to.

    Raises
    ------
    KeyError
        If conditions_for_match_lines tells the program to match based on a
        column that does not exist in the metadata
    ValueError
        If conditions_for_match_lines has a range value that can not be
        converted to a float, or if, for a column matched on using a range,
        a control or case sample in prepped_for_match_MD has a value that
        can not be converted to a float
    '''
    case_dictionary = {}
    control_dictionary = {}
    control_match_count_dictionary = {}
    case_match_count_dictionary = {}
    matchDF = prepped_for_match_MD.to_dataframe()
    case_for_matchDF = matchDF[matchDF["case_control"].isin(["case"])]
    # creates a column to record matches; samples without a match keep the
    # placeholder value 'none'
    matchDF["matched_to"] = 'none'
    # loops through case samples and matches them to controls
    for case_index, case_row in case_for_matchDF.iterrows():
        # set controlDF to be only the samples of matchDF that are controls
        controlDF = matchDF[matchDF["case_control"].isin(["control"])]
        if controlDF.size == 0:
            return Metadata(matchDF)
        # loop through input columns to determine matches
        for conditions in conditions_for_match_lines:
            column_name = conditions.split("\t")[1].strip()
            try:
                matchDF[column_name]
            except KeyError:
                raise KeyError("Column %s not found in your input data. "
                               "Correct this error in your --match file"
                               % (column_name))
            # get the type of data for the given column; this determines how
            # a match is made
            if conditions.split("\t")[0] == "range":
                num = conditions.split("\t")[2].strip()
                try:
                    row_num = float(case_row[column_name])
                except (ValueError, TypeError):
                    raise ValueError("column %s contains a string that can "
                                     "not be converted to a numerical value"
                                     % (column_name))
                try:
                    fnum = float(num)
                except (ValueError, TypeError):
                    raise ValueError("input number for condition %s is not "
                                     "a valid number" % (column_name))
                try:
                    nums_in_column = pd.to_numeric(controlDF[column_name])
                except ValueError:
                    raise ValueError("column %s contains a string that can "
                                     "not be converted to a numerical value"
                                     % (column_name))
                # filters out controls whose value is not within the given
                # distance from the case's value
                controlDF = controlDF[(nums_in_column >= (row_num - fnum)) &
                                      (nums_in_column <= (row_num + fnum))]
            else:
                # filters out controls whose string value does not match the
                # case's value
                controlDF = controlDF[controlDF[column_name].isin(
                    [case_row[column_name]])]
        if controlDF.index.values.size > 0:
            case_dictionary.update({case_index: controlDF.index.values})
            case_match_count_dictionary.update(
                {case_index: (controlDF.index.values.size)})
            for id_control in controlDF.index:
                if id_control not in control_match_count_dictionary:
                    control_match_count_dictionary.update({id_control: 0})
                control_match_count_dictionary.update(
                    {id_control:
                     (control_match_count_dictionary[id_control] + 1)})
    if one_to_one:
        stable = Stable_Marriage()
        case_to_control_match = stable.stableMarriageRunner(
            case_dictionary, control_match_count_dictionary,
            case_match_count_dictionary)
        print("%s sample pairs matched together"
              % (len(case_to_control_match.keys())))
        for key in case_to_control_match:
            key_value = case_to_control_match[key]
            matchDF.at[key, "matched_to"] = str(key_value)
            matchDF.at[key_value, "matched_to"] = str(key)
    else:
        print("%s cases matched" % (len(case_dictionary.keys())))
        for case in case_dictionary:
            for control in case_dictionary[case]:
                if control in control_dictionary:
                    control_dictionary[control].append(case)
                else:
                    control_dictionary[control] = [case]
            matchDF.at[case, "matched_to"] = ", ".join(
                sorted(case_dictionary[case]))
        for control in control_dictionary:
            matchDF.at[control, "matched_to"] = ", ".join(
                sorted(control_dictionary[control]))
    # per the docstring, only_matches limits the result to samples that were
    # matched to something before returning the Metadata object
    if only_matches:
        matchDF = matchDF[matchDF["matched_to"] != 'none']
    return Metadata(matchDF)
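# The tab-separated condition lines matcher() parses look like the following
# hypothetical example: the first field selects "range" or exact matching
# (any value other than "range" means exact string match), the second names
# the metadata column, and for "range" a third numeric field gives the
# allowed distance from the case's value.
conditions_for_match_lines = [
    "exact\tsex",     # control must have the same sex as the case
    "range\tage\t5",  # control age must be within +/- 5 of the case age
]
# matched = matcher(prepped_for_match_MD, conditions_for_match_lines,
#                   one_to_one=True, only_matches=True)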
def _15(df: MappingDirectoryFormat) -> Metadata:
    d = df.mapping.view(dict)
    return Metadata(pd.DataFrame(d, index=["0"]))
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in
        # the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely
        # of missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth, steps,
                                     iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
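# Toy invocation sketch for alpha_rarefaction; sample/feature IDs and counts
# are made up, and the private helpers (_compute_rarefaction_data, etc.) are
# assumed to be in scope via the module import. QIIME 2 normally provides
# the output directory; a temporary one stands in for it here.
import tempfile
import numpy as np
import biom

toy_table = biom.Table(np.array([[0, 11, 11], [13, 11, 11]]),
                       ['O1', 'O2'], ['S1', 'S2', 'S3'])
with tempfile.TemporaryDirectory() as toy_output_dir:
    alpha_rarefaction(toy_output_dir, toy_table, max_depth=20)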
def simple_plot(output_dir, table: biom.Table, feature_tree: skbio.TreeNode,
                metadata: q2.Metadata, case_where: str, control_where: str,
                n_transects: int = 10, stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))
    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())
        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)
        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(
            os.path.join(layer_dir, 'T_%s.h.png' % distance),
            transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'),
                   transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')
    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(template.render({
            'legend': list(zip(
                ['original'] + ['T_%s' % d for d in transects] +
                ['trajectory'],
                list(map(to_hex, colors)) + ['red'])),
            'filenames': rank_files
        }))
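# simple_plot drives prepare_plot as a generator-based co-routine: it is
# primed with send(None), then each send((points, distance)) yields figures,
# and send((None, None)) requests the final figure. A simplified sketch of
# that protocol (hypothetical; the real prepare_plot yields matplotlib
# figures and a background layer afterwards):
def prepare_plot_sketch(tree_length):
    received = yield  # primed by the initial send(None)
    while received != (None, None):
        points, distance = received
        received = yield "figure for transect %s of %s" % (distance,
                                                           tree_length)
    yield "final trajectory figure"

gen = prepare_plot_sketch(10.0)
gen.send(None)                   # initialize co-routine
print(gen.send(([1, 2], 2.5)))   # figure for transect 2.5 of 10.0
print(gen.send((None, None)))    # final trajectory figure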
if args.regions is None or args.regions == 'None':
    if regions:
        args.regions = regions
    else:
        raise ValueError('Failed to identify correctly named regions '
                         '(V* / ITS*)')
else:
    args.regions = args.regions.split(',')

for r in args.regions:
    if r not in primers:
        raise ValueError('libprepkit: {} does not support region: '
                         '{}'.format(args.libprep, r))
    if primers[r] not in available_classifiers(args.classifier_dir,
                                               level=args.classifier_level):
        raise ValueError('prebuilt classifier dir: {} does not contain '
                         'region: {}'.format(args.classifier_dir, r))

write_message('loading sample info')
samples = Metadata.load(os.path.abspath(args.sample_info))

write_message('starting demultiplex fastq files')
# adata key: region, value: SampleData[PairedEndSequencesWithQuality] artifact
adata = demultiplex_manifests(args.input, primers, args.regions,
                              split_on_header=True, threads=args.threads)
write_message('completed demultiplex fastq files')

write_message('starting read count of fastq files')
counts, merged_counts = sequence_counts(adata,
                                        min_count=args.filter_region_count)
# filter regions with too few reads
for k in list(adata.keys()):
    if k not in counts:
        del adata[k]
write_message('completed read count')

DADA2_PARAMS = dada2_denoise_params(args.libprep_config, args.libprep)
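# Hypothetical shapes for the objects the block above consumes; the real
# values come from the libprep-kit configuration and from auto-detecting
# region names in the input files.
example_primers = {
    'V4': '515F-806R',        # region name -> primer pair / classifier key
    'ITS1': 'ITS1F-ITS2R',
}
example_regions = ['V4', 'ITS1']  # regions detected from input file names
# args.regions arrives as a comma-separated string and is split as above:
assert 'V4,ITS1'.split(',') == ['V4', 'ITS1']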
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns
    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
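# Sketch of the jsonp payload shape written by the loop above (values made
# up): the file body is a single JavaScript load_data(...) call whose
# arguments are the column name, the data table, an optional warning object,
# and the correlation statistics.
import io
import json
import pandas as pd

example_df = pd.DataFrame({'ph': [6.8, 7.2], 'shannon': [4.1, 3.9]},
                          index=['s1', 's2'])
buf = io.StringIO()
buf.write("load_data('ph',")
example_df.to_json(buf, orient='split')
buf.write(",")
json.dump(None, buf)  # no filtering warning in this example
buf.write(",")
json.dump({'method': 'Spearman', 'testStat': '1.0000',
           'pVal': '0.0000', 'sampleSize': 2}, buf)
buf.write(");")
print(buf.getvalue())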
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth, steps,
                                     iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Filter metadata to only include sample IDs present in the feature
    # table. Also ensures every feature table sample ID is present in the
    # metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html', 'title': 'PCoA'},
                 {'url': 'heatmap.html', 'title': 'Heatmap'},
                 {'url': 'tree.html', 'title': 'Clustering'}]
    }
    q2templates.render(templates, output_dir, context=context)
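# Hypothetical invocation sketch for beta_rarefaction; the table, metadata,
# and parameter choices below are made up, and it assumes 'braycurtis' is an
# available non-phylogenetic metric and 'upgma' a supported clustering
# method in this module.
import tempfile
import numpy as np
import pandas as pd
import biom
import qiime2

bt = biom.Table(np.array([[1200, 800, 950], [300, 700, 450]]),
                ['O1', 'O2'], ['S1', 'S2', 'S3'])
smd = qiime2.Metadata(pd.DataFrame(
    {'group': ['a', 'a', 'b']},
    index=pd.Index(['S1', 'S2', 'S3'], name='id')))
with tempfile.TemporaryDirectory() as viz_dir:
    beta_rarefaction(viz_dir, bt, metric='braycurtis',
                     clustering_method='upgma', metadata=smd,
                     sampling_depth=1000)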
def _10(ff: ErrorCorrectionDetailsFmt) -> Metadata:
    return Metadata.load(str(ff))
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1, join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
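# Toy invocation sketch for alpha_correlation; IDs, values, and the numeric
# metadata column are made up, and _alpha_correlation_fns is assumed to be
# in scope via the module import.
import tempfile
import pandas as pd
import qiime2

alpha = pd.Series([4.1, 3.9, 4.5], name='shannon',
                  index=['s1', 's2', 's3'])
num_md = qiime2.Metadata(pd.DataFrame(
    {'ph': [6.8, 7.2, 6.5]},
    index=pd.Index(['s1', 's2', 's3'], name='id')))
with tempfile.TemporaryDirectory() as viz_dir:
    alpha_correlation(viz_dir, alpha, num_md)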
def _4(obj: qiime2.Metadata) -> MMvecStatsFormat:
    ff = MMvecStatsFormat()
    obj.save(str(ff))
    return ff
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity
    # data. Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this
    # visualizer displays separate warnings for non-categorical columns, and
    # categorical columns that didn't satisfy the requirements of the
    # statistics being computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
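# The pairwise testing above follows the standard pattern of running one
# test per pair of groups and then applying Benjamini-Hochberg FDR
# correction. A self-contained sketch with made-up group values:
import itertools
import scipy.stats
from statsmodels.stats.multitest import multipletests

example_groups = {'a': [1.0, 1.2, 0.9],
                  'b': [2.1, 2.3, 1.9],
                  'c': [1.1, 1.0, 1.3]}
pairs, pvals = [], []
for g1, g2 in itertools.combinations(sorted(example_groups), 2):
    H, p = scipy.stats.mstats.kruskalwallis(example_groups[g1],
                                            example_groups[g2])
    pairs.append((g1, g2))
    pvals.append(p)
qvals = multipletests(pvals, method='fdr_bh')[1]
for (g1, g2), p, q in zip(pairs, pvals, qvals):
    print('%s vs %s: p=%.4f, q=%.4f' % (g1, g2, p, q))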
def _8(data: pd.DataFrame) -> ErrorCorrectionDetailsFmt:
    ff = ErrorCorrectionDetailsFmt()
    Metadata(data).save(str(ff))
    return ff
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman-Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))
            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(
                lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})

    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the
            # future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
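# The bin count above follows the Freedman-Diaconis rule,
# bin_width = 2 * IQR / n^(1/3). A quick worked sketch with made-up summary
# statistics:
n_samples = 100
iqr = 1500.0            # 3rd quartile - 1st quartile
freq_range = 9000.0     # maximum frequency - minimum frequency

fd_bin_width = (2 * iqr) / (n_samples ** (1 / 3))  # ~646.3
fd_bins = max(freq_range / fd_bin_width, 5)        # ~13.9, floor of 5 bins
print(int(round(fd_bins)))                         # 14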
def _9(ff: ErrorCorrectionDetailsFmt) -> pd.DataFrame:
    return Metadata.load(str(ff)).to_dataframe()
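# The small transformers _8, _9, and _10 form a round trip between
# pandas/Metadata views and the on-disk format. A quick sketch of how they
# compose, assuming a DataFrame whose index satisfies qiime2.Metadata's
# requirements (string IDs, index named 'id'):
import pandas as pd

details_df = pd.DataFrame({'errors': [0, 2]},
                          index=pd.Index(['r1', 'r2'], name='id'))
fmt = _8(details_df)      # DataFrame -> ErrorCorrectionDetailsFmt on disk
md_view = _10(fmt)        # format -> qiime2.Metadata
df_back = _9(fmt)         # format -> DataFrame
assert list(df_back.index) == ['r1', 'r2']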