Example No. 1
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str=None,
                  feature_metadata: qiime2.Metadata=None):

    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example No. 2
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the
    # non-numeric columns, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
Example No. 3
    def test_view_as_metadata(self):
        A = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})

        obs_md = A.view(Metadata)

        exp_df = pd.DataFrame({'a': '1', 'b': '3'},
                              index=pd.Index(['0'], name='id', dtype=object),
                              dtype=object)
        exp_md = Metadata(exp_df)
        exp_md._add_artifacts([A])

        self.assertEqual(obs_md, exp_md)

        # This check is redundant because `Metadata.__eq__` being used above
        # takes source artifacts into account. Doesn't hurt to have an explicit
        # check though, since this API didn't always track source artifacts
        # (this check also future-proofs the test in case `Metadata.__eq__`
        # changes in the future).
        self.assertEqual(obs_md.artifacts, (A,))
Example No. 4
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
Example No. 5
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: str = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # Filter IDs; they must be in the same order as the distance matrix
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example No. 6
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
Example No. 7
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str=None) -> None:

    mf = metadata.to_dataframe()
    viz = Emperor(pcoa, mf, remote='.')

    if custom_axis is not None:
        # put custom_axis inside a list to work around the type system not
        # supporting lists of types
        html = viz.make_emperor(standalone=True, custom_axes=[custom_axis])
    else:
        html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
Example No. 8
def filter_seqs(data: pd.Series, table: biom.Table=None,
                metadata: qiime2.Metadata=None, where: str=None,
                exclude_ids: bool=False) -> pd.Series:
    if table is not None and metadata is not None:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    elif table is None and metadata is None:
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')
    elif table is not None:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids is True:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return filtered
Example No. 9
def add_dietary_phase(
    host_subject_id,
    phase_name,
    key_dates_spreadsheet,
    input_metadata_file,
    output_metadata_file,
) -> None:
    """Encodes dietary phase information into a sample metadata file.

    The main inputs needed for this are the phase name (-p) and the key
    dates spreadsheet (-k). This program looks for rows in the key dates
    spreadsheet where the "Event" column contains the text "Started PHASENAME"
    or "Stopped PHASENAME", where PHASENAME is just the string you specified in
    the -p option.

    This program will then use the dates associated with these rows to
    determine ranges of dates for which the given dietary phase was being
    followed -- this is useful if the subject went on and off a diet multiple
    times. The start date of a phase is counted as being in that range; the end
    date is NOT counted as being in that range.

    Finally, this will add a PHASENAME column to the metadata file. Samples
    will be assigned one of three possible values in this column:

        Samples where host_subject_id is equal to the -hsid parameter AND the
        collection_timestamp falls within a dietary phase range will be
        labelled "TRUE".

        Samples where host_subject_id is equal to the -hsid parameter AND the
        collection_timestamp DOES NOT fall within a dietary phase range will
        be labelled "FALSE".

        Samples where host_subject_id is NOT EQUAL to the -hsid parameter
        will be labelled "not applicable".

    This only considers dates down to the day. So if the subject started a diet
    at 12pm on a day and then ended that diet at 5pm that same day, this code
    will treat both of these dates as occurring on the same day and thus raise
    an error.
    """

    m = Metadata.load(input_metadata_file)
    m_df = m.to_dataframe()

    # Validate the input metadata file, somewhat
    required_cols = {"host_subject_id", "collection_timestamp"}
    if len(required_cols & set(m_df.columns)) < len(required_cols):
        raise ValueError(
            "Input metadata file must include the following columns: "
            "{}".format(required_cols))
    if phase_name in m_df.columns:
        raise ValueError(
            "A {} column already exists in the input metadata!".format(
                phase_name))

    # Validate the key dates spreadsheet, somewhat
    kd = pd.read_excel(key_dates_spreadsheet, index_col=0)
    # I didn't actually know this functionality existed until I saw this SO
    # answer: https://stackoverflow.com/a/57187654/10730311
    if not pd.api.types.is_datetime64_any_dtype(kd.index):
        raise ValueError(
            "First column of the key dates spreadsheet must contain "
            "dates/timestamps")
    if "Event" not in kd.columns:
        raise ValueError(
            'Key dates spreadsheet must contain an "Event" column')

    # Determine ranges for starting/stopping a given diet (this requires a
    # decent amount of validation)
    starting_dates = kd.loc[
        kd["Event"].str.find("Started {}".format(phase_name)) >= 0]
    if len(starting_dates.index) < 1:
        raise ValueError("No starting dates for the specified phase given")

    stopping_dates = kd.loc[
        kd["Event"].str.find("Stopped {}".format(phase_name)) >= 0]
    if len(stopping_dates.index) < 1:
        raise ValueError("No stopping dates for the specified phase given")

    if len(starting_dates.index) != len(stopping_dates.index):
        raise ValueError(
            "Number of starting/stopping dates must be consistent (if the "
            "phase continues to the final sample, then you'll need to add a "
            "stoppping row for the day of or after that sample)")
    print('Found {} ranges for the "{}" dietary phase.'.format(
        len(starting_dates.index), phase_name))

    # We now know that we have an equal (and >= 1) number of starting and
    # stopping dates, but we'd like to know if the dates actually make sense.
    #
    # This necessitates checking that every stopping date occurs later than its
    # corresponding starting date, *and* ensuring that every starting date
    # occurs later than the previous stopping date (i.e. the ranges are in
    # chronological order)
    #
    # You can think of this graphically as something like:
    #
    # A1---B1 A2--B2     A3B3 A4-----B4  A5-B5 A6--B6
    #
    # where each A is a starting date and each B is a stopping date. Notice how
    # these ranges are not overlapping, so they can just be represented as a
    # single line -- this is what we're checking for here.
    for i in range(len(starting_dates.index)):
        # NOTE: we use .date() to just get the date, not the timestamp, of
        # datetimes. This lets us do comparisons only down to the day level.
        # Thanks to https://stackoverflow.com/a/13227661/10730311.
        da = starting_dates.iloc[i].name.date()
        db = stopping_dates.iloc[i].name.date()
        if da >= db:
            raise ValueError("Starting date {} occurs later or on same day as "
                             "corresponding stopping date {}.".format(da, db))
        if i > 0:
            prev_db = stopping_dates.iloc[i - 1].name.date()
            if da <= prev_db:
                raise ValueError(
                    "Starting date {} occurs earlier or on same day as "
                    "previous stopping date {}.".format(da, prev_db))

    # OK, now we know the ranges are good! We're done validating the inputs at
    # this point.

    m_df[phase_name] = "not applicable"

    for sample_id in m_df.index:
        if m_df.loc[sample_id, "host_subject_id"] == host_subject_id:
            # Parse sample timestamp
            sample_date = parse(m_df["collection_timestamp"][sample_id]).date()

            # If the sample was collected before any of the ranges, then we'll
            # never get into the first "if" statement in the for loop below.
            # That's fine; in this case, the sample doesn't fall in any of the
            # ranges, so we can safely leave its value as FALSE.
            phase_value = "FALSE"

            # Iterate backwards through ranges
            for ii in range(len(starting_dates.index))[::-1]:
                if sample_date >= starting_dates.iloc[ii].name.date():
                    if sample_date < stopping_dates.iloc[ii].name.date():
                        phase_value = "TRUE"
                        break
                    else:
                        # We know that this sample occurred after the current
                        # range. Furthermore, we're looking at the ranges in
                        # descending order, so we know that the sample wasn't
                        # in any ranges after this one. Therefore, we can
                        # conclusively say that this sample is not present in
                        # any ranges.
                        #
                        # ...However, the fact that this sample was collected
                        # *after* the diet was started for the first time could
                        # be interesting, esp. if the effects of the diet were
                        # residual. So we assign a special value for these
                        # samples; depending on how you want to interpret this
                        # data, this can be handled in a few different ways.
                        # (For stuff like plotting sample ordinations, making
                        # this distinction clear is useful.)
                        phase_value = "FALSE BUT TAKEN AFTER DIET START"
                        break

            m_df.loc[sample_id, phase_name] = phase_value

        # For samples where the host subject ID *does not* match the one
        # specified, the phase_name value will be left as "not applicable"

    # Cool, we're done!

    Metadata(m_df).save(output_metadata_file)
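
# A minimal usage sketch (not from the original script): the file names,
# subject ID, and phase name below are hypothetical placeholders. The key
# dates spreadsheet is assumed to have a date-typed first column and an
# "Event" column with rows such as "Started keto" / "Stopped keto", as the
# validation above requires.
add_dietary_phase(
    host_subject_id="subject-001",
    phase_name="keto",
    key_dates_spreadsheet="key-dates.xlsx",
    input_metadata_file="sample-metadata.tsv",
    output_metadata_file="sample-metadata-with-keto.tsv",
)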
Example No. 10
                                            'stdv-kmer-per-region':
                                            np.std([1, 2], ddof=1),
                                            'mapped-asvs':
                                            'asv02|asv03|asv08'
                                        },
                                        'seq4': {
                                            'num-regions': 1,
                                            'total-kmers-mapped': 1,
                                            'mean-kmer-per-region': 1,
                                            'stdv-kmer-per-region': 0,
                                            'mapped-asvs': 'asv09'
                                        },
                                        'seq5': {
                                            'num-regions': 2,
                                            'total-kmers-mapped': 2,
                                            'mean-kmer-per-region': 1,
                                            'stdv-kmer-per-region': 0,
                                            'mapped-asvs': 'asv04|asv05|asv10',
                                        },
                                        'seq6': {
                                            'num-regions': 2,
                                            'total-kmers-mapped': 2,
                                            'mean-kmer-per-region': 1,
                                            'stdv-kmer-per-region': 0,
                                            'mapped-asvs': 'asv04|asv05|asv11',
                                        },
                                    })
db_summary.index.set_names('feature-id', inplace=True)
db_summary = Artifact.import_data('FeatureData[ReconstructionSummary]',
                                  Metadata(db_summary))
Example No. 11
def _14(obj: Metadata) -> ReconSummaryFormat:
    ff = ReconSummaryFormat()
    obj.save(str(ff))
    return ff
Example No. 12
def summarize(output_dir: str,
              table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(table,
                                                            axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples**(1 / 3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies,
                                             kde=False,
                                             rug=True,
                                             bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies,
                                              kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(TEMPLATES, 'summarize_assets',
                                     'overview.html')
    sample_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                             'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                              'feature-frequency-detail.html')

    context.update({
        'max_count':
        sample_frequencies.max(),
        'feature_frequencies_table':
        feature_frequencies_table,
        'feature_qualitative_data':
        feature_qualitative_data,
        'tabs': [{
            'url': 'overview.html',
            'title': 'Overview'
        }, {
            'url': 'sample-frequency-detail.html',
            'title': 'Interactive Sample Detail'
        }, {
            'url': 'feature-frequency-detail.html',
            'title': 'Feature Detail'
        }]
    })
    templates = [
        index, sample_frequency_template, feature_frequency_template,
        overview_template
    ]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
Example No. 13
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index,
        output_dir,
        context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
            ', '.join(sorted(non_categorical_columns)),
            'filtered_columns':
            ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example No. 14
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {
                'initial': alpha_diversity.shape[0],
                'method': method.title(),
                'filtered': df.shape[0]
            }

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump(
                {
                    'method': method.title(),
                    'testStat': '%1.4f' % correlation_result[0],
                    'pVal': '%1.4f' % correlation_result[1],
                    'sampleSize': df.shape[0]
                }, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'columns': [quote(fn) for fn in filenames],
                           'filtered_columns':
                           ', '.join(sorted(filtered_columns))
                       })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example No. 15
def cast_metadata(paths, cast, output_file, ignore_extra, error_on_missing):
    import tempfile
    from qiime2 import Metadata, metadata

    md = _merge_metadata(paths)

    cast_dict = {}
    try:
        for casting in cast:
            if ':' not in casting:
                raise click.BadParameter(
                    message=f'Missing `:` in --cast {casting}',
                    param_hint='cast')
            splitter = casting.split(':')
            if len(splitter) != 2:
                raise click.BadParameter(
                    message=f'Incorrect number of fields in --cast {casting}.'
                    f' Observed {len(splitter)}'
                    f' {tuple(splitter)}, expected 2.',
                    param_hint='cast')
            col, type_ = splitter
            if col in cast_dict:
                raise click.BadParameter(
                    message=(f'Column name "{col}" appears in cast more than'
                             ' once.'),
                    param_hint='cast')
            cast_dict[col] = type_
    except Exception as err:
        header = \
            ('Could not parse provided cast arguments into unique COLUMN:TYPE'
             ' pairs. Please make sure all cast flags are of the format --cast'
             ' COLUMN:TYPE')
        q2cli.util.exit_with_error(err, header=header)

    types = set(cast_dict.values())
    if not types.issubset(_COLUMN_TYPES):
        raise click.BadParameter(
            message=('Unknown column type provided. Please make sure all'
                     ' columns included in your cast contain a valid column'
                     ' type. Valid types: %s' % (', '.join(_COLUMN_TYPES))),
            param_hint='cast')

    column_names = set(md.columns.keys())
    cast_names = set(cast_dict.keys())

    if not ignore_extra:
        if not cast_names.issubset(column_names):
            cast = cast_names.difference(column_names)
            raise click.BadParameter(
                message=('The following cast columns were not found'
                         ' within the metadata: %s' % (', '.join(cast))),
                param_hint='cast')

    if error_on_missing:
        if not column_names.issubset(cast_names):
            cols = column_names.difference(cast_names)
            raise click.BadParameter(
                message='The following columns within the metadata'
                ' were not provided in the cast: %s' % (', '.join(cols)),
                param_hint='cast')

    # Remove entries from the cast dict that are not in the metadata to avoid
    # errors further down the road
    for cast in cast_names:
        if cast not in column_names:
            cast_dict.pop(cast)

    with tempfile.NamedTemporaryFile() as temp:
        md.save(temp.name)
        try:
            cast_md = Metadata.load(temp.name, cast_dict)
        except metadata.io.MetadataFileError as e:
            raise click.BadParameter(message=e, param_hint='cast') from e

    if output_file:
        cast_md.save(output_file)
    else:
        with tempfile.NamedTemporaryFile(mode='w+') as stdout_temp:
            cast_md.save(stdout_temp.name)
            stdout_str = stdout_temp.read()
            click.echo(stdout_str)
Example No. 16
def _4(data: qiime2.Metadata) -> QiitaMetadataFormat:
    ff = QiitaMetadataFormat()
    md_df = data.to_dataframe()
    with ff.open() as fh:
        md_df.to_csv(fh, sep='\t', header=True)
    return ff
Example No. 17
def draw_interactive_map(output_dir: str,
                         metadata: qiime2.Metadata,
                         column: str = None,
                         latitude: str = 'Latitude',
                         longitude: str = 'Longitude',
                         color_palette: str = 'rainbow',
                         discrete: bool = False,
                         missing_data: str = 'error'):

    metadata = _load_and_validate(
        metadata, [column, latitude, longitude],
        ['column', 'latitude', 'longitude'], missing_data)

    lat_0, lat_1, lon_0, lon_1 = get_max_extent(
        metadata[latitude], metadata[longitude])
    loc_min, loc_max = [lon_0, lat_0], [lon_1, lat_1]

    cmap = plt.get_cmap(color_palette)

    data = []
    # If column is numeric, color points by column
    if np.issubdtype(metadata[column].dtype, np.number) and not discrete:
        metadata[column] = metadata[column].astype(float)
        normalize = mcolors.Normalize(
            vmin=metadata[column].min(), vmax=metadata[column].max())
        scalarmappaple = cm.ScalarMappable(
            norm=normalize, cmap=cmap)
        scalarmappaple.set_array(metadata[column])

        fig, ax = plt.subplots()
        plt.colorbar(scalarmappaple).set_label(column)
        ax.remove()

        metadata.sort_values(by=column, ascending=False, inplace=True)
        for i, row in metadata.iterrows():
            data.append({
                'sample_id': i,
                column: row[column],
                'latitude': row[latitude],
                'longitude': row[longitude],
                'color': mcolors.to_hex(scalarmappaple.to_rgba(row[column]))
            })
    # if column is not numeric, color discretely
    else:
        groups = metadata[column].unique()
        len_groups = len(groups)
        colors = {g: mcolors.to_hex(c) for g, c in zip(
            groups, cmap(np.linspace(0, 1, len(groups))))}

        for i, row in metadata.iterrows():
            data.append({
                'sample_id': i,
                column: row[column],
                'latitude': row[latitude],
                'longitude': row[longitude],
                'color': colors[row[column]]
            })

        fig = plt.figure(figsize=[len_groups * 0.05, len_groups/2])
        ax = fig.add_axes([0, 0, 1, 1])

        for idx, (g, color) in enumerate(colors.items()):
            r = mpatch.Rectangle((0, idx), 1, 1, color=color)
            _ = ax.text(2, idx+.5, '  %s' % g, va='center', fontsize=10)
            ax.add_patch(r)
            ax.axhline(idx, color='k')
        ax.set_xlim(0, 3)
        ax.set_ylim(0, idx + 2)
        ax.axis('off')

    save_animated_map(output_dir, loc_min, loc_max, data, column)
Example No. 18
def import_dada2_stats_df_to_q2(df):
    combined_artifact = Artifact.import_data("SampleData[DADA2Stats]", Metadata(df))

    return combined_artifact
Example No. 19
def ancombc(
    table: pd.DataFrame,
    metadata: qiime2.Metadata,
    formula: str,
    p_adj_method: str = "holm",
    zero_cut: float = 0.90,
    lib_cut: int = 1000,
    group: str = None,
    struc_zero: bool = True,
    neg_lb: bool = True,
    tol: float = 1e-5,
    max_iter: int = 100,
    conserve: bool = True,
    alpha: float = 0.05,
    # global_test : bool = True
) -> pd.DataFrame:

    # create series from the metadata column
    meta = metadata.to_dataframe()

    # check the variable levels and raise an error if there's only one value
    # per group. ANCOM-BC will fail silently later because of it, and that's
    # harder to debug
    variables = np.unique(np.hstack([x.split("*")
                                     for x in formula.split("+")]))
    variables = np.array([x.strip() for x in variables])
    var_counts = pd.DataFrame.from_dict(
        orient='index',
        data={
            var: {
                'n_groups': len(meta[var].dropna().unique())
            }
            for var in variables
        })
    if (var_counts['n_groups'] < 2).all():
        raise ValueError("None of the columns in the metadata satisfy "
                         "ANCOM-BC's requirements. All columns in the "
                         "formula should have more than one value.")

    # filter the metadata so only the samples present in the table are used;
    # this also reorders it for the correct condition selection. It has to be
    # reordered for ancombc to correctly input the conditions
    meta = meta.loc[list(table.index)]

    # force reorder based on the data to ensure conds are selected correctly
    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.biom.tsv')
        meta_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')
        # header=True is already the default for DataFrame.to_csv, but we
        # specify it explicitly here anyway to alleviate any potential
        # confusion.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(meta_fp, sep='\t', header=True)

        if group is None:
            group = formula

        cmd = [
            'run_ancombc.R',
            biom_fp,  # inp.abundances.path
            meta_fp,  # inp.metadata.path
            formula,  # formula
            p_adj_method,  # p_adj_method
            zero_cut,  # zero_cut
            lib_cut,  # lib_cut
            group,  # group
            str(struc_zero).upper(),  # struc_zero
            str(neg_lb).upper(),  # neg_lb
            tol,  # tol
            max_iter,  # max_iter
            str(conserve).upper(),  # conserve
            alpha,  # alpha
            'FALSE',  # global -- temporary until better understood
            # str(global_test).upper(),   # global
            summary_fp  # output
        ]
        cmd = list(map(str, cmd))

        try:
            global_test = run_commands([cmd])
            # TODO: not sure what to do about the `global_test` statistic
            # may need another custom q2 type for this...
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ANCOMBC"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)

        # del summary['diff_abn']  # remove this field for now ...
        # summary.index.name = "featureid"
        return summary
Example No. 20
def ordinate(table,
             metadata=None,
             metric='jaccard',
             sampling_depth=-1,
             phylogeny=None,
             number_of_dimensions=None,
             biplot=False):
    """Perform ordination using principal coordinate analysis (PCoA).

    This method wraps multiple QIIME 2 methods to perform ordination and
    returns an Artifact object containing the PCoA results.

    Under the hood, this method filters the samples (if requested), rarefies
    the feature table (if requested), computes a distance matrix, and then
    runs PCoA.

    By default, the method returns PCoAResults. For creating a biplot,
    use `biplot=True` which returns PCoAResults % Properties('biplot').

    Parameters
    ----------
    table : str or qiime2.Artifact
        Artifact file or object corresponding to FeatureTable[Frequency].
    metadata : str or qiime2.Metadata, optional
        Metadata file or object. All samples in 'metadata' that are also in
        the feature table will be retained.
    metric : str, default: 'jaccard'
        Metric used for distance matrix computation ('jaccard',
        'bray_curtis', 'unweighted_unifrac', or 'weighted_unifrac').
    sampling_depth : int, default: -1
        If negative, skip rarefying. If 0, rarefy to the sample with minimum
        depth. Otherwise, rarefy to the provided sampling depth.
    phylogeny : str, optional
        Rooted tree file. Required if using 'unweighted_unifrac', or
        'weighted_unifrac' as metric.
    number_of_dimensions : int, optional
        Dimensions to reduce the distance matrix to.
    biplot : bool, default: False
        If true, return PCoAResults % Properties('biplot').

    Returns
    -------
    qiime2.Artifact
        Artifact object corresponding to PCoAResults or
        PCoAResults % Properties('biplot').

    See Also
    --------
    beta_2d_plot
    beta_3d_plot
    beta_scree_plot
    beta_parallel_plot

    Notes
    -----
    The resulting Artifact object can be directly used for plotting.

    Examples
    --------
    Below is a simple example. Note that the default distance metric
    used is ``jaccard``. The resulting object ``pcoa`` can be directly
    used for plotting by the ``dokdo.beta_2d_plot`` method as shown below.

    >>> table_file = f'{data_dir}/moving-pictures-tutorial/table.qza'
    >>> metadata_file = f'{data_dir}/moving-pictures-tutorial/sample-metadata.tsv'
    >>> pcoa_results = dokdo.ordinate(table_file)
    >>> dokdo.beta_2d_plot(pcoa_results, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True))
    >>> plt.tight_layout()

    .. image:: images/ordinate-1.png

    You can choose a subset of samples.

    >>> from qiime2 import Metadata
    >>> mf = dokdo.get_mf(metadata_file)
    >>> mf = mf[mf['body-site'].isin(['gut', 'left palm'])]
    >>> pcoa_results = dokdo.ordinate(table_file, metadata=Metadata(mf))
    >>> dokdo.beta_2d_plot(pcoa_results, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True))
    >>> plt.tight_layout()

    .. image:: images/ordinate-2.png

    You can also generate a biplot.

    >>> pcoa_results = dokdo.ordinate(table_file, biplot=True, number_of_dimensions=10)
    >>> fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    >>> dokdo.beta_2d_plot(pcoa_results, ax=ax, metadata=metadata_file, hue='body-site', artist_kwargs=dict(show_legend=True))
    >>> dokdo.addbiplot(pcoa_results, ax=ax, count=7)
    >>> plt.tight_layout()

    .. image:: images/ordinate-3.png
    """
    if isinstance(table, str):
        table = Artifact.load(table)
    elif not isinstance(table, Artifact):
        raise TypeError(f"Incorrect feature table type: {type(table)}")

    # If metadata is provided, perform sample filtration.
    if metadata is not None:
        if isinstance(metadata, Metadata):
            _metadata = metadata
        else:
            _metadata = Metadata.load(metadata)
        _table = feature_table.methods.filter_samples(
            table=table, metadata=_metadata).filtered_table
    else:
        _table = table

    # Perform rarefying.
    if sampling_depth < 0:
        rarefied_table = _table
    else:
        if sampling_depth == 0:
            sampling_depth = int(_table.view(pd.DataFrame).sum(axis=1).min())

        rarefy_result = feature_table.methods.rarefy(
            table=_table, sampling_depth=sampling_depth)

        rarefied_table = rarefy_result.rarefied_table

    if metric == 'jaccard':
        distance_matrix_result = diversity_lib.methods.jaccard(
            table=rarefied_table)
    elif metric == 'bray_curtis':
        distance_matrix_result = diversity_lib.methods.bray_curtis(
            table=rarefied_table)
    elif metric == 'unweighted_unifrac':
        distance_matrix_result = diversity_lib.methods.unweighted_unifrac(
            table=rarefied_table, phylogeny=Artifact.load(phylogeny))
    elif metric == 'weighted_unifrac':
        distance_matrix_result = diversity_lib.methods.weighted_unifrac(
            table=rarefied_table, phylogeny=Artifact.load(phylogeny))
    else:
        raise ValueError(f"Incorrect metric detected: {metric}")

    distance_matrix = distance_matrix_result.distance_matrix

    result_obj = diversity.methods.pcoa(
        distance_matrix=distance_matrix,
        number_of_dimensions=number_of_dimensions)
    pcoa_results = result_obj.pcoa

    if biplot:
        rf_result = feature_table.methods.relative_frequency(table=_table)
        rf_table = rf_result.relative_frequency_table
        result_obj = diversity.methods.pcoa_biplot(pcoa=pcoa_results,
                                                   features=rf_table)
        pcoa_results = result_obj.biplot

    return pcoa_results
Example No. 21
def _15(df: MappingDirectoryFormat) -> Metadata:
    d = df.mapping.view(dict)
    return Metadata(pd.DataFrame(d, index=pd.Index(["0"], name='id')))
Example No. 22
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups and
            # correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)

                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                              "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories': ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dst'),
        os.path.join(output_dir, 'dist'))
Example No. 23
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters:
    -----------
        table: biom file with which to calculate log ratios
        taxonomy: pd.DataFrame with taxonomy information (should have Taxon
            column in which features will be searched)
        num_string: numerator string to search for in taxonomy
        denom_string: denominator string to search for in taxonomy
        samples_to_use: Q2 Metadata file with samples to use.
            If provided, feature table will be filtered to only consider
            samples present in this file. (optional)
        allow_shared_features: bool denoting handling of shared features
            between numerator and denominator. If False, an error is raised
            if features are shared between numerator and denominator. If True,
            will allow shared features without throwing an error.
    Returns:
    --------
        comparison_df: pd DataFrame in the form:

            Sample-ID    Num_Sum    Denom_Sum   log_ratio
                   S1          7           15   -0.762140
    """

    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample", inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't occur
    # if allowed, can skip this step at user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1
    )
    comparison_df.index.name = "Sample-ID"

    return comparison_df
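
# A toy, hypothetical example of calling qarcoal directly (counts and feature
# IDs are made up). It assumes filter_and_join_taxonomy matches features on
# the taxonomy index and searches its "Taxon" column, as the docstring above
# describes.
import biom
import numpy as np
import pandas as pd

data = np.array([[5, 2],    # f1 (Firmicutes)
                 [10, 13],  # f2 (Bacteroidetes)
                 [7, 1]])   # f3 (Firmicutes)
table = biom.Table(data, ['f1', 'f2', 'f3'], ['S1', 'S2'])
taxonomy = pd.DataFrame(
    {'Taxon': ['k__Bacteria; p__Firmicutes',
               'k__Bacteria; p__Bacteroidetes',
               'k__Bacteria; p__Firmicutes']},
    index=pd.Index(['f1', 'f2', 'f3'], name='Feature ID'))

# Per-sample log-ratio of Firmicutes counts to Bacteroidetes counts.
result = qarcoal(table, taxonomy, 'Firmicutes', 'Bacteroidetes')
print(result)  # columns: Num_Sum, Denom_Sum, log_ratio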
Example No. 24
def md1_factory():
    return Metadata(
        pd.DataFrame({'a': ['1', '2', '3']},
                     index=pd.Index(['0', '1', '2'], name='id')))
Example No. 25
        ZebraFilter(.9999, "../zebra.csv"),
    ])

    return MultiFactory([
        woltka_levels,
        zebra,
    ])


if __name__ == "__main__":
    # Pretend all scripts are run from root of repo for file paths.
    import os
    os.chdir("..")
    factory = configure()
    # SavePreprocessedTables().run(factory)
    metadata = Metadata.load(metadata_filepath)
    print(metadata)

    for config in factory.gen_configurations():
        print(config.analysis_name)
        path = "tables/"+config.analysis_name+"_train.qza"
        table = Artifact.load(path)

        alpha_result = alpha(table=table, metric='shannon')
        alpha_diversity = alpha_result.alpha_diversity
        series = alpha_diversity.view(pd.Series)

        print(series)

        # Argh, how do I tell it what I care about?
        # vis_alpha = alpha_group_significance(alpha_diversity, metadata)
Ejemplo n.º 26
0
def md2_factory():
    return Metadata(
        pd.DataFrame({'b': ['4', '5', '6']},
                     index=pd.Index(['0', '1', '2'], name='id')))
Ejemplo n.º 27
0
def alpha_rarefaction(output_dir: str,
                      table: biom.Table,
                      max_depth: int,
                      phylogeny: skbio.TreeNode = None,
                      metrics: set = None,
                      metadata: qiime2.Metadata = None,
                      min_depth: int = 1,
                      steps: int = 10,
                      iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        metadata_df.columns = pd.MultiIndex.from_tuples([
            (c, '') for c in metadata_df.columns
        ])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth, steps,
                                     iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = [
                'depth-%d_iter-%d' % (t[0], t[1]) for t in data.columns.values
            ]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'metrics': list(metrics),
                           'filenames': [quote(f) for f in filenames],
                           'columns': list(columns),
                           'steps': steps,
                           'filtered_columns': sorted(filtered_columns)
                       })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
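
The MultiIndex handling above is easy to misread: the rarefaction data has (depth, iteration) column pairs, so the metadata columns are given a dummy second level before the join. A small standalone sketch of that step, with made-up sample IDs and a made-up metadata column:

import pandas as pd

# Rarefaction results: one row per sample, one column per (depth, iteration).
data = pd.DataFrame([[10, 12, 30, 31], [11, 9, 28, 33]],
                    index=pd.Index(['S1', 'S2'], name='sample-id'),
                    columns=pd.MultiIndex.from_tuples(
                        [(1, 1), (1, 2), (5, 1), (5, 2)]))

metadata_df = pd.DataFrame({'body-site': ['gut', 'skin']},
                           index=pd.Index(['S1', 'S2'], name='sample-id'))
# Add an empty second level so both frames share a two-level column index.
metadata_df.columns = pd.MultiIndex.from_tuples(
    [(c, '') for c in metadata_df.columns])

merged = data.join(metadata_df, how='left')
print(merged.columns.tolist())
# [(1, 1), (1, 2), (5, 1), (5, 2), ('body-site', '')]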
Ejemplo n.º 28
0
        print("%s sample pairs matched together" %
              (len(case_to_control_match.keys())))

        for key in case_to_control_match:
            key_value = case_to_control_match[key]
            matchDF.at[key, "matched_to"] = str(key_value)
            matchDF.at[key_value, "matched_to"] = str(key)
    else:
        print("%s cases matched" % (len(case_dictionary.keys())))
        for case in case_dictionary:
            for control in case_dictionary[case]:
                if control in control_dictionary:
                    control_dictionary[control].append(case)
                else:
                    control_dictionary[control] = [case]
            matchDF.at[case,
                       "matched_to"] = ", ".join(sorted(case_dictionary[case]))

        for control in control_dictionary:
            matchDF.at[control, "matched_to"] = ", ".join(
                sorted(control_dictionary[control]))

    matchedMD = Metadata(matchDF)
    if only_matches:
        ids = matchedMD.get_ids("matched_to NOT IN ('none')")
        # shrink the MD to only the matched samples
        matchedMD = matchedMD.filter_ids(ids)

    return matchedMD
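
The only_matches filtering relies on QIIME 2's SQLite-backed where clauses. A minimal sketch of that pattern, assuming a qiime2 environment; the sample IDs and values are made up:

import pandas as pd
from qiime2 import Metadata

df = pd.DataFrame({'matched_to': ['S2', 'S1', 'none']},
                  index=pd.Index(['S1', 'S2', 'S3'], name='id'))
md = Metadata(df)

ids = md.get_ids("matched_to NOT IN ('none')")  # {'S1', 'S2'}
md = md.filter_ids(ids)                         # drops the unmatched sample S3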
Ejemplo n.º 29
0
    def setUp(self):
        super().setUp()

        # setup taxonomy to be edited
        tax_fp = self.get_data_path('escherichia_shigella_taxonomy.txt')
        self.taxonomy = TSVTaxonomyFormat(tax_fp, mode='r').view(pd.Series)

        # setup full string replacement
        replc = self.get_data_path('taxonomy-replacement-full-strings.txt')
        md_replc = Metadata.load(replc)
        self.md_replc_col = md_replc.get_column('replacements')

        # setup substring replacement
        ss_replc = self.get_data_path('taxonomy-replacement-pass.txt')
        md_ss_replc = Metadata.load(ss_replc)
        self.md_ss_replc_col = md_ss_replc.get_column('replacements')

        # setup substring regex replacement
        ssr_replc = self.get_data_path('taxonomy-replacement-regex.txt')
        md_ssr_replc = Metadata.load(ssr_replc)
        self.md_ssr_replc_col = md_ssr_replc.get_column('replacements')

        # setup reusable dicts
        self.exp_dict_00 = {
            'Sal01': ('d__SUPER_DUPER_BACTERIA; '
                      'p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__SUPER_DUPER_BACTERIA; '
                      'p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__SUPER_DUPER_BACTERIA; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__uncultured_Salmonella'),
            'Esch01': ('d__SUPER_DUPER_BACTERIA; '
                       'p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__'),
            'Shig01': ('d__SUPER_DUPER_BACTERIA; '
                       'p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__')
        }

        self.exp_dict_01 = {
            'Sal01': ('d__Bacteria; p__LAME-PYHLA; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__LAME-PYHLA; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__LAME-PYHLA; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__UNCIVILIZED_Salmonella'),
            'Esch01': ('d__Bacteria; p__LAME-PYHLA; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__'),
            'Shig01': ('d__Bacteria; p__LAME-PYHLA; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__')
        }
        self.exp_dict_02 = {
            'Sal01': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__uncultured_Salmonella'),
            'Esch01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella'),
            'Shig01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella')
        }
        self.exp_dict_03 = {
            'Sal01': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__UNCIVIL_Salmonella'),
            'Esch01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella'),
            'Shig01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella')
        }
Ejemplo n.º 30
0
def determine_cases_and_controls(afterExclusion_MD, query_line_dict, extra):
    '''
    Determines which samples are cases or controls using the queries in
        query_line_dict. The labels of each sample are stored in
        case_controlDF

    Parameters
    ----------

    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out

    query_line_dict : dictionary of arrays of strings
        there are two keys, 'case' and 'control'
        the 'control' array holds the queries used to determine controls
        the 'case' array holds the queries used to determine cases

    extra: boolean
        Tells the function whether to shrink the metadata in one step or in
            multiple steps with extra print statements that show how many
            potential case or control samples are left after each query

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a case_control
            column that reflects whether each sample is a case, a control, or
            Unspecified

    Raises
    ------
    ValueError
        If the input file of sql queries for determining cases and controls is
            empty
    '''

    ids = afterExclusion_MD.get_ids()
    case_control_Series = pd.Series(["Unspecified"] * len(ids), ids)
    case_control_Series.index.name = afterExclusion_MD.id_header
    case_controlDF = case_control_Series.to_frame("case_control")

    print("Metadata Object has %s samples" % (afterExclusion_MD.id_count))

    for key in query_line_dict:
        if key != "case" and key != "control":
            print("Wrong key used for query. Must be 'case' or 'control'.")
            continue
        #resets shrunk_MD so that filtering down to control samples does not
        #influence filtering down to case
        shrunk_MD = afterExclusion_MD
        #get query and filtering down to control or case samples based on key
        query_lines = query_line_dict[key]
        if len(query_lines) < 1:
            raise ValueError("The %s query file is empty" % (key))
        if extra:
            for line in query_lines:
                initial_size = shrunk_MD.id_count
                ids = shrunk_MD.get_ids(line)
                shrunk_MD = shrunk_MD.filter_ids(ids)
                print(line)
                print(
                    "\tFilters down number of potential %s samples left to %s" %
                    (key, shrunk_MD.id_count))
        else:
            ids = shrunk_MD.get_ids(' AND '.join(query_lines))
            shrunk_MD = shrunk_MD.filter_ids(ids)
        print("Final number of %s samples is %s" % (shrunk_MD.id_count, key))
        #replaces the true values created by the loop above to case or control
        ids = shrunk_MD.ids
        case_controlDF.loc[ids, "case_control"] = key

    # turn case_controlDF into a metadata object
    case_controlMD = Metadata(case_controlDF)
    # merge afterExclusion_MD and case_controlMD into one new metadata object
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)

    return mergedMD
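
A minimal usage sketch, assuming the function above is importable. The column names and query strings are made up; each query is plain SQLite of the kind accepted by Metadata.get_ids:

import pandas as pd
from qiime2 import Metadata

md = Metadata(pd.DataFrame(
    {'disease_state': ['sick', 'sick', 'healthy', 'healthy'],
     'antibiotics': ['no', 'yes', 'no', 'no']},
    index=pd.Index(['S1', 'S2', 'S3', 'S4'], name='id')))

query_line_dict = {
    'case': ["disease_state='sick'", "antibiotics='no'"],
    'control': ["disease_state='healthy'", "antibiotics='no'"],
}
merged = determine_cases_and_controls(md, query_line_dict, extra=False)
print(merged.to_dataframe()['case_control'])
# samples satisfying every case query are labeled 'case', controls likewise;
# anything else stays 'Unspecified'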
Ejemplo n.º 31
0
def _13(ff:ReconSummaryFormat) -> Metadata:
    return Metadata.load(str(ff))
Ejemplo n.º 32
0
def matcher(prepped_for_match_MD, conditions_for_match_lines, one_to_one,
            only_matches):
    '''
    Matches case samples to controls and puts the case's id in the matched_to
        column on the control sample's row

    Parameters
    ----------
    prepped_for_match_MD : Metadata object
        Samples that do not have valid entries for columns that determine
            matching are removed. Everything else is the same as merged_MD.

    conditions_for_match_lines : array of strings
        contains information on what conditions must be met to constitute a
            match

    one_to_one: boolean
        determines if matching is done one to one using stable marriage

    only_matches: boolean
        determines if the returned metadata object should only include
            samples with matches

    Returns
    -------
    matchedMD : Metadata object
        Metadata based on the samples in prepped_for_match_MD, with
            matches represented by a column called matched_to. Values
            in matched_to are the sample ids of the samples each sample
            is matched to

    Raises
    ------
    KeyError
        conditions_for_match_lines tells the program to match based on a
            column that does not exist in the metadata
    ValueError
        conditions_for_match_lines has a range value that can not be converted
            to a float
        For one of the columns that conditions_for_match_lines matches on
            using a range, a control or case sample in prepped_for_match_MD
            has a value that can not be converted into a float

    '''
    case_dictionary = {}
    control_dictionary = {}
    control_match_count_dictionary = {}
    case_match_count_dictionary = {}

    matchDF = prepped_for_match_MD.to_dataframe()
    case_for_matchDF = matchDF[matchDF["case_control"].isin(["case"])]
    # create a column to show matches. since it will contain the sample id
    # it was matched to, the null value will be 'none'
    matchDF["matched_to"] = 'none'

    # loop through case samples and match them to controls
    for case_index, case_row in case_for_matchDF.iterrows():
        # set controlDF to be only the samples of matchDF that are controls
        controlDF = matchDF[matchDF["case_control"].isin(["control"])]
        if controlDF.size == 0:
            return Metadata(matchDF)

        # loop through input columns to determine matches
        for conditions in conditions_for_match_lines:

            column_name = conditions.split("\t")[1].strip()
            try:
                matchDF[column_name]
            except KeyError:
                raise KeyError("Column %s not found in your input data. "
                               "Correct this error in your --match file" %
                               (column_name))

            # get the type of data for the given column. This determines how
            # a match is made
            if conditions.split("\t")[0] == "range":
                num = conditions.split("\t")[2].strip()

                try:
                    row_num = float(case_row[column_name])
                except (ValueError, TypeError):
                    raise ValueError("column %s contains a string that can "
                                     "not be converted to a numerical value" %
                                     (column_name))
                try:
                    fnum = float(num)
                except (ValueError, TypeError):
                    raise ValueError("input number for condition %s is not a "
                                     "valid number" % (column_name))
                try:
                    nums_in_column = pd.to_numeric(controlDF[column_name])
                except (ValueError, TypeError):
                    raise ValueError("column %s contains a string that can "
                                     "not be converted to a numerical value" %
                                     (column_name))

                # keep only controls whose value in this column is within the
                # given distance from the case's value
                controlDF = controlDF[(nums_in_column >= (row_num - fnum))
                                      & (nums_in_column <= (row_num + fnum))]
            else:
                # filter out controls whose string value does not match the
                # case's value
                controlDF = controlDF[controlDF[column_name].isin(
                    [case_row[column_name]])]
        if controlDF.index.values.size > 0:
            case_dictionary.update({case_index: controlDF.index.values})
            case_match_count_dictionary.update(
                {case_index: (controlDF.index.values.size)})

        for id_control in controlDF.index:
            if id_control not in control_match_count_dictionary:
                control_match_count_dictionary.update({id_control: 0})
            control_match_count_dictionary.update(
                {id_control: (control_match_count_dictionary[id_control] + 1)})

    if one_to_one:
        stable = Stable_Marriage()
        case_to_control_match = stable.stableMarriageRunner(
            case_dictionary, control_match_count_dictionary,
            case_match_count_dictionary)

        print("%s sample pairs matched together" %
              (len(case_to_control_match.keys())))

        for key in case_to_control_match:
            key_value = case_to_control_match[key]
            matchDF.at[key, "matched_to"] = str(key_value)
            matchDF.at[key_value, "matched_to"] = str(key)
    else:
        print("%s cases matched" % (len(case_dictionary.keys())))
        for case in case_dictionary:
            for control in case_dictionary[case]:
                if control in control_dictionary:
                    control_dictionary[control].append(case)
                else:
                    control_dictionary[control] = [case]
            matchDF.at[case,
                       "matched_to"] = ", ".join(sorted(case_dictionary[case]))

        for control in control_dictionary:
            matchDF.at[control, "matched_to"] = ", ".join(
                sorted(control_dictionary[control]))
Ejemplo n.º 33
0
def _15(df: MappingDirectoryFormat) -> Metadata:
    d = df.mapping.view(dict)
    return Metadata(pd.DataFrame(d, index=["0"]))
Ejemplo n.º 34
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Ejemplo n.º 35
0
def simple_plot(output_dir,
                table: biom.Table,
                feature_tree: skbio.TreeNode,
                metadata: q2.Metadata,
                case_where: str,
                control_where: str,
                n_transects: int = 10,
                stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))

    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())

        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)

        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(os.path.join(layer_dir, 'T_%s.h.png' % distance),
                              transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'), transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(
            template.render({
                'legend':
                list(
                    zip(['original'] + ['T_%s' % d
                                        for d in transects] + ['trajectory'],
                        list(map(to_hex, colors)) + ['red'])),
                'filenames':
                rank_files
            }))
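
The final index.html is rendered from an in-memory Jinja2 template. A tiny standalone sketch of that pattern; the template string here is a toy stand-in, not the TEMPLATE used above:

from jinja2 import BaseLoader, Environment

TOY_TEMPLATE = """<ul>
{% for name, color in legend %}  <li style="color: {{ color }}">{{ name }}</li>
{% endfor %}</ul>"""

template = Environment(loader=BaseLoader()).from_string(TOY_TEMPLATE)
html = template.render({'legend': [('original', '#1f77b4'),
                                   ('trajectory', 'red')]})
print(html)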
Ejemplo n.º 36
0
    if args.regions is None or args.regions == 'None':
        if regions:
            args.regions = regions
        else:
            raise ValueError('Failed to identify correctly named regions (V* / ITS*)')

    else:
        args.regions = args.regions.split(',')
        for r in args.regions:
            if r not in primers:
                raise ValueError('libprepkit: {} does not support region: {}'.format(args.libprep, r))
            if primers[r] not in available_classifiers(args.classifier_dir, level=args.classifier_level):
                raise ValueError('prebuilt classifier dir: {} does not contain region: {}'.format(args.classifier_dir, r))
    write_message('loading sample info')
    samples = Metadata.load(os.path.abspath(args.sample_info))

    write_message('starting demultiplex fastq files')
    # adata key: region, value: SampleData[PairedEndSequencesWithQuality] artifact
    adata = demultiplex_manifests(args.input, primers, args.regions, split_on_header=True, threads=args.threads)
    write_message('completed demultiplex fastq files')
    write_message('starting read count of fastq files')
    counts, merged_counts = sequence_counts(adata, min_count=args.filter_region_count)
    # filter regions with too few reads
    for k in list(adata.keys()):
        if k not in counts:
            del adata[k]
    write_message('completed read count')


    DADA2_PARAMS = dada2_denoise_params(args.libprep_config, args.libprep)
Ejemplo n.º 37
0
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str='spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns

    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))
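
The .jsonp files written above are just a JavaScript call with the data inlined as its arguments. A small sketch of the same assembly using an in-memory buffer; the column names and numbers are made up:

import io
import json

import pandas as pd

df = pd.DataFrame({'ph': [6.8, 7.1], 'shannon': [3.2, 4.0]},
                  index=['S1', 'S2'])

buf = io.StringIO()
buf.write("load_data('%s'," % 'ph')
df.to_json(buf, orient='split')   # the correlated data
buf.write(",")
json.dump(None, buf)              # no filtering warning in this toy case
buf.write(",")
json.dump({'method': 'Spearman', 'testStat': '0.9000',
           'pVal': '0.0500', 'sampleSize': 2}, buf)
buf.write(");")
print(buf.getvalue())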
Ejemplo n.º 38
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Ejemplo n.º 39
0
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int=10,
                     phylogeny: skbio.TreeNode=None,
                     correlation_method: str='spearman',
                     color_scheme: str='BrBG') -> None:
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Filter metadata to only include sample IDs present in the feature table.
    # Also ensures every feature table sample ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html',
                  'title': 'PCoA'},
                 {'url': 'heatmap.html',
                  'title': 'Heatmap'},
                 {'url': 'tree.html',
                  'title': 'Clustering'}]
    }

    q2templates.render(templates, output_dir, context=context)
Ejemplo n.º 40
0
def _10(ff: ErrorCorrectionDetailsFmt) -> Metadata:
    return Metadata.load(str(ff))
Ejemplo n.º 41
0
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
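
The metadata.tsv written for download above comes from merging the alpha diversity series into the sample metadata. A minimal sketch of that step with made-up sample IDs and values; note that Metadata.merge keeps only IDs shared by both objects:

import pandas as pd
import qiime2

alpha_diversity = pd.Series([3.2, 4.0], index=['S1', 'S2'], name='shannon')
alpha_diversity.index.name = 'id'

sample_md = qiime2.Metadata(pd.DataFrame(
    {'ph': [6.8, 7.1]}, index=pd.Index(['S1', 'S2'], name='id')))

md = sample_md.merge(qiime2.Metadata(alpha_diversity.to_frame()))
md.save('metadata.tsv')  # one row per shared sample, ph and shannon columns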
Ejemplo n.º 42
0
def _4(obj: qiime2.Metadata) -> MMvecStatsFormat:
    ff = MMvecStatsFormat()
    obj.save(str(ff))
    return ff
Ejemplo n.º 43
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
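
The pairwise statistics above boil down to a Kruskal-Wallis test on every pair of groups followed by Benjamini-Hochberg correction. A standalone sketch with made-up groups and values:

import pandas as pd
from scipy.stats import mstats
from statsmodels.stats.multitest import multipletests

groups = {'gut (n=3)': [2.1, 2.4, 2.2],
          'skin (n=3)': [3.5, 3.8, 3.6],
          'tongue (n=3)': [3.0, 2.9, 3.1]}
names = list(groups)

rows = []
for i in range(len(names)):
    for j in range(i):
        H, p = mstats.kruskalwallis(groups[names[i]], groups[names[j]])
        rows.append([names[j], names[i], H, p])

kw_H_pairwise = pd.DataFrame(rows,
                             columns=['Group 1', 'Group 2', 'H', 'p-value'])
kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                         method='fdr_bh')[1]
print(kw_H_pairwise)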
Ejemplo n.º 44
0
def _8(data: pd.DataFrame) -> ErrorCorrectionDetailsFmt:
    ff = ErrorCorrectionDetailsFmt()
    Metadata(data).save(str(ff))
    return ff
Ejemplo n.º 45
0
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
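
The histogram bin count above follows the Freedman-Diaconis rule with a floor of five bins. A quick numeric sketch with made-up per-sample frequencies:

import numpy as np

frequencies = np.array([1200, 1500, 1800, 2200, 2600, 3100, 4000, 5200])
q1, q3 = np.percentile(frequencies, [25, 75])
iqr = q3 - q1
bin_width = (2 * iqr) / (len(frequencies) ** (1 / 3))
bins = max((frequencies.max() - frequencies.min()) / bin_width, 5)
print(int(round(bins)))  # number of histogram bins to request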
Ejemplo n.º 46
0
def _9(ff: ErrorCorrectionDetailsFmt) -> pd.DataFrame:
    return Metadata.load(str(ff)).to_dataframe()