Ejemplo n.º 1
0
def differentialtest(table: biom.Table, metadata: qiime2.Metadata,
                     variable: str,
                     taxonomy: TSVTaxonomyFormat) -> pd.DataFrame:
    """Run ``differentialtest.R`` on a feature table and load its TSV output.

    The table and the metadata are written into a temporary directory, the
    R script is launched through ``run_commands`` and the file the script is
    asked to write is read back with pandas.

    Parameters
    ----------
    table : biom.Table
        Feature table; serialised with ``table.to_tsv()``.
    metadata : qiime2.Metadata
        Metadata; saved next to the table with ``metadata.save``.
    variable : str
        Forwarded to the script as a command-line argument.
    taxonomy : TSVTaxonomyFormat
        Forwarded to the script as ``str(taxonomy)``.

    Returns
    -------
    pd.DataFrame
        Content of the script's output file, with the index named
        ``'Feature ID'``.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as workdir:
        table_fp = os.path.join(workdir, 'table.tsv')
        metadata_fp = os.path.join(workdir, 'metadata.tsv')
        result_fp = os.path.join(workdir, 'data.tsv')

        # Put both inputs on disk so the external script can read them.
        with open(table_fp, 'w') as table_fh:
            table_fh.write(table.to_tsv())
        metadata.save(metadata_fp)

        run_commands([[
            'differentialtest.R', table_fp, metadata_fp,
            str(taxonomy),
            str(variable),
            str(result_fp),
        ]])

        # Read the result while the temporary directory still exists.
        results = pd.read_csv(result_fp, sep='\t')
        results.index.name = 'Feature ID'
    return results
def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    """Keep ``subsampling_depth`` IDs along one axis of ``table``.

    Parameters
    ----------
    table : biom.Table
        Table to subsample.
    subsampling_depth : int
        Number of IDs to keep on the requested axis.
    axis : str
        ``'feature'`` triggers a transpose before and after the subsampling;
        any other value subsamples the table as given.

    Returns
    -------
    biom.Table
        The subsampled table, with entries on the opposite axis that sum to
        zero filtered out.

    Raises
    ------
    ValueError
        If the depth exceeds the number of IDs on the axis, or if the
        resulting table is empty.
    """
    on_features = axis == 'feature'
    if on_features:
        # Work around biocore/biom-format#759 by transposing, so that the
        # subsampling below can always be done on the 'sample' axis.
        table = table.transpose()

    available = len(table.ids())
    if available < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % available)

    table = table.subsample(subsampling_depth, axis='sample', by_id=True)

    def _has_counts(values, _id, _md):
        return values.sum() > 0

    # Thanks to the transpose the opposite axis is always 'observation'.
    table.filter(_has_counts, axis='observation')

    if on_features:
        # Undo the transpose applied above.
        table = table.transpose()

    if table.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')

    return table
Ejemplo n.º 3
0
def beta_phylogenetic(table: biom.Table,
                      phylogeny: skbio.TreeNode,
                      metric: str,
                      n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified, cast to ``int`` and transposed before being
        handed to ``skbio.diversity.beta_diversity``.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be a member of ``phylogenetic_metrics()``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix
        Result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    ValueError
        If the metric is unknown, the table is empty, or ``n_jobs != 1`` is
        combined with ``'weighted_unifrac'``.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` in the message replaced by
        ``feature_ids``/``phylogeny``; the original error is attached as the
        cause.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(
            metric=metric,
            counts=counts,
            ids=sample_ids,
            otu_ids=feature_ids,
            tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs)
    except skbio.tree.MissingNodeError as e:
        # Reword scikit-bio's message with this function's parameter names.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        # Chain explicitly so the traceback shows the original error as the
        # direct cause rather than as a failure during handling.
        raise skbio.tree.MissingNodeError(message) from e

    return results
Ejemplo n.º 4
0
def beta(table: biom.Table,
         metric: str,
         pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified and transposed before use.
    metric : str
        Must be a member of ``non_phylogenetic_metrics()``. ``'aitchison'``
        is handled locally as the Euclidean distance between ``clr``
        transformed vectors.
    pseudocount : int, optional
        Added to every count, only when ``metric`` is ``'aitchison'``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix
        Result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def _aitchison_distance(u, v, **_unused):
        clr_u = clr(u)
        clr_v = clr(v)
        return euclidean(clr_u, clr_v)

    if metric == 'aitchison':
        # Shift the counts in place before swapping in the callable metric.
        counts += pseudocount
        metric = _aitchison_distance

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
Ejemplo n.º 5
0
def SRS(table: biom.Table,
        c_min: int,
        set_seed: bool = True,
        seed: int = 1) -> biom.Table:
    """Run ``SRS.R`` on a feature table and return the result as a table.

    Parameters
    ----------
    table : biom.Table
        Feature table; written to a temporary TSV file for the script.
    c_min : int
        Forwarded to the script as ``str(c_min)``.
    set_seed : bool, optional
        Forwarded to the script as ``str(set_seed)``.
    seed : int, optional
        Forwarded to the script as ``str(seed)``.

    Returns
    -------
    biom.Table
        Table built from the values, index and columns of the TSV file read
        back after the script ran.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as workdir:
        table_fp = os.path.join(workdir, 'table.tsv')
        with open(table_fp, 'w') as table_fh:
            table_fh.write(table.to_tsv())

        run_commands([
            ['SRS.R', table_fp, str(c_min), str(set_seed), str(seed)],
        ])

        # NOTE(review): the same path that was written above is read back,
        # so SRS.R presumably overwrites its input in place -- confirm.
        normalized = pd.read_csv(table_fp, sep='\t')

    # NOTE(review): no index column is requested from read_csv, so the
    # observation IDs are the frame's default index -- confirm intended.
    return biom.Table(data=normalized.values,
                      observation_ids=normalized.index,
                      sample_ids=normalized.columns)
Ejemplo n.º 6
0
def beta(table: biom.Table, metric: str,
         pseudocount: int=1, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Return the pairwise beta diversity between the samples of ``table``.

    ``metric`` has to be listed by ``non_phylogenetic_metrics()``. For
    ``'aitchison'`` the counts are shifted by ``pseudocount`` and a local
    callable (Euclidean distance of ``clr`` transformed vectors) is used as
    the metric; every other metric name is passed through unchanged.
    ``n_jobs`` is forwarded to the pairwise distance function.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        transformed_x, transformed_y = clr(x), clr(y)
        return euclidean(transformed_x, transformed_y)

    wants_aitchison = metric == 'aitchison'
    if wants_aitchison:
        counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return skbio.diversity.beta_diversity(**diversity_kwargs)
Ejemplo n.º 7
0
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse one axis of ``table`` by the values of a metadata column.

    Parameters
    ----------
    table : biom.Table
        Table to group.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is handed to biom unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column giving the group of each ID; first passed through
        ``_munge_metadata_column``.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Collapsed table, ordered by first appearance of each group value in
        the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_of(axis_id, _):
        return metadata.get_value(axis_id)

    collapsed = table.collapse(
        _group_of,
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)

    # Order the axis by the first appearance of each group value in the
    # metadata (stable for identity mappings and easier to test).
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
Ejemplo n.º 8
0
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified, cast to ``int`` and transposed before use.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be a member of ``phylogenetic_metrics()``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix
        Result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    ValueError
        If the metric is unknown, the table is empty, or ``n_jobs != 1`` is
        combined with ``'weighted_unifrac'``.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` in the message replaced by
        ``feature_ids``/``phylogeny``; the original error is attached as the
        cause.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(
            metric=metric,
            counts=counts,
            ids=sample_ids,
            otu_ids=feature_ids,
            tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs
        )
    except skbio.tree.MissingNodeError as e:
        # Reword scikit-bio's message with this function's parameter names.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        # Explicit chaining keeps the original error as the direct cause.
        raise skbio.tree.MissingNodeError(message) from e

    return results
Ejemplo n.º 9
0
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Group the IDs on one axis of ``table`` by a categorical column.

    ``axis`` equal to ``'feature'`` is mapped to biom's ``'observation'``
    axis, other values are used as given. The column is normalised with
    ``_munge_metadata_column`` and each ID is assigned the column value
    returned by ``get_value``; IDs sharing a value are merged with the
    function stored under ``mode`` in ``_mode_lookup``.

    Returns the collapsed table sorted by the first appearance of each group
    value in the metadata.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    if axis != 'feature':
        biom_axis = axis
    else:
        biom_axis = 'observation'

    axis_ids = table.ids(axis=biom_axis)
    metadata = _munge_metadata_column(metadata, axis_ids, axis)

    collapse_kwargs = dict(
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False,
    )
    grouped = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id), **collapse_kwargs)

    # Sorting by first unique appearance keeps identity mappings stable and
    # makes the result easier to test.
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    first_seen = metadata.to_series().unique()
    return grouped.sort_order(first_seen, axis=biom_axis)
Ejemplo n.º 10
0
def alpha(table: biom.Table):
    """Compute every metric in ``ALPHA_DIVERSITY_METHODS`` for ``table``.

    The table is checked for emptiness, converted with ``get_biom_table``
    and densified; each metric's result is named after the metric, the
    results are combined with ``aggregate_results`` and formatted together
    with the sample metadata by ``_format_alpha_results_to_json``.

    :param table: feature table to analyse
    :return: the value returned by ``_format_alpha_results_to_json``
    :raises ValueError: if ``table`` is empty
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    table = get_biom_table(table)
    counts = table.matrix_data.toarray().astype(float).T
    sample_ids = table.ids(axis='sample')
    sample_metadata = dict(zip(table.ids(), table.metadata()))

    per_metric_results = []
    for metric_name in ALPHA_DIVERSITY_METHODS:
        metric_result = alpha_diversity(metric=metric_name, counts=counts,
                                        ids=sample_ids)
        metric_result.name = metric_name
        per_metric_results.append(metric_result)

    aggregated = aggregate_results(per_metric_results, sample_ids)
    return _format_alpha_results_to_json(aggregated, sample_metadata)
Ejemplo n.º 11
0
def rarefy(table: biom.Table, sampling_depth: int) -> biom.Table:
    """Subsample every sample of ``table`` to ``sampling_depth``.

    Delegates to ``table.subsample`` on the ``'sample'`` axis with
    ``by_id=False`` and returns its result.

    Raises
    ------
    ValueError
        If the subsampled table is empty.
    """
    rarefied = table.subsample(sampling_depth, axis='sample', by_id=False)

    if not rarefied.is_empty():
        return rarefied

    raise ValueError('The rarefied table contains no samples or features. '
                     'Verify your table is valid and that you provided a '
                     'shallow enough sampling depth.')
Ejemplo n.º 12
0
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse the features of a BIOM table using a many-to-many mapping.

    Parameters
    ----------
    table : biom.Table
        Table to collapse. It is filtered into a new table first.
    mapping : dict of list of str
        Source feature ID to target ID(s).
    normalize : bool, optional
        If true, collapse in ``'divide'`` mode and round the result with
        ``round_biom``; otherwise collapse in ``'add'`` mode.

    Returns
    -------
    biom.Table
        Collapsed table, or the filtered table unchanged when none of its
        features appear in ``mapping``.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    def _is_mapped(data, id_, md):
        return id_ in mapping

    # Keep only features that have an entry in the mapping.
    table = table.filter(_is_mapped, axis='observation', inplace=False)

    # Nothing left to collapse.
    if table.is_empty():
        return table

    # Attach each source's targets as observation metadata so the collapse
    # callback can reach them.
    targets_by_source = {
        source: {'part': targets} for source, targets in mapping.items()
    }
    table.add_metadata(targets_by_source, axis='observation')

    def _target_pairs(id_, md):
        targets = md['part']
        return zip(targets, targets)

    table = table.collapse(
        _target_pairs,
        norm=False,
        one_to_many=True,
        axis='observation',
        one_to_many_mode='divide' if normalize else 'add')

    # Divided counts are rounded back to integers.
    if normalize:
        round_biom(table)

    # Drop the 'Path' metadata key (presumably added by the collapse --
    # confirm against biom's documentation).
    table.del_metadata(keys=['Path'])
    return table
Ejemplo n.º 13
0
def beta(table: biom.Table, metric: str) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``metric`` must be listed by ``non_phylogenetic_metrics()``. The table is
    densified, cast to ``int`` and transposed before being passed to
    ``skbio.diversity.beta_diversity`` together with the sample IDs.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    supported = non_phylogenetic_metrics()
    if metric not in supported:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    dense_counts = table.matrix_data.toarray().astype(int).T

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=dense_counts,
        ids=table.ids(axis='sample'))
Ejemplo n.º 14
0
def SRScurve(output_dir: str,
             table: biom.Table,
             metric: str = 'richness',
             step: int = 50,
             sample: int = 0,
             max_sample_size: int = 0,
             rarefy_comparison: bool = False,
             rarefy_repeats: int = 10,
             rarefy_comparison_legend: bool = False,
             srs_color: str = 'black',
             rarefy_color: str = 'red',
             srs_linetype: str = 'solid',
             rarefy_linetype: str = 'longdash',
             label: bool = False) -> None:
    """Run ``SRScurve.R`` on a feature table and write an ``index.html``.

    The table is written to a temporary TSV file and the script is invoked
    with that path, followed by every option converted with ``str`` and
    finally ``output_dir``. Afterwards an ``index.html`` that embeds
    ``SRScurve_plot.png`` is written into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving ``index.html``; also passed to the script as its
        last argument.
    table : biom.Table
        Feature table to analyse.
    metric, step, sample, max_sample_size, rarefy_comparison, \
rarefy_repeats, rarefy_comparison_legend, srs_color, rarefy_color, \
srs_linetype, rarefy_linetype, label
        Forwarded to the script, in this order, as strings.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    with tempfile.TemporaryDirectory() as temp_dir_name:
        # The script reads the table from a TSV file.
        input_name = os.path.join(temp_dir_name, 'table.tsv')
        with open(input_name, 'w') as fh:
            fh.write(table.to_tsv())

        script_options = (
            metric, step, sample, max_sample_size,
            rarefy_comparison, rarefy_repeats, rarefy_comparison_legend,
            srs_color, rarefy_color, srs_linetype, rarefy_linetype,
            label, output_dir,
        )
        cmd = ['SRScurve.R', input_name]
        cmd.extend(str(option) for option in script_options)
        run_commands([cmd])

    # NOTE(review): the page references 'SRScurve_plot.png', presumably
    # written into output_dir by the R script -- confirm.
    index = os.path.join(output_dir, 'index.html')
    with open(index, 'w') as fh:
        fh.write(
            '<!DOCTYPE html><head></head><body><img src="SRScurve_plot.png" style="max-width: 100vw;max-height: 100vh;object-fit: contain" /></body></html>'
        )
Ejemplo n.º 15
0
def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Compute one non-phylogenetic alpha diversity metric per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified, cast to ``int`` and transposed before use.
    metric : str
        Must be a member of ``non_phylogenetic_metrics()``.

    Returns
    -------
    pd.Series
        Result of ``skbio.diversity.alpha_diversity``, named after the
        metric.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    supported = non_phylogenetic_metrics()
    if metric not in supported:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    dense_counts = table.matrix_data.toarray().astype(int).T

    diversity = skbio.diversity.alpha_diversity(
        metric=metric,
        counts=dense_counts,
        ids=table.ids(axis='sample'))
    diversity.name = metric
    return diversity
Ejemplo n.º 16
0
def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Return the requested alpha diversity metric for every sample.

    ``metric`` has to be listed by ``non_phylogenetic_metrics()``. The dense,
    integer, transposed counts and the sample IDs are handed to
    ``skbio.diversity.alpha_diversity``; the returned series is renamed to
    the metric.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    if not (metric in non_phylogenetic_metrics()):
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    matrix = table.matrix_data.toarray()
    counts = matrix.astype(int).T
    ids = table.ids(axis='sample')

    per_sample = skbio.diversity.alpha_diversity(
        metric=metric, counts=counts, ids=ids)
    per_sample.name = metric
    return per_sample
Ejemplo n.º 17
0
def beta(table: biom.Table,
         metric: str,
         pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified and transposed before use.
    metric : str
        Must be a member of ``non_phylogenetic_metrics()``. The names
        ``'aitchison'``, ``'canberra_adkins'`` and ``'jensenshannon'`` are
        replaced by callables defined in this function; other names are
        passed to scikit-bio unchanged.
    pseudocount : int, optional
        Added to every count, only when ``metric`` is ``'aitchison'``.
    n_jobs : int, optional
        Forwarded to the pairwise distance function.

    Returns
    -------
    skbio.DistanceMatrix
        Result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty. The Canberra-Adkins
        callable also raises it for negative values.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        has_negative = (x < 0).any() or (y < 0).any()
        if has_negative:
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        # Only positions where at least one vector is positive contribute.
        informative = ((x > 0) | (y > 0))
        xs = x[informative]
        ys = y[informative]
        # NOTE(review): the count below is zero when both vectors are all
        # zero, which makes the division undefined -- confirm callers never
        # pass such pairs.
        n_informative = informative.sum()

        return (1. / n_informative) * np.sum(np.abs(xs - ys) / (xs + ys))

    def jensen_shannon(x, y, **kwds):
        return jensenshannon(x, y)

    local_metrics = {
        'aitchison': aitchison,
        'canberra_adkins': canberra_adkins,
        'jensenshannon': jensen_shannon,
    }

    if metric == 'aitchison':
        counts += pseudocount
    # Names without a local implementation fall through unchanged.
    metric = local_metrics.get(metric, metric)

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
Ejemplo n.º 18
0
def alpha(table: biom.Table) -> AlphaDiversityFormat:
    """Run ``run_new_richness.R`` on a feature table.

    The table is written to a temporary TSV file and the script is called
    with that file and ``str(output)``, where ``output`` is a freshly
    created ``AlphaDiversityFormat``.

    Returns
    -------
    AlphaDiversityFormat
        The format object whose string form was given to the script.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    result = AlphaDiversityFormat()

    with tempfile.TemporaryDirectory() as workdir:
        table_fp = os.path.join(workdir, 'table.tsv')
        with open(table_fp, 'w') as table_fh:
            table_fh.write(table.to_tsv())

        run_commands([['run_new_richness.R', table_fp, str(result)]])

    return result
Ejemplo n.º 19
0
def beta(table: biom.Table, metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``metric`` must be listed by ``non_phylogenetic_metrics()``. The table is
    densified, cast to ``int`` and transposed; ``n_jobs`` is forwarded to
    the pairwise distance function.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    """
    supported = non_phylogenetic_metrics()
    if metric not in supported:
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    dense_counts = table.matrix_data.toarray().astype(int).T
    ids = table.ids(axis='sample')

    distances = skbio.diversity.beta_diversity(
        metric=metric,
        counts=dense_counts,
        ids=ids,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
    return distances
Ejemplo n.º 20
0
    def beta(self,
             table: biom.Table,
             metric: str,
             pseudocount: int = 1,
             n_jobs: int = 1):
        """Compute the beta diversity between the samples of ``table``.

        ``metric`` and ``n_jobs`` are forwarded to ``beta_diversity``
        unchanged; ``pseudocount`` is accepted but not used here.

        Returns the value returned by ``beta_diversity``.

        Raises
        ------
        ValueError
            If ``table`` is empty.
        """
        dense_counts = table.matrix_data.toarray().T

        if table.is_empty():
            raise ValueError("The provided table object is empty")

        return beta_diversity(
            metric=metric,
            counts=dense_counts,
            ids=table.ids(axis='sample'),
            validate=True,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs)
Ejemplo n.º 21
0
def beta(table: biom.Table, metric: str,
         pseudocount: int = 1, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return the pairwise beta diversity between the samples of ``table``.

    ``metric`` has to be listed by ``non_phylogenetic_metrics()``.
    ``'aitchison'`` (with ``pseudocount`` added to the counts) and
    ``'canberra_adkins'`` are computed by callables defined here; any other
    name is passed to scikit-bio unchanged. ``n_jobs`` is forwarded to the
    pairwise distance function.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty. The Canberra-Adkins
        callable also raises it for negative values.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        clr_x = clr(x)
        clr_y = clr(y)
        return euclidean(clr_x, clr_y)

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        # Restrict the sum to positions where either vector is positive.
        keep = ((x > 0) | (y > 0))
        kept_x, kept_y = x[keep], y[keep]
        n_kept = keep.sum()

        return (1. / n_kept) * np.sum(np.abs(kept_x - kept_y) /
                                      (kept_x + kept_y))

    use_aitchison = metric == 'aitchison'
    use_canberra_adkins = metric == 'canberra_adkins'
    if use_aitchison:
        counts += pseudocount
        metric = aitchison
    elif use_canberra_adkins:
        metric = canberra_adkins

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )
Ejemplo n.º 22
0
def group(table: biom.Table, axis: str, metadata: qiime2.MetadataCategory,
          mode: str) -> biom.Table:
    """Collapse one axis of ``table`` by the values of a metadata category.

    Parameters
    ----------
    table : biom.Table
        Table to group.
    axis : str
        ``'feature'`` is translated to biom's ``'observation'``; any other
        value is handed to biom unchanged.
    metadata : qiime2.MetadataCategory
        Category giving the group of each ID; converted to a series by
        ``_munge_metadata_category``.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Collapsed table, ordered by first appearance of each group value.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    groups = _munge_metadata_category(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_of(axis_id, _):
        return groups.loc[axis_id]

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Ordering by first unique appearance keeps identity mappings stable and
    # makes the result easier to test.
    group_order = groups.unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
Ejemplo n.º 23
0
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified, cast to ``int`` and transposed before use.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be a member of ``phylogenetic_metrics()``.

    Returns
    -------
    skbio.DistanceMatrix
        Result of ``skbio.diversity.beta_diversity``.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` in the message replaced by
        ``feature_ids``/``phylogeny``; the original error is attached as the
        cause.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(metric=metric,
                                                 counts=counts,
                                                 ids=sample_ids,
                                                 otu_ids=feature_ids,
                                                 tree=phylogeny)
    except skbio.tree.MissingNodeError as e:
        # Reword scikit-bio's message with this function's parameter names.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        # Explicit chaining keeps the original error as the direct cause.
        raise skbio.tree.MissingNodeError(message) from e

    return results
Ejemplo n.º 24
0
def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    """Compute a phylogenetic alpha diversity metric per sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; densified, cast to ``int`` and transposed before use.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio as ``tree``.
    metric : str
        Must be a member of ``phylogenetic_metrics()``.

    Returns
    -------
    pd.Series
        Result of ``skbio.diversity.alpha_diversity``, named after the
        metric.

    Raises
    ------
    ValueError
        If the metric is unknown or the table is empty.
    skbio.tree.MissingNodeError
        Re-raised with ``otu_ids``/``tree`` in the message replaced by
        ``feature_ids``/``phylogeny``; the original error is attached as the
        cause.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        result = skbio.diversity.alpha_diversity(metric=metric,
                                                 counts=counts,
                                                 ids=sample_ids,
                                                 otu_ids=feature_ids,
                                                 tree=phylogeny)
    except skbio.tree.MissingNodeError as e:
        # Reword scikit-bio's message with this function's parameter names.
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        # Explicit chaining keeps the original error as the direct cause.
        raise skbio.tree.MissingNodeError(message) from e

    result.name = metric
    return result
Ejemplo n.º 25
0
def beta_rarefaction(output_dir: str,
                     table: biom.Table,
                     metric: str,
                     clustering_method: str,
                     metadata: qiime2.Metadata,
                     sampling_depth: int,
                     iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Write a beta rarefaction report into ``output_dir``.

    Distance matrices are obtained from ``_get_multiple_rarefaction`` using
    the ``rarefy`` and ``beta``/``beta_phylogenetic`` actions. From them a
    heatmap (``heatmap.svg``), a correlation table
    (``rarefaction-iteration-correlation.tsv``), a sample clustering tree
    (``sample-clustering-<clustering_method>.tre``) and an Emperor plot
    (``emperor/``) are written, and the HTML templates are rendered.

    Parameters
    ----------
    output_dir : str
        Directory receiving all output files.
    table : biom.Table
        Feature table to rarefy.
    metric : str
        Beta diversity metric; members of ``phylogenetic_metrics()`` require
        ``phylogeny``.
    clustering_method : str
        Passed to ``_cluster_samples`` and used in the tree file name.
    metadata : qiime2.Metadata
        Sample metadata, filtered to the table's sample IDs.
    sampling_depth : int
        Passed to ``_get_multiple_rarefaction``.
    iterations : int, optional
        Passed to ``_get_multiple_rarefaction``.
    phylogeny : skbio.TreeNode, optional
        Tree for phylogenetic metrics.
    correlation_method : str, optional
        Passed to ``_make_heatmap``.
    color_scheme : str, optional
        Passed to ``_make_heatmap``.

    Raises
    ------
    ValueError
        If the table is empty, or a phylogenetic metric is requested without
        a phylogeny.
    """
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Restrict the metadata to the sample IDs found in the feature
        # table; this also ensures every one of those IDs is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]', table)

        if metric not in phylogenetic_metrics():
            beta_func = scope.ctx.get_action('diversity', 'beta')
        else:
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)

            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            phylo_action = scope.ctx.get_action('diversity',
                                                'beta_phylogenetic')
            beta_func = functools.partial(phylo_action, phylogeny=phylogeny)

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        distance_matrices = _get_multiple_rarefaction(beta_func, rare_func,
                                                      metric, iterations,
                                                      table, sampling_depth)

    # The first matrix is the reference, the remaining ones the support.
    primary, support = distance_matrices[0], distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(distance_matrices, metric,
                                               correlation_method,
                                               color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    correlation_fp = os.path.join(output_dir,
                                  'rarefaction-iteration-correlation.tsv')
    similarity_df.to_csv(correlation_fp, sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(output_dir,
                           'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [
        os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
        for page in pages
    ]

    tabs = [
        {'url': 'emperor.html', 'title': 'PCoA'},
        {'url': 'heatmap.html', 'title': 'Heatmap'},
        {'url': 'tree.html', 'title': 'Clustering'},
    ]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }

    q2templates.render(templates, output_dir, context=context)
Ejemplo n.º 26
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction report into ``output_dir``.

    For every metric one CSV file and one or more JSONP files are written,
    then the ``index.html`` template is rendered and the ``dist`` assets are
    copied next to it.

    Parameters
    ----------
    output_dir : str
        Directory receiving all output files.
    table : biom.Table
        Feature table to rarefy.
    max_depth : int
        Largest rarefaction depth; must exceed ``min_depth`` and must not
        exceed the largest per-sample total of the table.
    phylogeny : skbio.TreeNode, optional
        Required when a phylogenetic metric is requested.
    metrics : set, optional
        Metric names. Defaults to ``observed_otus`` and ``shannon``, plus
        ``faith_pd`` when a phylogeny is given.
    metadata : qiime2.Metadata, optional
        Sample metadata; only categorical columns that are not entirely
        missing are kept.
    min_depth : int, optional
        Smallest rarefaction depth.
    steps : int, optional
        Number of steps between the depths; at most
        ``max_depth - min_depth``.
    iterations : int, optional
        Passed to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        If ``metrics`` is an empty set, a phylogenetic metric lacks a
        phylogeny, the depth/step arguments are inconsistent, the table is
        empty, or no metadata column survives the filtering.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        requested_phylo = phylogenetic_metrics() & metrics
        if requested_phylo and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % requested_phylo)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    depth_span = max_depth - min_depth
    if depth_span < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, depth_span))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    largest_total = max(table.sum(axis='sample'))
    if largest_total < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, largest_total))

    has_metadata = metadata is not None
    if not has_metadata:
        columns, filtered_columns = set(), set()
    else:
        # Restrict the metadata to the sample IDs found in the feature
        # table; this also ensures every one of those IDs is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Keep categorical columns only and drop those made up solely of
        # missing values; remember what was dropped for the report.
        columns_before = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = columns_before - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # Two-level column labels, so the frame can be joined with the
        # rarefaction data below.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(name, '') for name in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    rarefaction_data = _compute_rarefaction_data(table, min_depth, max_depth,
                                                 steps, iterations, phylogeny,
                                                 metrics)

    filenames = []
    for metric, metric_df in rarefaction_data.items():
        metric_name = quote(metric)
        csv_name = '%s.csv' % metric_name

        if not has_metadata:
            summary = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)
        else:
            merged = metric_df.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                summary = _compute_summary(reindexed_df, column,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, csv_name), 'w') as fh:
            # Flatten the (depth, iteration) labels into plain column names.
            metric_df.columns = ['depth-%d_iter-%d' % (label[0], label[1])
                                 for label in metric_df.columns.values]
            if has_metadata:
                metric_df = metric_df.join(metadata.to_dataframe(),
                                           how='left')
            metric_df.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    render_context = {
        'metrics': list(metrics),
        'filenames': [quote(name) for name in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns),
    }
    q2templates.render(index, output_dir, context=render_context)

    dist_source = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist')
    shutil.copytree(dist_source, os.path.join(output_dir, 'dist'))
Ejemplo n.º 27
0
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, weighted: bool) -> skbio.DistanceMatrix:
    """Compute a beta-diversity matrix by shelling out to ExpressBetaDiversity.

    The table and tree are written into a temporary directory, the external
    ``ExpressBetaDiversity`` executable is run with that directory as its
    working directory, and the ``output.diss`` file found there afterwards is
    parsed into a symmetric matrix.

    Parameters
    ----------
    table : biom.Table
        Feature table. It is written as tab-separated text with a header row
        of observation IDs followed by one row per sample.
    phylogeny : skbio.TreeNode
        Tree, written next to the table as ``tree.newick``.
    metric : str
        Metric name; must be listed by ``phylogenetic_metrics()`` and be a key
        of the name map below, which translates it to the ``-c`` argument.
    weighted : bool
        When true, ``-w`` is added to the command line.

    Returns
    -------
    skbio.DistanceMatrix
        Distances keyed by the sample IDs read back from ``output.diss``.

    Raises
    ------
    ValueError
        If ``metric`` is unknown or ``table`` is empty.
    KeyError
        If ``metric`` has no entry in the name map.
    subprocess.CalledProcessError
        If ``ExpressBetaDiversity`` exits with a non-zero status.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    # Metric name used by this module -> value passed to the `-c` option.
    name_map = {'braycurtis': 'Bray-Curtis',
                'sorensen': 'Bray-Curtis',
                'canberra': 'Canberra',
                'chi_squared': 'Chi-squared',
                'coeff_similarity': 'CS',
                'complete_tree': 'CT',
                'euclidean': 'Euclidean',
                'f_st': 'Fst',
                'p_st': 'Fst',
                'gower': 'Gower',
                'hellinger': 'Hellinger',
                'kulczynski': 'Kulczynski',
                'lennon': 'Lennon',
                'manhattan': 'Manhattan',
                'weighted_unifrac': 'Manhattan',
                'mnnd': 'MNND',
                'mpd': 'MPD',
                'morisita_horn': 'Morisita-Horn',
                'normalized_weighted_unifrac': 'NWU',
                'pearson': 'Pearson',
                'raohp': 'RaoHp',
                'soergel': 'Soergel',
                'jaccard': 'Soergel',
                'unweighted_unifrac': 'Soergel',
                'ruzicka': 'Soergel',
                'tamas_coeff': 'TC',
                'weighted_corr': 'WC',
                'whittaker': 'Whittaker',
                'yue_clayton': 'Yue-Clayton'
                }

    # Build an argv list rather than a shell string: nothing here needs shell
    # interpretation, and a list avoids any quoting concerns.
    cmd = ['ExpressBetaDiversity', '-t', 'tree.newick', '-s', 'otu_table.tsv']
    if weighted:
        cmd.append('-w')
    cmd.extend(['-c', name_map[metric]])

    with tempfile.TemporaryDirectory() as temp_dir_name:
        table_fp = os.path.join(temp_dir_name, 'otu_table.tsv')
        newick_fp = os.path.join(temp_dir_name, 'tree.newick')
        with open(table_fp, 'w') as out_table, open(newick_fp, 'w') as newick:
            phylogeny.write(newick)
            # Header row: empty corner cell, then the observation IDs.
            out_table.write("\t" + "\t".join(table.ids(axis='observation')))
            # One row per sample; each row is prefixed with a newline so the
            # file does not end with a trailing line break.
            for sample_id in table.ids(axis='sample'):
                row = table.data(sample_id)
                out_table.write("\n" + str(sample_id) + "\t" +
                                "\t".join(str(x) for x in row))

        # check=True surfaces a failed run here instead of as a confusing
        # missing-file error when output.diss is opened below.
        subprocess.run(cmd, cwd=temp_dir_name, check=True)

        diss_fp = os.path.join(temp_dir_name, 'output.diss')
        with open(diss_fp, 'r') as dist_file:
            # First line holds the number of samples.
            nsamples = int(dist_file.readline())
            dist_mat = np.zeros((nsamples, nsamples))
            ids = []
            for line in dist_file:
                if not line.strip():
                    # Ignore blank lines (e.g. a trailing newline) so they do
                    # not turn into empty sample IDs.
                    continue
                fields = line.split("\t")
                i = len(ids)
                ids.append(fields[0].strip())
                # Each row lists distances to earlier samples (presumably a
                # lower-triangular layout); mirror every value so the matrix
                # is symmetric.
                for j, dist in enumerate(fields[1:]):
                    value = float(dist)
                    dist_mat[i, j] = value
                    dist_mat[j, i] = value

    return skbio.DistanceMatrix(dist_mat, ids)
Ejemplo n.º 28
0
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int=10,
                     phylogeny: skbio.TreeNode=None,
                     correlation_method: str='spearman',
                     color_scheme: str='BrBG') -> None:
    """Write a beta-rarefaction report into ``output_dir``.

    Distance matrices are obtained from ``_get_multiple_rarefaction``; the
    first one is treated as the primary matrix and the rest as support. From
    these the function saves a heatmap figure and its correlation table, a
    sample-clustering tree, and an Emperor plot, then renders the HTML
    templates for the report.

    Raises
    ------
    ValueError
        If ``metric`` is a phylogenetic metric and ``phylogeny`` is None, or
        if ``table`` is empty.
    """
    needs_tree = metric in phylogenetic_metrics()
    if needs_tree and phylogeny is None:
        raise ValueError("A phylogenetic metric (%s) was requested, "
                         "but a phylogenetic tree was not provided. "
                         "Phylogeny must be provided when using a "
                         "phylogenetic diversity metric." % metric)
    beta_func = (functools.partial(beta_phylogenetic, phylogeny=phylogeny)
                 if needs_tree else beta)

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Restrict the metadata to the samples in the table; this also ensures
    # that every sample ID of the table is present in the metadata.
    sample_ids = table.ids(axis='sample')
    metadata = metadata.filter_ids(sample_ids)

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary, support = distance_matrices[0], distance_matrices[1:]

    # Heatmap figure plus the table of correlations it is drawn from.
    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fp = os.path.join(output_dir, 'heatmap.svg')
    heatmap_fig.savefig(heatmap_fp)
    correlation_fp = os.path.join(output_dir,
                                  'rarefaction-iteration-correlation.tsv')
    similarity_df.to_csv(correlation_fp, sep='\t')

    # Sample-clustering tree, named after the clustering method.
    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(output_dir,
                           'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    # Emperor plot with its support files in a dedicated sub-directory.
    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [
        os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
        for page in pages
    ]

    tab_specs = (('emperor.html', 'PCoA'),
                 ('heatmap.html', 'Heatmap'),
                 ('tree.html', 'Clustering'))
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': url, 'title': title} for url, title in tab_specs],
    }

    q2templates.render(templates, output_dir, context=context)
Ejemplo n.º 29
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:
    """Write an alpha-rarefaction report into ``output_dir``.

    For every metric returned by ``_compute_rarefaction_data`` a CSV file is
    written, and ``_alpha_rarefaction_jsonp`` is called once per metric (no
    metadata) or once per metric and metadata column (with metadata). Finally
    the index template is rendered and the ``dist`` assets are copied.

    Raises
    ------
    ValueError
        If ``metrics`` is an empty set, a phylogenetic metric is requested
        without ``phylogeny``, the depth/step arguments are inconsistent, the
        table is empty, ``max_depth`` exceeds the largest sample total, or
        table samples are missing from ``metadata``.
    """
    if metrics is None:
        # Default metrics; 'faith_pd' is only added when a phylogeny is given.
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    else:
        if not metrics:
            raise ValueError('`metrics` was given an empty set.')
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    # Validate the depth range and the requested number of steps.
    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))

    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_depth > max_frequency:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    # Every sample of the table must be described by the metadata.
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    rarefaction = _compute_rarefaction_data(table, min_depth, max_depth,
                                            steps, iterations, phylogeny,
                                            metrics)
    for metric, metric_df in rarefaction.items():
        metric_name = quote(metric)
        csv_name = '%s.csv' % metric_name

        if metadata is not None:
            sample_md = metadata.to_dataframe()
            sample_md = sample_md.loc[metric_df.index]

            # Remember which columns are dropped for being entirely missing.
            all_columns = sample_md.columns
            sample_md.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(sample_md.columns)

            # Give the metadata two-level column labels before joining it
            # with the rarefaction data.
            sample_md.columns = pd.MultiIndex.from_tuples(
                [(name, '') for name in sample_md.columns])
            merged = metric_df.join(sample_md, how='left')
            categories = sample_md.columns.get_level_values(0)

            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(
                    category, categories, merged)
                summary = _compute_summary(reindexed_df, category,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, category_name)
                filenames.append(jsonp_filename)
        else:
            summary = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, csv_name), 'w') as fh:
            # Flatten the (depth, iteration) column keys into plain labels.
            metric_df.columns = ['depth-%d_iter-%d' % (key[0], key[1])
                                 for key in metric_df.columns.values]
            if metadata is not None:
                metric_df = metric_df.join(metadata.to_dataframe(),
                                           how='left')
            metric_df.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    context = {'metrics': list(metrics),
               'filenames': filenames,
               'categories': list(categories),
               'empty_columns': sorted(empty_columns)}
    q2templates.render(index, output_dir, context=context)

    dist_src = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist')
    shutil.copytree(dist_src, os.path.join(output_dir, 'dist'))
Ejemplo n.º 30
0
def alpha_rarefaction(output_dir: str,
                      table: biom.Table,
                      max_depth: int,
                      phylogeny: skbio.TreeNode = None,
                      metrics: set = None,
                      metadata: qiime2.Metadata = None,
                      min_depth: int = 1,
                      steps: int = 10,
                      iterations: int = 10) -> None:
    """Write an alpha-rarefaction report into ``output_dir``.

    After validating the arguments, the metadata (if any) is restricted to
    the table's samples and to categorical columns that are not entirely
    missing. For every metric returned by ``_compute_rarefaction_data`` a CSV
    file is written and ``_alpha_rarefaction_jsonp`` is called once (no
    metadata) or once per retained metadata column. Finally the index
    template is rendered and the ``dist`` assets are copied.

    Raises
    ------
    ValueError
        If ``metrics`` is an empty set, a phylogenetic metric is requested
        without ``phylogeny``, the depth/step arguments are inconsistent, the
        table is empty, or ``max_depth`` exceeds the largest sample total.
    """
    if metrics is None:
        # Default metrics; 'faith_pd' is only added when a phylogeny is given.
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    else:
        if not metrics:
            raise ValueError('`metrics` was given an empty set.')
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylogeny is None and phylo_overlap:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    # Validate the depth range and the requested number of steps.
    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if steps > possible_steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))

    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_depth > max_frequency:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is not None:
        # Keep only the metadata rows for samples in the feature table; this
        # also ensures every table sample ID is present in the metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Keep only categorical columns that are not entirely missing, and
        # record which columns were removed by that filter.
        original_columns = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = original_columns - set(metadata.columns)

        # Give the metadata two-level column labels before it is joined with
        # the rarefaction data.
        metadata_df = metadata.to_dataframe()
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(name, '') for name in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)
    else:
        columns, filtered_columns = set(), set()

    rarefaction = _compute_rarefaction_data(table, min_depth, max_depth,
                                            steps, iterations, phylogeny,
                                            metrics)

    filenames = []
    for metric, metric_df in rarefaction.items():
        metric_name = quote(metric)
        csv_name = '%s.csv' % metric_name

        if metadata is not None:
            merged = metric_df.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                summary = _compute_summary(reindexed_df, column,
                                           counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, summary, column)
                filenames.append(jsonp_filename)
        else:
            summary = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     summary, '')
            filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, csv_name), 'w') as fh:
            # Flatten the (depth, iteration) column keys into plain labels.
            metric_df.columns = ['depth-%d_iter-%d' % (key[0], key[1])
                                 for key in metric_df.columns.values]
            if metadata is not None:
                metric_df = metric_df.join(metadata.to_dataframe(),
                                           how='left')
            metric_df.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    context = {
        'metrics': list(metrics),
        'filenames': [quote(name) for name in filenames],
        'columns': list(columns),
        'steps': steps,
        'filtered_columns': sorted(filtered_columns),
    }
    q2templates.render(index, output_dir, context=context)

    dist_src = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist')
    shutil.copytree(dist_src, os.path.join(output_dir, 'dist'))
Ejemplo n.º 31
0
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether divide per-target counts by number of targets per source.
    field : int, optional
        Index of field to be collapsed in a stratified table, i.e. one whose
        feature IDs consist of "|"-separated fields. Index 0 (the first
        field) is valid; only None means the whole feature ID is collapsed.

    Returns
    -------
    biom.Table
        Collapsed BIOM table. If no feature ID is found in the mapping, the
        filtered (empty) table is returned without collapsing.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # field index 0 as "no field" and skip the stratified handling.
    stratified = field is not None

    # generate metadata: feature ID -> list of target IDs
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if stratified:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if stratified:
                # substitute the collapsed field and rebuild the full ID
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features, keeping only those that have targets
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']), **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up the 'Path' metadata entry (presumably attached by collapse)
    table.del_metadata(keys=['Path'])
    return table