Example #1
def tabulate(output_dir: str,
             input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render fewer than one record per page.')

    df = input.to_dataframe()
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')
    # Escape single quotes as \u0027 so the JSON can be embedded safely in a
    # single-quoted context in the template; JSON itself allows unescaped
    # single quotes inside its double-quoted strings.
    table = table.replace("'", r'\u0027')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'table': table,
                           'page_size': page_size
                       })

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
Example #2
def visualize_stats(output_dir: str, filter_stats: pd.DataFrame) -> None:
    sums = filter_stats.sum()
    sums.name = 'Totals'
    # DataFrame.append was removed in pandas 2.0; concatenate instead
    filter_stats = pd.concat([filter_stats, sums.to_frame().T])

    filter_stats.sort_values('total-input-reads',
                             inplace=True,
                             ascending=False)

    total_retained = filter_stats['total-retained-reads']
    total_input = filter_stats['total-input-reads']
    filter_stats['fraction-retained'] = total_retained / total_input

    # reorder such that retained fraction follows total-input-reads and
    # total-retained-reads
    columns = list(filter_stats.columns)[:-1]
    columns.insert(2, 'fraction-retained')
    filter_stats = filter_stats[columns]

    html = filter_stats.to_html(classes='table table-striped table-hover')
    html = html.replace('border="1"', 'border="0"')
    index = os.path.join(TEMPLATES, 'index.html')
    context = {'result': html}

    q2templates.render(index, output_dir, context=context)
Example #3
def tabulate(output_dir: str, data: pd.Series) -> None:
    prepped = []
    # Series.iteritems was removed in pandas 2.0; use items() instead
    for _id, taxa in data.items():
        prepped.append({'id': _id, 'taxa': taxa})

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir, context={'data': prepped})
Example #4
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than or '
                         'equal to max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    if min_fraction == max_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    rounded_fractions = _round_fractions(fractions)

    data = []
    file_links = []
    for fraction, rounded_fraction in zip(fractions, rounded_fractions):
        core_features = _get_core_features(table, fraction)
        core_feature_count = len(core_features)
        data.append([fraction, core_feature_count])

        if core_feature_count > 0:
            core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
            core_feature_fp = os.path.join(output_dir, core_feature_fn)

            file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)

            core_features.to_csv(core_feature_fp, sep='\t',
                                 index_label='Feature ID')
        else:
            file_links.append('No core features')

    df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = file_links

    # newer versions of seaborn don't like dataframes with fewer than two rows
    if len(fractions) > 1:
        ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                         fit_reg=False)

        # matplotlib will issue a UserWarning if attempting to set left and
        # right bounds to the same value.
        ax.set_xbound(min(fractions), max(fractions))
        ax.set_ybound(0, max(df['Feature count']) + 1)

        ax.get_figure().savefig(
            os.path.join(output_dir, 'core-feature-counts.svg'))
        context['show_plot'] = True

    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)

    q2templates.render(index_fp, output_dir, context=context)
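The _get_core_features helper is defined elsewhere in the plugin; a minimal sketch, assuming a feature counts as "core" when it is observed (count > 0) in at least `fraction` of the samples and that a DataFrame indexed by feature ID is returned:

def _get_core_features(table, fraction):
    # hypothetical sketch, not the plugin's actual helper: compute the
    # fraction of samples in which each feature is observed, then keep the
    # features at or above the requested fraction
    presence = (table.matrix_data.toarray() > 0).sum(axis=1) / table.shape[1]
    keep = presence >= fraction
    ids = table.ids(axis='observation')[keep]
    return pd.DataFrame({'Fraction of samples': presence[keep]}, index=ids)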
Example #5
def _visualize_anova(output_dir,
                     pairwise_tests=False,
                     model_results=False,
                     residuals=False,
                     pairwise_test_name='Pairwise t-tests'):
    # use None (not the deprecated -1) to disable column-width truncation
    pd.set_option('display.max_colwidth', None)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                         sep='\t')
    model_results = q2templates.df_to_html(model_results)

    residuals.savefig(os.path.join(output_dir, 'residuals.png'),
                      bbox_inches='tight')
    residuals.savefig(os.path.join(output_dir, 'residuals.pdf'),
                      bbox_inches='tight')
    plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'plot_name': 'ANOVA',
                           'model_results': model_results,
                           'pairwise_tests': pairwise_tests,
                           'residuals': residuals,
                           'pairwise_test_name': pairwise_test_name,
                       })
Example #6
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None,
                  feature_metadata: qiime2.Metadata = None):

    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example #7
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
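Note: errors='ignore' is deprecated for pd.to_numeric in recent pandas. A column-by-column conversion with the same intent (keep a column unchanged when it cannot be parsed as numeric) could look like:

def _to_numeric_where_possible(df):
    # try to convert each column; on failure, keep the original values
    out = df.copy()
    for col in out.columns:
        try:
            out[col] = pd.to_numeric(out[col])
        except (ValueError, TypeError):
            pass
    return out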
Example #8
def heatmap(output_dir: str,
            ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average',
            metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix',
            x_labels: bool = False,
            y_labels: bool = False,
            level: int = -1) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()
    if metabolite_metadata is not None:
        metabolite_metadata = metabolite_metadata.to_series()

    hotmap = ranks_heatmap(ranks, microbe_metadata, metabolite_metadata,
                           method, metric, color_palette, margin_palette,
                           x_labels, y_labels, level)

    hotmap.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmap.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': 'Rank Heatmap',
                           'pdf_fp': 'heatmap.pdf',
                           'png_fp': 'heatmap.png'
                       })
Example #9
def tabulate(output_dir: str,
             input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render fewer than one record per page.')

    df = input.to_dataframe()
    df_columns = pd.MultiIndex.from_tuples([(n, t.type)
                                            for n, t in input.columns.items()],
                                           names=['column header', 'type'])
    df.columns = df_columns
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')
    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'table': table,
                           'page_size': page_size
                       })

    input.save(os.path.join(output_dir, 'metadata.tsv'))

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
Example #10
def _visualize(output_dir, estimator, cm, importances=None,
               optimize_feature_selection=True, title='results'):

    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(
            output_dir, 'predictive_accuracy.tsv'), sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(
            output_dir, 'feature_importance.tsv'), sep='\t', index=True)
        importances = q2templates.df_to_html(importances, index=True)
    else:
        importances = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'importances': importances,
        'classification': True,
        'optimize_feature_selection': optimize_feature_selection,
        'maturity_index': False})
Example #11
def summarize(
    output_dir: str,
    problem: zarr.hierarchy.Group,
    taxa: skbio.TreeNode = None,
    maxplot: int = 200,
    predictions: zarr.hierarchy.Group = None,
):
    context = build_context(output_dir, problem, predictions, taxa, maxplot)

    index = os.path.join(assets, "index.html")
    overview_template = os.path.join(assets, "overview.html")
    path_template = os.path.join(assets, "path.html")
    cv_template = os.path.join(assets, "cv.html")
    stabsel_template = os.path.join(assets, "stabsel.html")
    lam_fixed_template = os.path.join(assets, "lam-fixed.html")

    templates = [
        index,
        overview_template,
        path_template,
        cv_template,
        stabsel_template,
        lam_fixed_template,
    ]

    q2templates.render(templates, output_dir, context=context)
Example #12
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
Example #13
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    seq_lengths = []
    with open(os.path.join(output_dir, 'sequences.fasta'), 'w') as fh:
        for sequence in data:
            skbio.io.write(sequence, format='fasta', into=fh)
            str_seq = str(sequence)
            seq_len = len(str_seq)
            sequences.append({
                'id': sequence.metadata['id'],
                'len': seq_len,
                'url': _blast_url_template % str_seq,
                'seq': str_seq
            })
            seq_lengths.append(seq_len)
    seq_len_stats = _compute_descriptive_stats(seq_lengths)
    _write_tsvs_of_descriptive_stats(seq_len_stats, output_dir)

    index = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'data': sequences,
                           'stats': seq_len_stats
                       })

    js = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'js',
                      'tsorter.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))
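_compute_descriptive_stats is not shown above; a minimal sketch, assuming it returns a dict of summary statistics over the sequence lengths (the key names here are assumptions, not the plugin's actual API):

def _compute_descriptive_stats(lengths):
    # hypothetical sketch: basic descriptive statistics for sequence lengths
    a = np.asarray(lengths)
    return {'count': int(a.size),
            'min': int(a.min()),
            'max': int(a.max()),
            'mean': float(a.mean()),
            'range': int(a.max() - a.min()),
            'std': float(a.std(ddof=1)) if a.size > 1 else 0.0}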
Example #14
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None):

    mf = metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example #15
def _visualize(output_dir, estimator, cm, roc,
               optimize_feature_selection=True, title='results'):

    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(
            output_dir, 'predictive_accuracy.tsv'), sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if roc is not None:
        roc = True

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'roc': roc,
        'optimize_feature_selection': optimize_feature_selection})
Example #16
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata) -> None:
    metadata = metadata.to_dataframe()
    filenames = []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Join collapsed table with metadata
        taxa_cols = df.columns.values.tolist()
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move SampleID index into columns
        df = df.fillna('')  # JS sort works best with empty strings vs null
        all_cols = df.columns.values.tolist()

        filename = 'lvl-%d.jsonp' % level
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('Level %d'," % level)
            json.dump(taxa_cols, fh)
            fh.write(",")
            json.dump(all_cols, fh)
            fh.write(",")
            df.to_json(fh, orient='records')
            fh.write(");")

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'filenames': filenames})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #17
def heatmap(output_dir, table: pd.DataFrame,
            sample_metadata: qiime2.CategoricalMetadataColumn = None,
            feature_metadata: qiime2.CategoricalMetadataColumn = None,
            normalize: bool = True, title: str = None,
            metric: str = 'euclidean', method: str = 'average',
            cluster: str = 'both', color_scheme: str = 'rocket') -> None:
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    if sample_metadata is not None:
        table = _munge_sample_metadata(sample_metadata, table, cluster)

    # relabel feature table feature IDs with feature metadata column values
    if feature_metadata is not None:
        table = _munge_feature_metadata(feature_metadata, table, cluster)

    cbar_label = 'frequency'
    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'

    # Hard-coded values for reasonable plots
    scaletron, labelsize, dpi = 50, 8, 100
    sns.set(rc={'xtick.labelsize': labelsize, 'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    width, height = table.shape[1] / scaletron, table.shape[0] / scaletron

    heatmap_plot = sns.clustermap(table, method=method, metric=metric,
                                  **_clustering_map[cluster],
                                  cmap=color_scheme,
                                  xticklabels=True, yticklabels=True,
                                  cbar_kws={'label': cbar_label})
    if title is not None:
        heatmap_plot.fig.suptitle(title)

    hm = heatmap_plot.ax_heatmap.get_position()
    cbar = heatmap_plot.cax.get_position()
    row = heatmap_plot.ax_row_dendrogram.get_position()
    col = heatmap_plot.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    heatmap_plot.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    heatmap_plot.cax.set_position([cbar.x0, hm.y0 + height, cbar.width,
                                   cbar.height])
    heatmap_plot.ax_row_dendrogram.set_position([row.x0, row.y0, row.width,
                                                 height])
    heatmap_plot.ax_col_dendrogram.set_position([col.x0, hm.y0 + height, width,
                                                 col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(heatmap_plot.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(heatmap_plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ['png', 'svg']:
        img_fp = os.path.join(output_dir, 'feature-table-heatmap.%s' % ext)
        heatmap_plot.savefig(img_fp)

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
Example #18
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               errors=False, model_summary=False, model_results=False,
               raw_data=False, plot_name='Pairwise difference boxplot',
               pairwise_test_name='Pairwise group comparison tests'):

    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = q2templates.df_to_html(summary.to_frame())

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = q2templates.df_to_html(multiple_group_test)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    if raw_data is not False:
        raw_data.to_csv(os.path.join(output_dir, 'raw-data.tsv'), sep='\t')
        raw_data = True

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(os.path.join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = q2templates.df_to_html(
            paired_difference_tests)

    if model_summary is not False:
        model_summary.to_csv(os.path.join(output_dir, 'model_summary.tsv'),
                             sep='\t')
        model_summary = q2templates.df_to_html(model_summary)

    if model_results is not False:
        model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                             sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if plot is not False:
        plot.savefig(os.path.join(output_dir, 'plot.png'), bbox_inches='tight')
        plot.savefig(os.path.join(output_dir, 'plot.pdf'), bbox_inches='tight')
        plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
        'plot_name': plot_name,
        'raw_data': raw_data,
        'pairwise_test_name': pairwise_test_name,
    })
Example #19
def _visualize_maturity_index(table, metadata, group_by, column,
                              predicted_column, importances, estimator,
                              accuracy, output_dir, maz_stats=True):

    pd.set_option('display.max_colwidth', None)

    maturity = '{0} maturity'.format(column)
    maz = '{0} MAZ score'.format(column)

    # save feature importance data and convert to html
    importances = sort_importances(importances)
    importances.to_csv(
        join(output_dir, 'feature_importance.tsv'), index=True, sep='\t')
    importance = q2templates.df_to_html(importances, index=True)

    # save predicted values, maturity, and MAZ score data
    maz_md = metadata[[group_by, column, predicted_column, maturity, maz]]
    maz_md.to_csv(join(output_dir, 'maz_scores.tsv'), sep='\t')
    if maz_stats:
        maz_aov = _two_way_anova(table, metadata, maz, group_by, column)[0]
        maz_aov.to_csv(join(output_dir, 'maz_aov.tsv'), sep='\t')
        maz_pairwise = _pairwise_stats(
            table, metadata, maz, group_by, column)
        maz_pairwise.to_csv(join(output_dir, 'maz_pairwise.tsv'), sep='\t')

    # plot control/treatment predicted vs. actual values
    g = _lmplot_from_dataframe(
        metadata, column, predicted_column, group_by)
    g.savefig(join(output_dir, 'maz_predictions.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_predictions.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot barplots of MAZ score vs. column (e.g., age)
    g = _boxplot_from_dataframe(metadata, column, maz, group_by)
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.png'), bbox_inches='tight')
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot heatmap of column (e.g., age) vs. abundance of top features
    top = table[list(importances.index)]
    g = _clustermap_from_dataframe(top, metadata, group_by, column)
    g.savefig(join(output_dir, 'maz_heatmaps.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_heatmaps.pdf'), bbox_inches='tight')

    result = _extract_estimator_parameters(estimator)
    # Series.append was removed in pandas 2.0, and its return value was being
    # discarded here; concatenate and keep the result
    result = pd.concat([result,
                        pd.Series([accuracy], index=['Accuracy score'])])
    result = q2templates.df_to_html(result.to_frame())

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'maturity index predictions',
        'result': result,
        'predictions': None,
        'importances': importance,
        'classification': False,
        'optimize_feature_selection': True,
        'maturity_index': True})
Example #20
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    for sequence in data:
        str_seq = str(sequence)
        sequences.append({'id': sequence.metadata['id'],
                          'url': _blast_url_template % str_seq,
                          'seq': str_seq})

    index = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'index.html')
    q2templates.render(index, output_dir, context={'data': sequences})
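_blast_url_template is a module-level constant that these examples interpolate the sequence into; a hypothetical definition (the real URL and query parameters may differ) would be:

# hypothetical definition: point the NCBI BLAST web form at the sequence
_blast_url_template = ('https://blast.ncbi.nlm.nih.gov/Blast.cgi?'
                       'PROGRAM=blastn&PAGE_TYPE=BlastSearch&QUERY=%s')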
Example #21
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than or '
                         'equal to max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    if min_fraction == max_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    rounded_fractions = _round_fractions(fractions)

    data = []
    file_links = []
    for fraction, rounded_fraction in zip(fractions, rounded_fractions):
        core_features = _get_core_features(table, fraction)
        core_feature_count = len(core_features)
        data.append([fraction, core_feature_count])

        if core_feature_count > 0:
            core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
            core_feature_fp = os.path.join(output_dir, core_feature_fn)

            file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)

            core_features.to_csv(core_feature_fp, sep='\t',
                                 index_label='Feature ID')
        else:
            file_links.append('No core features')

    df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = file_links

    ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
                     fit_reg=False)

    # matplotlib will issue a UserWarning if attempting to set left and right
    # bounds to the same value.
    if min_fraction != max_fraction:
        ax.set_xbound(min(fractions), max(fractions))
    ax.set_ybound(0, max(df['Feature count']) + 1)

    ax.get_figure().savefig(
        os.path.join(output_dir, 'core-feature-counts.svg'))

    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)

    q2templates.render(index_fp, output_dir, context=context)
Example #22
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               model_summary=False, model_results=False):

    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = summary.to_frame().to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = multiple_group_test.to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if pairwise_tests is not False:
        pairwise_tests.to_csv(join(output_dir, 'pairwise_tests.tsv'), sep='\t')
        pairwise_tests = pairwise_tests.to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = paired_difference_tests.to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if model_summary is not False:
        model_summary.to_csv(join(output_dir, 'model_summary.tsv'), sep='\t')
        model_summary = model_summary.to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if model_results is not False:
        model_results.to_csv(join(output_dir, 'model_results.tsv'), sep='\t')
        model_results = model_results.to_html(classes=(
            "table table-striped table-hover")).replace(
                'border="1"', 'border="0"')

    if plot is not False:
        plot.savefig(join(output_dir, 'plot.png'), bbox_inches='tight')
        plot.savefig(join(output_dir, 'plot.pdf'), bbox_inches='tight')
        plt.close('all')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
    })
Example #23
def _visualize_knn(output_dir, params: pd.Series):
    result = q2templates.df_to_html(params.to_frame())
    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Estimator Summary',
        'result': result,
        'predictions': None,
        'importances': None,
        'classification': True,
        'optimize_feature_selection': False})
Example #24
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html',
                  'title': 'PCoA'},
                 {'url': 'heatmap.html',
                  'title': 'Heatmap'},
                 {'url': 'tree.html',
                  'title': 'Clustering'}]
    }

    q2templates.render(templates, output_dir, context=context)
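_get_multiple_rarefaction is not shown; a minimal sketch, assuming each iteration rarefies the table to sampling_depth and computes one distance matrix per subsample (biom's Table.subsample is used here as an assumption about how the rarefaction is performed):

def _get_multiple_rarefaction(beta_func, metric, iterations, table,
                              sampling_depth):
    # hypothetical sketch: one distance matrix per independent rarefaction
    return [beta_func(table=table.subsample(sampling_depth, axis='sample'),
                      metric=metric)
            for _ in range(iterations)]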
Example #25
def heatmap(output_dir, table: pd.DataFrame,
            metadata: qiime2.CategoricalMetadataColumn = None,
            normalize: bool = True, title: str = None,
            metric: str = 'euclidean', method: str = 'average',
            cluster: str = 'both', color_scheme: str = 'rocket') -> None:
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    if metadata is not None:
        table = _munge_metadata(metadata, table, cluster)

    cbar_label = 'frequency'
    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'

    # Hard-coded values for reasonable plots
    scaletron, labelsize, dpi = 50, 8, 100
    sns.set(rc={'xtick.labelsize': labelsize, 'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    width, height = table.shape[1] / scaletron, table.shape[0] / scaletron

    heatmap_plot = sns.clustermap(table, method=method, metric=metric,
                                  **_clustering_map[cluster],
                                  cmap=color_scheme,
                                  xticklabels=True, yticklabels=True,
                                  cbar_kws={'label': cbar_label})
    if title is not None:
        heatmap_plot.fig.suptitle(title)

    hm = heatmap_plot.ax_heatmap.get_position()
    cbar = heatmap_plot.cax.get_position()
    row = heatmap_plot.ax_row_dendrogram.get_position()
    col = heatmap_plot.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    heatmap_plot.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    heatmap_plot.cax.set_position([cbar.x0, hm.y0 + height, cbar.width,
                                   cbar.height])
    heatmap_plot.ax_row_dendrogram.set_position([row.x0, row.y0, row.width,
                                                 height])
    heatmap_plot.ax_col_dendrogram.set_position([col.x0, hm.y0 + height, width,
                                                 col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(heatmap_plot.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(heatmap_plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ['png', 'svg']:
        img_fp = os.path.join(output_dir, 'feature-table-heatmap.%s' % ext)
        heatmap_plot.savefig(img_fp)

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
Example #26
def save_animated_map(output_dir, lat_min, lat_max, data, column):
    # save fig, which is really a legend
    plt.savefig(join(output_dir, 'colorbar.png'), bbox_inches='tight')
    # copy all js/css utilities
    in_path = partial(join, TEMPLATES, 'animated_map')
    copytree(in_path('static'),
             join(output_dir, 'static'))
    # save template
    q2templates.render(in_path('index.html'), output_dir, context={
        'lat_min': lat_min, 'lat_max': lat_max, 'data': data,
        'column': column})
Example #27
def mapviz(output_dir, results=None, title='Coordinates'):
    if results is not None:
        results.to_csv(join(
            output_dir, 'results.tsv'), sep='\t', index=True)
        results = q2templates.df_to_html(results)
    else:
        results = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'results': results,
        'title': title})
Example #28
def summarize(output_dir: str, problem: zarr.hierarchy.Group):
    beta = pd.DataFrame(data={'label': problem['label'],
                              'beta': problem['solution/LAMfixed/refit']})
    beta.to_csv(os.path.join(output_dir, 'beta.csv'), header=True,
                index=False)

    show_plot = False
    if show_plot:
        x = np.linspace(0, 1)
        y = x ** 2
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ax.plot(x, y, color='blue')
        fig.savefig(os.path.join(output_dir, 'test-plot.png'))

    html = q2templates.df_to_html(beta, index=False)
    context = {
        'dico': {
            'un': 1, 'deux': 2
        },
        'result': html,
        'n_features': len(beta),
        'beta': beta,
        'show_plot': show_plot,
        'tabs': [{'title': 'Overview',
                  'url': 'overview.html'},
                 {'title': 'LAM fixed',
                  'url': 'lam-fixed.html'}],
        'dangers': [],
        'warnings': [],
    }

    index = os.path.join(TEMPLATES, 'assets', 'index.html')
    overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
    quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html')
    templates = [index, overview_template, quality_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write('app.init(')
        json.dump({'selected param': 10}, fh)
        fh.write(',')
        beta.to_json(fh)
        fh.write(');')
Example #29
def seq_depth(output_dir: str,
              table: pd.DataFrame,
              metadata: qiime2.Metadata,
              mypar: float = 4) -> None:

    table_path = os.path.join(output_dir, 'table.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')

    table.to_csv(table_path)
    metadata.save(metadata_path)

    cmd_path = os.path.join(TEMPLATES, 'seq_depth.R')

    # arguments are already strings; pass them straight through to Rscript
    cmd = ['Rscript', cmd_path, output_dir, table_path, metadata_path]
    subprocess.run(cmd, check=True)
    index = os.path.join(TEMPLATES, 'index.html')

    # load any warnings emitted by the R script as a list of lines
    errors_fp = os.path.join(output_dir, 'warnings.txt')
    with open(errors_fp, 'r') as errors_f:
        errors = list(errors_f)

    # Load in depths as a pandas data frame, then transfer to html
    depths = pd.read_csv(os.path.join(output_dir, 'mytable.tsv'), sep="\t")
    depths = q2templates.df_to_html(depths)

    # Load in plot
    plot_fp = os.path.join(output_dir, 'myplot.png')

    q2templates.render(index,
                       output_dir,
                       context={
                           'errors': errors,
                           'summary': None,
                           'model_summary': None,
                           'model_results': depths,
                           'multiple_group_test': None,
                           'pairwise_tests': None,
                           'paired_difference_tests': None,
                           'plot': True,
                           'plot_name': "My Plot",
                           'raw_data': None,
                           'pairwise_test_name': None,
                       })
Example #30
def report(output_dir: str, pcoa: skbio.OrdinationResults, metadata: Metadata,
           alpha: pd.Series, table: biom.Table, taxonomy: pd.Series,
           samples: list) -> None:
    metadata = metadata.to_dataframe()

    _insanity_checker(samples, metadata, table, alpha, pcoa)

    index = os.path.join(TEMPLATES, 'report', 'index.html')
    q2templates.render(index, output_dir, context={'name': 'foo'})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'report', 'resources'),
                    os.path.join(output_dir, 'resources'))
Example #31
def paired_heatmap(output_dir: str,
                   ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table,
                   features: str = None,
                   top_k_microbes: int = 2,
                   keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10',
                   color_palette: str = 'magma',
                   top_k_metabolites: int = 50,
                   level: int = -1,
                   row_center: bool = True) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()

    ranks = ranks.T

    if row_center:
        ranks = ranks - ranks.mean(axis=0)

    select_microbes, select_metabolites, hotmaps = paired_heatmaps(
        ranks, microbes_table, metabolites_table, microbe_metadata, features,
        top_k_microbes, top_k_metabolites, keep_top_samples, level, normalize,
        color_palette)

    hotmaps.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmaps.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')
    select_microbes.to_csv(join(output_dir, 'select_microbes.tsv'), sep='\t')
    select_metabolites.to_csv(join(output_dir, 'select_metabolites.tsv'),
                              sep='\t')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': 'Paired Feature Abundance Heatmaps',
                           'pdf_fp': 'heatmap.pdf',
                           'png_fp': 'heatmap.png',
                           'table1_fp': 'select_microbes.tsv',
                           'download1_text':
                               'Download microbe abundances as TSV',
                           'table2_fp': 'select_metabolites.tsv',
                           'download2_text':
                               'Download top k metabolite abundances as TSV'
                       })
Example #32
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length': initial_dm_length,
                           'filtered_dm_length': filtered_dm_length,
                           'non_numeric_cols':
                               ', '.join(sorted(non_numeric_cols)),
                           'zero_variance_cols':
                               ', '.join(sorted(zero_variance_cols)),
                           'result': result
                       })
Example #33
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata = None) -> None:

    if metadata is None:
        metadata = Metadata(pd.DataFrame({'id': table.index}).set_index('id'))

    ids_not_in_metadata = set(table.index) - set(metadata.ids)
    if ids_not_in_metadata:
        raise ValueError('Sample IDs found in the table are missing in the '
                         f'metadata: {ids_not_in_metadata!r}.')

    metadata = metadata.to_dataframe()
    jsonp_files, csv_files = [], []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Stash column labels before manipulating dataframe
        taxa_cols = df.columns.values.tolist()
        # Join collapsed table with metadata
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move index into columns
        # Our JS sort works best with empty strings vs nulls
        df = df.fillna('')
        all_cols = df.columns.values.tolist()

        jsonp_file = 'level-%d.jsonp' % level
        csv_file = 'level-%d.csv' % level

        jsonp_files.append(jsonp_file)
        csv_files.append(csv_file)

        df.to_csv(os.path.join(output_dir, csv_file), index=False)

        with open(os.path.join(output_dir, jsonp_file), 'w') as fh:
            fh.write('load_data(%d,' % level)
            json.dump(taxa_cols, fh)
            fh.write(',')
            json.dump(all_cols, fh)
            fh.write(',')
            df.to_json(fh, orient='records')
            fh.write(');')

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'jsonp_files': jsonp_files})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #34
def summarize(output_dir: str, problem: zarr.hierarchy.Group):

    context = build_context(output_dir, problem)

    index = os.path.join(assets, 'index.html')
    overview_template = os.path.join(assets, 'overview.html')
    path_template = os.path.join(assets, 'path.html')
    cv_template = os.path.join(assets, 'cv.html')
    stabsel_template = os.path.join(assets, 'stabsel.html')
    lam_fixed_template = os.path.join(assets, 'lam-fixed.html')
    templates = [
        index, overview_template, path_template, cv_template, stabsel_template,
        lam_fixed_template
    ]
    q2templates.render(templates, output_dir, context=context)
Example #35
def kNN_LOOCV_F_measures(output_dir: str,
                         nearest_neighbors: dict, class_weight: DataFrame):
    y = nearest_neighbors['taxonomies']
    indices = nearest_neighbors['neighbors']
    weights = class_weight.T['Weight'].to_dict()
    uniform = _loocv(y, indices, weights, True)
    bespoke = _loocv(y, indices, weights)
    index = os.path.join(TEMPLATES, 'index.html')
    f_measures = DataFrame({'F-measure': [bespoke, uniform,
                                          bespoke - uniform]},
                           index=['Weighted', 'Uniform', 'Difference'])
    f_measures = q2templates.df_to_html(f_measures)
    q2templates.render(index, output_dir, context={
        'title': 'Indicators of Taxonomic Weight Importance',
        'f_measures': f_measures,
    })
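_loocv is not shown; a minimal sketch, assuming y holds taxonomy labels, indices holds each sample's nearest-neighbor row indices (with the sample itself first, so it is skipped), and the reported F-measure is micro-averaged (which, for single-label predictions, equals accuracy):

def _loocv(y, indices, weights, uniform=False):
    # hypothetical sketch: leave-one-out prediction from (optionally
    # class-weighted) neighbor votes
    correct = 0
    for i, neighbors in enumerate(indices):
        votes = {}
        for j in neighbors[1:]:
            w = 1.0 if uniform else weights.get(y[j], 1.0)
            votes[y[j]] = votes.get(y[j], 0.0) + w
        if max(votes, key=votes.get) == y[i]:
            correct += 1
    # micro-averaged F-measure equals accuracy for single-label predictions
    return correct / len(y)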
Example #36
def save_viz(viz, output_dir):
    """Saves an Empress visualization to a filepath.

    Parameters
    ----------
    viz : empress.Empress
    output_dir : str
    """
    with open(os.path.join(output_dir, 'empress.html'), 'w') as htmlfile:
        htmlfile.write(str(viz))

    viz.copy_support_files(output_dir)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
Example #37
def tabulate_seqs(output_dir: str, data: DNAIterator) -> None:
    sequences = []
    with open(os.path.join(output_dir, 'sequences.fasta'), 'w') as fh:
        for sequence in data:
            skbio.io.write(sequence, format='fasta', into=fh)
            str_seq = str(sequence)
            sequences.append({'id': sequence.metadata['id'],
                              'url': _blast_url_template % str_seq,
                              'seq': str_seq})

    index = os.path.join(TEMPLATES, 'tabulate_seqs_assets', 'index.html')
    q2templates.render(index, output_dir, context={'data': sequences})

    js = os.path.join(
        TEMPLATES, 'tabulate_seqs_assets', 'js', 'tsorter.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))
Example #38
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str = None) -> None:

    mf = metadata.to_dataframe()
    viz = Emperor(pcoa, mf, remote='.')

    if custom_axis is not None:
        # put custom_axis inside a list to work around the type system not
        # supporting lists of types
        html = viz.make_emperor(standalone=True, custom_axes=[custom_axis])
    else:
        html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
Example #39
def summarize(output_dir: str, table: biom.Table) -> None:
    number_of_samples = table.shape[1]
    number_of_features = table.shape[0]

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True)
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=True)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = _format_html_table(
        sample_summary.to_frame('Frequency'))
    feature_summary_table = _format_html_table(
        feature_summary.to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    sample_frequencies.sort_values(inplace=True)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))

    sample_frequencies_table = _format_html_table(
        sample_frequencies.to_frame('Frequency'))
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')

    context.update({'sample_frequencies_table': sample_frequencies_table})
    templates = [index, sample_frequency_template]
    q2templates.render(templates, output_dir, context=context)
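Note: sns.distplot is deprecated and has been removed from recent seaborn releases; on a current seaborn, a rough equivalent of the calls above (counts, no KDE, with a rug) is:

ax = sns.histplot(sample_frequencies)
sns.rugplot(sample_frequencies, ax=ax)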
Example #40
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
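The two-pass column filtering above exists only so the visualizer can report *why* each column was dropped. A toy illustration of the same `qiime2.Metadata` calls (the data here is made up):

import pandas as pd
import qiime2

df = pd.DataFrame({'ph': [6.8, 7.2, 7.0], 'site': ['A', 'B', 'A']},
                  index=pd.Index(['s1', 's2', 's3'], name='id'))
md = qiime2.Metadata(df)

numeric = md.filter_columns(column_type='numeric')
print(set(md.columns) - set(numeric.columns))            # {'site'}

informative = numeric.filter_columns(drop_zero_variance=True,
                                     drop_all_missing=True)
print(set(numeric.columns) - set(informative.columns))   # set(): 'ph' varies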
Example #41
0
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate that the metadata IDs are a superset of the distance matrix
    # IDs.
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # Filter and reorder the metadata to match the distance matrix ID order.
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
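`_run_command` is a module-level helper that shells out to the R script. A plausible minimal implementation (the real one may echo more detail) that fails loudly when the script exits non-zero:

import subprocess


def _run_command(cmd, verbose=True):
    if verbose:
        print('Running external command line application. This may print '
              'messages to stdout and/or stderr.')
        print('Command: %s' % ' '.join(cmd))
    # check=True raises CalledProcessError on a non-zero exit status.
    subprocess.run(cmd, check=True)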
Example #42
0
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
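To make the Freedman–Diaconis bin-count arithmetic near the top of this function concrete, here is a short worked example with made-up numbers:

# 100 samples, 1st quartile = 1000, 3rd quartile = 5000:
number_of_samples = 100
IQR = 5000.0 - 1000.0                                    # 4000.0
bin_width = (2 * IQR) / (number_of_samples ** (1 / 3))   # ~1723.55
# With minimum/maximum frequencies of 500 and 20000:
bins = max((20000 - 500) / bin_width, 5)                 # ~11.31
print(int(round(bins)))                                  # 11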
Example #43
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #44
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
              {'type': 'symbol',
               'from': {'data': 'values'},
               'encode': {
                   'hover': {
                       'fill': {'value': '#FF0000'},
                       'opacity': {'value': 1}},
                   'enter': {
                       'x': {'scale': 'xScale',
                             'field': transform_function_name},
                       'y': {'scale': 'yScale', 'field': 'W'}},
                   'update': {
                       'fill': {'value': 'black'},
                       'opacity': {'value': 0.3},
                       'tooltip': {
                           'signal': "{{'title': datum['index'], '{0}': "
                                     "datum['{0}'], 'W': datum['W']}}".format(
                                         transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
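`_transform_functions` and `_difference_functions` are module-level lookup tables. A hypothetical sketch of what they could contain, consistent with how `diff_func` unpacks tuple results (the real tables may differ):

import numpy as np
from scipy.stats import f_oneway

_transform_functions = {
    'sqrt': np.sqrt,
    'log': np.log,
    # clr: log of each value minus the mean log, i.e. log(x / gmean(x)).
    'clr': lambda x: np.log(x) - np.log(x).mean(),
}

_difference_functions = {
    # Two groups: simple difference of group means.
    'mean_difference': lambda x, y: np.mean(x) - np.mean(y),
    # Any number of groups: f_oneway returns (statistic, p-value), and
    # diff_func above keeps only element 0 of the tuple.
    'f_statistic': f_oneway,
}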
Example #45
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter out any samples that are not in the distance
    # matrix, and drop samples that have no data for this metadata
    # category, including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
Example #46
0
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Original error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), n, permutations, alt_hypothesis,
                        metadata.name, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis', 'Metadata category',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append((distance_matrix[id1, id2],
                             metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    fig = sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False).get_figure()
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
Example #47
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
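The row-repartition loop above shrinks `row_count` until it evenly divides the number of groups, then chunks the group ids into rows of that width. Pulled out as a standalone function, it behaves like this:

def repartition(group_ids, start=3):
    # Shrink row_count until it evenly divides the group count, then
    # chunk the group ids into rows of that width.
    row_count, group_count = start, len(group_ids)
    while group_count % row_count != 0:
        row_count -= 1
    rows = [group_ids[g:g + row_count]
            for g in range(0, group_count, row_count)]
    return rows, int(12 / row_count)


print(repartition(['a', 'b', 'c', 'd', 'e', 'f']))
# ([['a', 'b', 'c'], ['d', 'e', 'f']], 4)
print(repartition(['a', 'b', 'c', 'd', 'e']))
# 5 is not divisible by 3 or 2, so one group per row, full-width columns:
# ([['a'], ['b'], ['c'], ['d'], ['e']], 12)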
Example #48
0
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s'
            % ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
            dm1, dm2, method=method, permutations=permutations,
            alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), sample_size, permutations,
                       alt_hypothesis, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = []
    for id1, id2 in itertools.combinations(dm1.ids, 2):
        scatter_data.append((dm1[id1, id2], dm2[id1, id2]))

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(
        TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
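A toy illustration of the ID-matching logic at the start of this function, using two small `skbio.DistanceMatrix` objects with one mismatched ID each:

import skbio

dm1 = skbio.DistanceMatrix([[0.0, 0.5, 0.25],
                            [0.5, 0.0, 0.75],
                            [0.25, 0.75, 0.0]], ids=['s1', 's2', 's3'])
dm2 = skbio.DistanceMatrix([[0.0, 0.4, 0.3],
                            [0.4, 0.0, 0.8],
                            [0.3, 0.8, 0.0]], ids=['s1', 's2', 's4'])

print(set(dm1.ids) ^ set(dm2.ids))    # {'s3', 's4'} -> error unless
                                      # intersect_ids=True
matched = set(dm1.ids) & set(dm2.ids)
dm1, dm2 = dm1.filter(matched), dm2.filter(matched)
# Both matrices now contain only the matched ids, s1 and s2.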
Example #49
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
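The Benjamini/Hochberg correction used for the pairwise q-values above is a one-liner with statsmodels; a quick standalone check:

from statsmodels.stats.multitest import multipletests

pvals = [0.001, 0.02, 0.04, 0.30]
reject, qvals, _, _ = multipletests(pvals, method='fdr_bh')
# Each p-value is scaled by (m / rank), then monotonicity is enforced:
print([round(q, 4) for q in qvals])   # [0.004, 0.04, 0.0533, 0.3]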
Example #50
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #51
0
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #52
0
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns

    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example #53
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups and
            # correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)

                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                              "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories': ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))