Ejemplo n.º 1
0
def _visualize(output_dir, estimator, cm, importances=None,
               optimize_feature_selection=True, title='results'):
    """Save result tables under ``output_dir`` and render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files and the rendered template.
    estimator : object or None
        Handed to ``_extract_estimator_parameters``; when None the
        'result' context entry is False.
    cm : DataFrame-like or None
        Written to ``predictive_accuracy.tsv`` and rendered as the
        'predictions' table; passed through as None when absent.
    importances : DataFrame-like, optional
        Sorted with ``sort_importances``, written to
        ``feature_importance.tsv`` and rendered; False in the context
        when absent.
    optimize_feature_selection : bool
        Forwarded unchanged to the template context.
    title : str
        Forwarded unchanged to the template context.
    """
    # None disables column-width truncation. The historical sentinel -1 was
    # deprecated in pandas 1.0 and is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    if estimator is not None:
        result = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(result.to_frame())
    else:
        result = False

    if cm is not None:
        cm.to_csv(join(
            output_dir, 'predictive_accuracy.tsv'), sep='\t', index=True)
        cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        # Scientific notation for the HTML rendering of the importances.
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(
            output_dir, 'feature_importance.tsv'), sep='\t', index=True)
        importances = q2templates.df_to_html(importances, index=True)
    else:
        importances = False

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': title,
        'result': result,
        'predictions': cm,
        'importances': importances,
        'classification': True,
        'optimize_feature_selection': optimize_feature_selection,
        'maturity_index': False})
Ejemplo n.º 2
0
def _visualize_anova(output_dir,
                     pairwise_tests=False,
                     model_results=False,
                     residuals=False,
                     pairwise_test_name='Pairwise t-tests'):
    """Save ANOVA outputs under ``output_dir`` and render the index template.

    Each optional input uses ``False`` as its "not provided" sentinel and is
    skipped (and passed to the template as ``False``) when left at the
    default.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and rendered template.
    pairwise_tests : DataFrame-like or False
        Written to ``pairwise_tests.tsv`` and rendered as HTML.
    model_results : DataFrame-like or False
        Written to ``model_results.tsv`` and rendered as HTML.
    residuals : object with ``savefig`` or False
        Saved as ``residuals.png`` and ``residuals.pdf``; the object itself
        is passed to the template context.
    pairwise_test_name : str
        Forwarded unchanged to the template context.
    """
    # None disables column-width truncation; -1 is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    # Guarded like pairwise_tests: the default False has no to_csv/savefig,
    # so dereferencing it unconditionally raised AttributeError.
    if model_results is not False:
        model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                             sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if residuals is not False:
        residuals.savefig(os.path.join(output_dir, 'residuals.png'),
                          bbox_inches='tight')
        residuals.savefig(os.path.join(output_dir, 'residuals.pdf'),
                          bbox_inches='tight')
    plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'plot_name': 'ANOVA',
                           'model_results': model_results,
                           'pairwise_tests': pairwise_tests,
                           'residuals': residuals,
                           'pairwise_test_name': pairwise_test_name,
                       })
Ejemplo n.º 3
0
def _visualize(output_dir, estimator, cm, roc,
               optimize_feature_selection=True, title='results'):
    """Render the index template with estimator, accuracy and ROC info.

    ``estimator`` and ``cm`` may each be None. ``roc`` reaches the template
    only as a flag: True when a value was supplied, None otherwise.
    """
    pd.set_option('display.max_colwidth', None)

    # Estimator parameter table; False signals "nothing to show".
    result = False
    if estimator is not None:
        params = _extract_estimator_parameters(estimator)
        result = q2templates.df_to_html(params.to_frame())

    # Persist the accuracy table before converting it to HTML.
    predictions = cm
    if cm is not None:
        accuracy_fp = join(output_dir, 'predictive_accuracy.tsv')
        cm.to_csv(accuracy_fp, sep='\t', index=True)
        predictions = q2templates.df_to_html(cm)

    roc_flag = None if roc is None else True

    template_fp = join(TEMPLATES, 'index.html')
    context = {
        'title': title,
        'result': result,
        'predictions': predictions,
        'roc': roc_flag,
        'optimize_feature_selection': optimize_feature_selection,
    }
    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 4
0
def _visualize_maturity_index(table, metadata, group_by, column,
                              predicted_column, importances, estimator,
                              accuracy, output_dir, maz_stats=True):
    """Write maturity-index tables and plots, then render the index template.

    Parameters
    ----------
    table : DataFrame-like
        Indexed by column with the importances' index to pick the features
        for the heatmap; also passed to the statistics helpers.
    metadata : DataFrame-like
        Must contain ``group_by``, ``column``, ``predicted_column`` and the
        derived '<column> maturity' and '<column> MAZ score' columns.
    group_by, column, predicted_column : str
        Metadata column names.
    importances : DataFrame-like
        Sorted with ``sort_importances`` and saved/rendered.
    estimator : object
        Handed to ``_extract_estimator_parameters``.
    accuracy : object
        Appended to the parameter table under the label 'Accuracy score'.
    output_dir : str
        Directory that receives every output file.
    maz_stats : bool
        When true, also write ``maz_aov.tsv`` and ``maz_pairwise.tsv``.
    """
    # None disables column-width truncation; -1 is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    maturity = '{0} maturity'.format(column)
    maz = '{0} MAZ score'.format(column)

    # save feature importance data and convert to html
    importances = sort_importances(importances)
    importances.to_csv(
        join(output_dir, 'feature_importance.tsv'), index=True, sep='\t')
    importance = q2templates.df_to_html(importances, index=True)

    # save predicted values, maturity, and MAZ score data
    maz_md = metadata[[group_by, column, predicted_column, maturity, maz]]
    maz_md.to_csv(join(output_dir, 'maz_scores.tsv'), sep='\t')
    if maz_stats:
        maz_aov = _two_way_anova(table, metadata, maz, group_by, column)[0]
        maz_aov.to_csv(join(output_dir, 'maz_aov.tsv'), sep='\t')
        maz_pairwise = _pairwise_stats(
            table, metadata, maz, group_by, column)
        maz_pairwise.to_csv(join(output_dir, 'maz_pairwise.tsv'), sep='\t')

    # plot control/treatment predicted vs. actual values
    g = _lmplot_from_dataframe(
        metadata, column, predicted_column, group_by)
    g.savefig(join(output_dir, 'maz_predictions.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_predictions.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot barplots of MAZ score vs. column (e.g., age)
    g = _boxplot_from_dataframe(metadata, column, maz, group_by)
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.png'), bbox_inches='tight')
    g.get_figure().savefig(
        join(output_dir, 'maz_boxplots.pdf'), bbox_inches='tight')
    plt.close('all')

    # plot heatmap of column (e.g., age) vs. abundance of top features
    top = table[list(importances.index)]
    g = _clustermap_from_dataframe(top, metadata, group_by, column)
    g.savefig(join(output_dir, 'maz_heatmaps.png'), bbox_inches='tight')
    g.savefig(join(output_dir, 'maz_heatmaps.pdf'), bbox_inches='tight')
    # Release the heatmap figure as is done after the other plots.
    plt.close('all')

    result = _extract_estimator_parameters(estimator)
    # Series.append returned a new object (the old call discarded it, so the
    # accuracy never reached the report) and no longer exists in pandas 2.x.
    # Reusing the name keeps the column header stable after concatenation.
    accuracy_row = pd.Series(
        [accuracy], index=['Accuracy score'], name=result.name)
    result = pd.concat([result, accuracy_row])
    result = q2templates.df_to_html(result.to_frame())

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'maturity index predictions',
        'result': result,
        'predictions': None,
        'importances': importance,
        'classification': False,
        'optimize_feature_selection': True,
        'maturity_index': True})
Ejemplo n.º 5
0
    def test_defaults(self):
        # With no overrides the HTML carries a zero border and the default
        # table CSS classes.
        frame = pd.DataFrame(
            {'col1': ['foo', 'bar', 'baz'], 'col2': [1, 2, 4.2]})

        html = df_to_html(frame)

        for expected in ('border="0"', 'table table-striped table-hover'):
            self.assertIn(expected, html)
Ejemplo n.º 6
0
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
                  max_fraction: float = 1.0, steps: int = 11) -> None:
    """Tabulate core-feature counts over a range of sample fractions.

    For every fraction a TSV of the core features is written (when any
    exist), a summary table is rendered to HTML, and a scatter plot is saved
    when more than one fraction was evaluated.

    Raises
    ------
    ValueError
        If ``max_fraction`` is smaller than ``min_fraction``.
    """
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than '
                         'max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    template_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    # A degenerate range collapses to a single fraction.
    fractions = ([min_fraction] if min_fraction == max_fraction
                 else np.linspace(min_fraction, max_fraction, steps))

    rows = []
    links = []
    for frac, frac_label in zip(fractions, _round_fractions(fractions)):
        features = _get_core_features(table, frac)
        n_core = len(features)
        rows.append([frac, n_core])

        if n_core == 0:
            links.append('No core features')
            continue

        tsv_name = 'core-features-%s.tsv' % frac_label
        links.append("<a href='./%s'>TSV</a>" % tsv_name)
        features.to_csv(os.path.join(output_dir, tsv_name), sep='\t',
                        index_label='Feature ID')

    df = pd.DataFrame(rows, columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = links

    # newer versions of seaborn don't like dataframes with fewer than two rows
    if len(fractions) > 1:
        ax = sns.regplot(x='Fraction of samples', y='Feature count',
                         data=df, fit_reg=False)

        # matplotlib will issue a UserWarning if attempting to set left and
        # right bounds to the same value.
        ax.set_xbound(min(fractions), max(fractions))
        ax.set_ybound(0, max(df['Feature count']) + 1)

        plot_fp = os.path.join(output_dir, 'core-feature-counts.svg')
        ax.get_figure().savefig(plot_fp)
        context['show_plot'] = True

    # The link column holds raw anchor tags, so escaping is turned off.
    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)

    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 7
0
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run ``skbio.stats.distance.bioenv`` on the numeric metadata columns.

    Columns that cannot be read as numbers and numeric columns with zero
    variance are dropped (and reported to the template); rows with missing
    values are dropped and the distance matrix is filtered to the surviving
    IDs.

    Parameters
    ----------
    output_dir : str
        Directory that receives the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances to be filtered and analysed.
    metadata : qiime2.Metadata
        Sample metadata, converted with ``to_dataframe``.
    """
    def _to_numeric_if_possible(column):
        # Same outcome as the deprecated errors='ignore': a column that
        # cannot be parsed is returned untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(_to_numeric_if_possible)

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    # Sets have no stable iteration order; sort so the report is reproducible.
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols':
            ', '.join(sorted(filtered_categorical_cols)),
        'filtered_zero_variance_cols':
            ', '.join(sorted(filtered_zero_variance_cols)),
        'result': result})
Ejemplo n.º 8
0
def core_features(output_dir, table: biom.Table, min_fraction: float=0.5,
                  max_fraction: float=1.0, steps: int=11) -> None:
    """Tabulate and plot core-feature counts over a range of fractions.

    A TSV is written for each fraction that yields at least one core
    feature, a scatter plot of the counts is always saved, and the summary
    table is rendered to HTML.

    Raises
    ------
    ValueError
        If ``max_fraction`` is smaller than ``min_fraction``.
    """
    if max_fraction < min_fraction:
        raise ValueError('min_fraction (%r) parameter must be less than '
                         'max_fraction (%r) parameter.' %
                         (min_fraction, max_fraction))

    template_fp = os.path.join(TEMPLATES, 'index.html')
    context = {
        'num_samples': table.shape[1],
        'num_features': table.shape[0]
    }

    single_fraction = min_fraction == max_fraction
    if single_fraction:
        fractions = [min_fraction]
    else:
        fractions = np.linspace(min_fraction, max_fraction, steps)

    counts = []
    link_cells = []
    for frac, label in zip(fractions, _round_fractions(fractions)):
        found = _get_core_features(table, frac)
        n_found = len(found)
        counts.append([frac, n_found])

        if n_found == 0:
            link_cells.append('No core features')
            continue

        fn = 'core-features-%s.tsv' % label
        link_cells.append("<a href='./%s'>TSV</a>" % fn)
        found.to_csv(os.path.join(output_dir, fn), sep='\t',
                     index_label='Feature ID')

    df = pd.DataFrame(counts,
                      columns=['Fraction of samples', 'Feature count'])
    df['Fraction of features'] = df['Feature count'] / table.shape[0]
    df['Feature list'] = link_cells

    ax = sns.regplot(x='Fraction of samples', y='Feature count', data=df,
                     fit_reg=False)

    # matplotlib will issue a UserWarning if attempting to set left and right
    # bounds to the same value.
    if not single_fraction:
        ax.set_xbound(min(fractions), max(fractions))
    ax.set_ybound(0, max(df['Feature count']) + 1)

    svg_fp = os.path.join(output_dir, 'core-feature-counts.svg')
    ax.get_figure().savefig(svg_fp)

    # The link column holds raw anchor tags, so escaping is turned off.
    context['table_html'] = q2templates.df_to_html(df, index=False,
                                                   escape=False)

    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 9
0
def _visualize(output_dir,
               estimator,
               cm,
               accuracy,
               importances=None,
               optimize_feature_selection=True,
               title='results'):
    """Save result tables under ``output_dir`` and render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files and rendered template.
    estimator : object with ``get_params``
        Its parameters are rendered as the 'result' table.
    cm : DataFrame-like
        Written to ``predictive_accuracy.tsv`` and rendered as 'predictions'.
    accuracy : object
        Accepted for interface compatibility; not used by this function.
    importances : DataFrame-like, optional
        Sorted with ``sort_importances``, written to
        ``feature_importance.tsv`` (without index) and rendered; passed
        through as None when absent.
    optimize_feature_selection : bool
        Forwarded unchanged to the template context.
    title : str
        Forwarded unchanged to the template context.
    """
    # Need to sort out how to save estimator as sklearn.pipeline
    # This will be possible once qiime2 support pipeline actions

    # None disables column-width truncation; -1 is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    # summarize model accuracy and params
    result = pd.Series(estimator.get_params(), name='Parameter setting')
    result = q2templates.df_to_html(result.to_frame())

    cm.to_csv(join(output_dir, 'predictive_accuracy.tsv'),
              sep='\t',
              index=True)
    cm = q2templates.df_to_html(cm)

    if importances is not None:
        importances = sort_importances(importances)
        # Scientific notation for the HTML rendering of the importances.
        pd.set_option('display.float_format', '{:.3e}'.format)
        importances.to_csv(join(output_dir, 'feature_importance.tsv'),
                           sep='\t',
                           index=False)
        importances = q2templates.df_to_html(importances, index=False)

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': title,
                           'result': result,
                           'predictions': cm,
                           'importances': importances,
                           'classification': True,
                           'optimize_feature_selection':
                           optimize_feature_selection,
                           'maturity_index': False
                       })
Ejemplo n.º 10
0
def _visualize_knn(output_dir, params: pd.Series):
    """Render the index template showing only the estimator parameters."""
    context = {
        'title': 'Estimator Summary',
        'result': q2templates.df_to_html(params.to_frame()),
        'predictions': None,
        'importances': None,
        'classification': True,
        'optimize_feature_selection': False,
    }
    template_fp = join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 11
0
    def test_no_truncation(self):
        # A 300-character cell must appear in full in the rendered HTML.
        long_cell = 'baz' * 100
        frame = pd.DataFrame({'col': ['foo', 'bar', long_cell]})

        html = df_to_html(frame)

        for expected in ('col', 'foo', 'bar', long_cell):
            self.assertIn(expected, html)
Ejemplo n.º 12
0
def _get_html(output_dir, datafiles):
    """Render every TSV named in ``datafiles`` to an HTML table.

    ``datafiles`` maps direction -> stats type -> filename (relative to
    ``output_dir``); the returned dict has the same two-level shape with
    HTML strings in place of filenames.
    """
    def _tsv_to_html(filename):
        frame = pd.read_csv(os.path.join(output_dir, filename), sep='\t')
        return q2templates.df_to_html(frame, index=False)

    return {
        direction: {
            stats_type: _tsv_to_html(filename)
            for stats_type, filename in by_stats_type.items()
        }
        for direction, by_stats_type in datafiles.items()
    }
Ejemplo n.º 13
0
def summarize_Qiita_metadata_category_and_contexts(
        output_dir: str = None, category: str = 'sample_type'):
    """Render the Qiita summary tables for ``category`` into ``output_dir``.

    The per-category counts and the cache listing returned by
    ``_fetch_Qiita_summaries`` are each converted to HTML and handed to the
    index template.
    """
    counts, caches = _fetch_Qiita_summaries(category=category)

    # Flatten the counts into a two-column table: category value, count.
    counts_frame = counts.to_frame()
    counts_table = DataFrame(
        {category: counts_frame.index,
         'count': counts_frame.values.T[0]},
        columns=[category, 'count'])

    context = {
        'title': 'Available in Qiita',
        'sample_types': q2templates.df_to_html(
            counts_table, bold_rows=False, index=False),
        'contexts': q2templates.df_to_html(caches, index=False),
    }
    template_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 14
0
def mapviz(output_dir, results=None, title='Coordinates'):
    """Save ``results`` as TSV (when given) and render the index template.

    Without results the template receives ``False`` for 'results'.
    """
    rendered = False
    if results is not None:
        tsv_fp = join(output_dir, 'results.tsv')
        results.to_csv(tsv_fp, sep='\t', index=True)
        rendered = q2templates.df_to_html(results)

    context = {'results': rendered, 'title': title}
    q2templates.render(join(TEMPLATES, 'index.html'), output_dir,
                       context=context)
Ejemplo n.º 15
0
def _build_seq_len_table(qscores: pd.DataFrame) -> str:
    """Return an HTML table of sequence-length statistics.

    Lengths are taken as the number of non-null entries in each row of
    ``qscores``; every statistic except the count is shown as '<n> nts'.
    """
    lengths = qscores.notnull().sum(axis=1).copy()
    stats = _compute_stats_of_df(lengths)

    # Everything but the count is a length: truncate to int and add units.
    is_length = stats.index != 'count'
    stats[is_length] = stats[is_length].astype(int).apply('{} nts'.format)

    display_labels = {'50%': '50% (Median)',
                      'count': 'Total Sequences Sampled'}
    stats.rename(index=display_labels, inplace=True)

    return q2templates.df_to_html(stats.to_frame(name=""))
Ejemplo n.º 16
0
def seq_depth(output_dir: str,
              table: pd.DataFrame,
              metadata: qiime2.Metadata,
              mypar: float = 4) -> None:
    """Run the bundled ``seq_depth.R`` script and render its outputs.

    The table and metadata are written into ``output_dir``, the R script is
    invoked with the output directory and both file paths, and the files it
    leaves behind (``warnings.txt`` and ``mytable.tsv``) are loaded into the
    template context.

    Parameters
    ----------
    output_dir : str
        Directory used for the inputs, the script outputs and the report.
    table : pd.DataFrame
        Written with ``to_csv`` defaults to ``table.tsv``.
    metadata : qiime2.Metadata
        Saved to ``metadata.tsv``.
    mypar : float
        Accepted for interface compatibility; not used by this function.

    Raises
    ------
    subprocess.CalledProcessError
        If the R script exits with a non-zero status.
    """
    table_path = os.path.join(output_dir, 'table.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')

    table.to_csv(table_path)
    metadata.save(metadata_path)

    cmd_path = os.path.join(TEMPLATES, 'seq_depth.R')

    # Argument list (no shell), so paths containing spaces are passed intact.
    cmd = ['Rscript', cmd_path, str(output_dir), str(table_path),
           str(metadata_path)]
    subprocess.run(cmd, check=True)
    index = os.path.join(TEMPLATES, 'index.html')

    # Errors filepath, load in as list (one entry per line, newline kept)
    errors_fp = os.path.join(output_dir, 'warnings.txt')
    with open(errors_fp, 'r') as errors_f:
        errors = list(errors_f)

    # Load in depths as a pandas data frame, then transfer to html
    depths = pd.read_csv(os.path.join(output_dir, 'mytable.tsv'), sep="\t")
    depths = q2templates.df_to_html(depths)

    q2templates.render(index,
                       output_dir,
                       context={
                           'errors': errors,
                           'summary': None,
                           'model_summary': None,
                           'model_results': depths,
                           'multiple_group_test': None,
                           'pairwise_tests': None,
                           'paired_difference_tests': None,
                           'plot': True,
                           'plot_name': "My Plot",
                           'raw_data': None,
                           'pairwise_test_name': None,
                       })
Ejemplo n.º 17
0
def summarize(output_dir: str, problem : zarr.hierarchy.Group):
    """Export the fixed-lambda refit solution and render the report pages.

    Writes ``beta.csv``, renders the index/overview/quality templates,
    copies the bundled ``dist`` assets and emits ``data.jsonp`` wrapping the
    data in an ``app.init(...)`` call.
    """
    print(TEMPLATES)

    beta = pd.DataFrame(data={
        'label': problem['label'],
        'beta': problem['solution/LAMfixed/refit'],
    })
    beta.to_csv(os.path.join(output_dir, 'beta.csv'),
                header=True, index=False)

    # Placeholder plot, currently switched off.
    show_plot = False
    if show_plot:
        xs = np.linspace(0, 1)
        fig = plt.figure()
        axes = fig.add_subplot(1, 1, 1)
        axes.plot(xs, xs**2, color='blue')
        fig.savefig(os.path.join(output_dir, 'test-plot.png'))

    context = {
        'dico': {
            'un': 1, 'deux': 2
        },
        'result': q2templates.df_to_html(beta, index=False),
        'n_features': len(beta),
        'beta': beta,
        'show_plot': show_plot,
        'tabs': [{'title': 'Overview',
                  'url': 'overview.html'},
                 {'title': 'LAM fixed',
                  'url': 'lam-fixed.html'}],
        'dangers': [],
        'warnings': [],
    }

    assets_dir = os.path.join(TEMPLATES, 'assets')
    templates = [
        os.path.join(assets_dir, page)
        for page in ('index.html', 'overview.html', 'quality-plot.html')
    ]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(assets_dir, 'dist'),
                    os.path.join(output_dir, 'dist'))

    # JSONP payload: app.init(<params>,<beta as JSON>);
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        fh.write(json.dumps({'selected param': 10}))
        fh.write(',')
        beta.to_json(fh)
        fh.write(');')
0
def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
               paired_difference_tests=False, plot=False, summary=False,
               errors=False, model_summary=False, model_results=False,
               raw_data=False, plot_name='Pairwise difference boxplot',
               pairwise_test_name='Pairwise group comparison tests'):
    """Save whichever results were supplied and render the index template.

    Every optional input uses ``False`` as its "not provided" sentinel; a
    skipped input is passed to the template as ``False``.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and rendered template.
    multiple_group_test : Series-like or False
        Converted to a one-row frame and rendered as HTML.
    pairwise_tests, paired_difference_tests, model_summary, model_results :
            DataFrame-like or False
        Each is written to a TSV named after the parameter and rendered.
    plot : object with ``savefig`` or False
        Saved as ``plot.png`` and ``plot.pdf``; the object itself is passed
        to the template context.
    summary : Series-like or False
        Rendered as HTML (not written to disk).
    errors : object
        Forwarded unchanged to the template context.
    raw_data : DataFrame-like or False
        Written to ``raw-data.tsv``; the template only receives True.
    plot_name, pairwise_test_name : str
        Forwarded unchanged to the template context.
    """
    # None disables column-width truncation; -1 is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    if summary is not False:
        summary = q2templates.df_to_html(summary.to_frame())

    if multiple_group_test is not False:
        multiple_group_test = multiple_group_test.to_frame().transpose()
        multiple_group_test = q2templates.df_to_html(multiple_group_test)

    if pairwise_tests is not False:
        pairwise_tests.to_csv(os.path.join(output_dir, 'pairwise_tests.tsv'),
                              sep='\t')
        pairwise_tests = q2templates.df_to_html(pairwise_tests)

    if raw_data is not False:
        raw_data.to_csv(os.path.join(output_dir, 'raw-data.tsv'), sep='\t')
        # Only a flag is handed to the template; the data lives in the TSV.
        raw_data = True

    if paired_difference_tests is not False:
        paired_difference_tests.to_csv(os.path.join(
            output_dir, 'paired_difference_tests.tsv'), sep='\t')
        paired_difference_tests = q2templates.df_to_html(
            paired_difference_tests)

    if model_summary is not False:
        model_summary.to_csv(os.path.join(output_dir, 'model_summary.tsv'),
                             sep='\t')
        model_summary = q2templates.df_to_html(model_summary)

    if model_results is not False:
        model_results.to_csv(os.path.join(output_dir, 'model_results.tsv'),
                             sep='\t')
        model_results = q2templates.df_to_html(model_results)

    if plot is not False:
        plot.savefig(os.path.join(output_dir, 'plot.png'), bbox_inches='tight')
        plot.savefig(os.path.join(output_dir, 'plot.pdf'), bbox_inches='tight')
        plt.close('all')

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'errors': errors,
        'summary': summary,
        'model_summary': model_summary,
        'model_results': model_results,
        'multiple_group_test': multiple_group_test,
        'pairwise_tests': pairwise_tests,
        'paired_difference_tests': paired_difference_tests,
        'plot': plot,
        'plot_name': plot_name,
        'raw_data': raw_data,
        'pairwise_test_name': pairwise_test_name,
    })
Ejemplo n.º 19
0
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run ``skbio.stats.distance.bioenv`` on the numeric metadata columns.

    Non-numeric columns, samples with missing values and zero-variance or
    all-missing columns are removed (in that order); the removed column
    names and the before/after sample counts are reported to the template.
    """
    # Restrict the metadata to the IDs of the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # Step 1: keep numeric columns only.
    cols_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = cols_before - set(metadata.columns)

    # Step 2: drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    metadata = qiime2.Metadata(metadata.to_dataframe().dropna())

    # Step 3: drop zero-variance and empty columns.
    cols_before = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = cols_before - set(metadata.columns)

    df = metadata.to_dataframe()

    # Filter the distance matrix down to the surviving samples and record
    # the sizes before and after so they can be shown to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result_html = q2templates.df_to_html(
        skbio.stats.distance.bioenv(distance_matrix, df))

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result_html,
    }
    template_fp = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 20
0
def visualize_in_qzv(core, cfg, output_dir):
    """Write the input/result TSVs and render the report into ``output_dir``.

    The input parameters and the results are each saved as a downloadable
    TSV (prefixed with ``cfg['outputfile']``) and shown as HTML tables; an
    explanatory message replaces the results table when it is empty.
    """
    assets_dir = pkg_resources.resource_filename('q2_coremicrobiome', 'coremic_assets')
    prefix = cfg['outputfile']

    # Input parameters: HTML table plus downloadable file.
    inputs_frame = format_inputs_qzv(cfg)
    usr_inputs = q2templates.df_to_html(inputs_frame, index=False)
    inputs_frame.to_csv(os.path.join(output_dir, prefix+'InputParams.tsv'),
                        sep='\t', index=False)

    # Results: HTML table, or a hint on how to relax the search when empty.
    results_frame = format_results_qzv(core)
    if len(results_frame.index) != 0:
        results = q2templates.df_to_html(results_frame, index=False)
    else:
        results = "<b>No core microbes found.</b> <br>\
        <i>Please try relaxing the following:</i> <br>\
                --p-min-frac Minimum fractional presence in the interest group <br>\
                --p-max-p pvalue cutoff <br>\
        <i>or changing the:</i> <br>\
                normalization using --p-make-relative and/or --p-quantile-normalize <br>\
                method for multiple testing correction using --p-p-val-adj"
    results_frame.to_csv(os.path.join(output_dir, prefix+'Results.tsv'),
                         sep='\t', index=False)

    context = {
        'outputfile': prefix,
        'usr_inputs': usr_inputs,
        'results': results
    }
    q2templates.render(os.path.join(assets_dir, 'index.html'), output_dir,
                       context=context)
    return None
Ejemplo n.º 21
0
def kNN_LOOCV_F_measures(output_dir: str,
                         nearest_neighbors: dict, class_weight: DataFrame):
    """Render weighted vs. uniform ``_loocv`` F-measures and their difference.

    ``_loocv`` is evaluated twice on the same neighbours, once with the
    extra ``True`` argument (reported as 'Uniform') and once without
    (reported as 'Weighted').
    """
    labels = nearest_neighbors['taxonomies']
    neighbor_indices = nearest_neighbors['neighbors']
    weights = class_weight.T['Weight'].to_dict()

    uniform = _loocv(labels, neighbor_indices, weights, True)
    bespoke = _loocv(labels, neighbor_indices, weights)

    template_fp = os.path.join(TEMPLATES, 'index.html')
    table = DataFrame(
        {'F-measure': [bespoke, uniform, bespoke-uniform]},
        index=['Weighted', 'Uniform', 'Difference'])

    context = {
        'title': 'Indicators of Taxonomic Weight Importance',
        'f_measures': q2templates.df_to_html(table),
    }
    q2templates.render(template_fp, output_dir, context=context)
Ejemplo n.º 22
0
def summarize_selections(output_dir: str, selections: IDSelection):
    """Render the selection summary table and save it as ``table.tsv``."""
    summary = _build_summary_table(selections)
    table_fn = 'table.tsv'

    context = {
        'table': q2templates.df_to_html(summary, index=False),
        'table_fn': table_fn,
    }

    # Not using qiime2.Metadata b/c we don't have a meaningful ID col
    tsv_fp = os.path.join(output_dir, table_fn)
    summary.to_csv(tsv_fp, sep='\t', encoding='utf-8')

    q2templates.render(SUMMARY_TEMPLATE, output_dir, context=context)
Ejemplo n.º 23
0
    def test_defaults_override(self):
        # Explicit keyword arguments must win over the defaults, and
        # index=False must keep the row labels out of the HTML.
        row_ids = ['id1', 'id2', 'id3']
        frame = pd.DataFrame(
            {'col1': ['foo', 'bar', 'baz'], 'col2': [1, 2, 4.2]},
            index=row_ids)

        html = df_to_html(frame, border=1, classes=('class1', 'class2'),
                          index=False)

        self.assertIn('border="1"', html)
        self.assertIn('class1 class2', html)
        for row_id in row_ids:
            self.assertNotIn(row_id, html)
Ejemplo n.º 24
0
def _visualize(output_dir, results, plot):
    """Save the evaluation table and plot, then render the index template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV/PNG/PDF files and rendered template.
    results : DataFrame-like
        Written to ``evaluate_seqs_results.tsv`` and rendered with its index.
    plot : object with ``savefig``
        Saved as ``evaluate_seqs.png`` and ``evaluate_seqs.pdf``.
    """
    # None disables column-width truncation. The historical sentinel -1 was
    # deprecated in pandas 1.0 and is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)

    # save results
    results.to_csv(join(output_dir, 'evaluate_seqs_results.tsv'), sep='\t')
    results = q2templates.df_to_html(results, index=True)

    plot.savefig(join(output_dir, 'evaluate_seqs.png'), bbox_inches='tight')
    plot.savefig(join(output_dir, 'evaluate_seqs.pdf'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={
        'title': 'Sequence Evaluation Results',
        'running_title': 'evaluate_seqs',
        'results': results,
    })
Ejemplo n.º 25
0
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run the ``run_adonis.R`` script and render its results table.

    The metadata is reordered to match the distance matrix, every column
    named in the right-hand side of ``formula`` is checked for missing
    values, and the script's TSV output is rendered to HTML.

    Raises
    ------
    ValueError
        If a metadata column used in ``formula`` has missing values.
    """
    # Validate sample metadata is superset et cetera
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(set(metadata.ids), set(dm_ids))

    # The script needs the metadata rows in the same order as the matrix.
    ordered_md = metadata.to_dataframe().reindex(dm_ids)
    ordered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(ordered_md)

    # Every factor on the right-hand side must be fully populated.
    model = ModelDesc.from_formula(formula)
    for term in model.rhs_termlist:
        for factor in term.factors:
            column = metadata.get_column(factor.name())
            if not column.has_missing_values():
                continue
            raise ValueError(
                'adonis requires metadata columns with no '
                'NaN values (missing values in column `%s`.)' %
                (column.name, ))

    # Inputs go to a scratch directory; only the results land in output_dir.
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as scratch:
        dm_fp = os.path.join(scratch, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(scratch, 'md.tsv')
        metadata.save(md_fp)
        _run_command(['run_adonis.R', dm_fp, md_fp, formula,
                      str(permutations), str(n_jobs), results_fp])

    # Visualize results
    results_html = q2templates.df_to_html(pd.read_csv(results_fp, sep='\t'))
    template_fp = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(template_fp, output_dir,
                       context={'results': results_html})
Ejemplo n.º 26
0
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run scikit-bio's ``bioenv`` on the numeric metadata and render it.

    The metadata is restricted to the distance matrix IDs, then to numeric
    columns, then filtered with ``drop_zero_variance`` / ``drop_all_missing``.
    Samples with any remaining missing value are dropped from both the
    metadata and the distance matrix before the test is run. The names of
    the removed columns and the before/after sample counts are passed to
    the template.
    """
    def _filter_and_report(md, **criteria):
        # Apply one column filter and report which columns it removed.
        before = set(md.columns)
        kept = md.filter_columns(**criteria)
        return kept, before - set(kept.columns)

    # Filter metadata to the IDs present in the distance matrix.
    metadata = metadata.filter_ids(distance_matrix.ids)

    metadata, non_numeric_cols = _filter_and_report(
        metadata, column_type='numeric')
    metadata, zero_variance_cols = _filter_and_report(
        metadata, drop_zero_variance=True, drop_all_missing=True)

    # Keep only samples with a value in every remaining column.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    complete_df = metadata.to_dataframe().dropna(axis='index', how='any')

    # Mirror the sample filtering on the distance matrix and record the
    # sizes so the user can see how many samples survived.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(complete_df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result_html = q2templates.df_to_html(
        skbio.stats.distance.bioenv(distance_matrix, complete_df))

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result_html,
    }
    template = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(template, output_dir, context=context)
Ejemplo n.º 27
0
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run the ``run_adonis.R`` script and render its table as a visualization.

    Parameters
    ----------
    output_dir
        Directory receiving ``adonis.tsv`` and the rendered template.
    distance_matrix
        Distance matrix whose IDs define the samples (and their order).
    metadata
        Sample metadata; validated against the distance matrix IDs and
        then reindexed to exactly those IDs.
    formula
        Model formula; parsed with ``ModelDesc.from_formula`` and passed
        unchanged to the R script.
    permutations
        Forwarded to the R script as a string.
    n_jobs
        Forwarded to the R script as a string. It is an integer (the
        default is ``1``); it is only stringified for the command line.

    Raises
    ------
    ValueError
        If a metadata column referenced on the right-hand side of
        ``formula`` has missing values.
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula: every factor must name a metadata column, and that
    # column must be complete. Checking here gives a clear error naming the
    # offending column instead of a failure inside the R script.
    terms = ModelDesc.from_formula(formula)
    for term in terms.rhs_termlist:
        for factor in term.factors:
            column = metadata.get_column(factor.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)' %
                    (column.name, ))

    # Run adonis; inputs are written to a scratch directory, results are
    # written straight into output_dir.
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Ejemplo n.º 28
0
def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
    """Summarize per-sample sequence counts and subsampled quality scores.

    Writes ``per-sample-fastq-counts.csv``, an optional count histogram
    (``demultiplex-summary.png`` / ``.pdf``), the rendered templates, a copy
    of the template ``dist`` assets and ``data.jsonp`` into ``output_dir``.

    Parameters
    ----------
    output_dir
        Destination directory for every generated file.
    data
        View exposing ``paired`` and ``directory_format``; the directory
        format's manifest lists the FASTQ files per sample and direction.
    n
        Number of sequences to subsample for the quality statistics. It is
        capped at the total sequence count, in which case a warning is
        added to the rendered page.
    """
    paired = data.paired
    data = data.directory_format
    root = str(data)
    danger_msgs = []
    warning_msgs = []

    manifest = pd.read_csv(os.path.join(root, data.manifest.pathspec),
                           header=0, comment='#')
    # Manifest filenames are relative to the directory format's root.
    manifest.filename = manifest.filename.apply(
        lambda rel: os.path.join(root, rel))

    fwd = manifest[manifest.direction == 'forward'].filename.tolist()
    rev = manifest[manifest.direction == 'reverse'].filename.tolist()

    # Count forward reads, unless only reverse reads are present.
    if fwd or not rev:
        reads = fwd
    else:
        reads = rev

    per_sample_fastq_counts = {}
    file_records = []
    for fastq_fp in reads:
        read_total = sum(1 for _ in _read_fastq_seqs(fastq_fp))
        owner = manifest.loc[manifest.filename == fastq_fp,
                             'sample-id'].iloc[0]
        per_sample_fastq_counts[owner] = read_total
        file_records.append((fastq_fp, owner))

    result = pd.Series(per_sample_fastq_counts, name='Sequence count')
    result.index.name = 'Sample name'
    result = result.sort_values(ascending=False)
    counts_fp = os.path.join(output_dir, 'per-sample-fastq-counts.csv')
    result.to_csv(counts_fp, header=True, index=True)
    sequence_count = result.sum()

    if n > sequence_count:
        n = sequence_count
        warning_msgs.append(
            'A subsample value was provided that is greater than '
            'the amount of sequences across all samples. The plot '
            'was generated using all available sequences.')

    # Pick which global sequence positions to sample, then map them back
    # onto the files they belong to.
    subsample_ns = sorted(random.sample(range(sequence_count), n))
    link = _link_sample_n_to_file(file_records, per_sample_fastq_counts,
                                  subsample_ns)
    if paired:
        # Pair each forward file with the reverse file at the same position.
        sample_map = [(fp, rev[fwd.index(fp)], link[fp]) for fp in link]
        quality_scores, min_seq_len = _subsample_paired(sample_map)
    else:
        sample_map = [(fp, link[fp]) for fp in link]
        quality_scores, min_seq_len = _subsample_single(sample_map)

    forward_stats = _compute_stats_of_df(
        pd.DataFrame(quality_scores['forward']))

    if (forward_stats.loc['50%'] > 45).any():
        danger_msgs.append(
            'Some of the PHRED quality values are out of range. '
            'This is likely because an incorrect PHRED offset '
            'was chosen on import of your raw data. You can learn '
            'how to choose your PHRED offset during import in the '
            'importing tutorial.')
    if paired:
        reverse_stats = _compute_stats_of_df(
            pd.DataFrame(quality_scores['reverse']))

    # The histogram is only drawn when there is more than one forward file.
    show_plot = len(fwd) > 1
    if show_plot:
        ax = sns.distplot(result, kde=False)
        ax.set_xlabel('Number of sequences')
        ax.set_ylabel('Frequency')
        fig = ax.get_figure()
        for extension in ('png', 'pdf'):
            fig.savefig(os.path.join(
                output_dir, 'demultiplex-summary.' + extension))

    html = q2templates.df_to_html(result.to_frame())
    assets_dir = os.path.join(TEMPLATES, 'assets')
    index = os.path.join(assets_dir, 'index.html')
    overview_template = os.path.join(assets_dir, 'overview.html')
    quality_template = os.path.join(assets_dir, 'quality-plot.html')

    result_data = {
        'min': result.min(),
        'median': result.median(),
        'mean': result.mean(),
        'max': result.max(),
        'sum': sequence_count,
    }
    tabs = [
        {'title': 'Overview', 'url': 'overview.html'},
        {'title': 'Interactive Quality Plot', 'url': 'quality-plot.html'},
    ]
    context = {
        'result_data': result_data,
        'result': html,
        'show_plot': show_plot,
        'paired': paired,
        'tabs': tabs,
        'dangers': danger_msgs,
        'warnings': warning_msgs,
    }
    q2templates.render([index, overview_template, quality_template],
                       output_dir, context=context)

    shutil.copytree(os.path.join(assets_dir, 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a call to app.init(summary, forward[, reverse]).
    summary = {
        'n': int(n),
        'totalSeqCount': int(sequence_count),
        'minSeqLen': min_seq_len,
    }
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        json.dump(summary, fh)
        fh.write(',')
        forward_stats.to_json(fh)
        if paired:
            fh.write(',')
            reverse_stats.to_json(fh)
        fh.write(');')
Ejemplo n.º 29
0
def build_context(output_dir, problem):
    """Export a solved problem to ``output_dir`` and build the template context.

    Parameters
    ----------
    output_dir
        Directory receiving the CSV exports and the plots produced by the
        ``plot_*`` helpers.
    problem
        Hierarchical store indexed by path strings (``'data/X'``,
        ``'solution/CV/refit'``, ...) whose nodes expose ``.attrs``.
        NOTE(review): presumably a zarr/HDF5-like group -- confirm with the
        caller.

    Returns
    -------
    dict
        Template context. ``'path'``, ``'cv'``, ``'stabsel'`` and ``'lam'``
        flag which model selections were run; for each one that ran, a tab
        is appended to ``'tabs'`` and its parameters are stored under
        ``'dicopath'``, ``'dicocv'``, ``'dicostabsel'`` or ``'dicolam'``.
    """
    labels = problem['label']
    features = pd.DataFrame(problem['data/X'], columns=labels)
    y = pd.DataFrame({'y': problem['data/y']})
    c = pd.DataFrame(problem['data/C'], columns=labels)
    features.to_csv(os.path.join(output_dir, 'features.csv'),
                    header=True,
                    index=False)
    y.to_csv(os.path.join(output_dir, 'samples.csv'), header=True, index=False)
    c.to_csv(os.path.join(output_dir, 'constraints.csv'),
             header=True,
             index=False)

    # Every model-selection flag starts off; the sections below switch on
    # the ones recorded in the problem and add their tab.
    context = {
        'path': False,
        'cv': False,
        'stabsel': False,
        'lam': False,
        'labels': labels,
        'tabs': [{
            'title': 'Overview',
            'url': 'overview.html'
        }]
    }
    # Problem summary: formulation name, concomitant flag and the sizes of
    # the data matrix (n rows, d columns) and constraint matrix (k rows).
    dico = {
        'formulation':
        name_formulation(problem['formulation'].attrs.asdict(), output_dir),
        'concomitant':
        problem['formulation'].attrs['concomitant'],
        'n':
        len(problem['data/X']),
        'd':
        len(problem['data/X'][0]),
        'k':
        len(problem['data/C'])
    }
    context['dico'] = dico

    dico_ms = problem['model_selection'].attrs.asdict()

    if dico_ms['PATH']:
        context['path'] = True
        context['tabs'].append({'title': 'Lambda-path', 'url': 'path.html'})
        dico_path = {
            **problem['model_selection/PATHparameters'].attrs.asdict(),
            **problem['solution/PATH'].attrs.asdict()
        }
        # Read the path solution once and reuse it for the CSV and the plot.
        lambdas = problem['solution/PATH/LAMBDAS']
        betas = np.array(problem['solution/PATH/BETAS'])
        dico_path['lambdas'] = lambdas
        dico_path['lamin'] = min(lambdas)
        dico_path['Nlam'] = len(lambdas)
        data = pd.DataFrame(betas, index=lambdas, columns=labels)
        data.to_csv(os.path.join(output_dir, 'path.csv'),
                    header=True,
                    index=True)
        # SIGMAS is only read for concomitant formulations.
        SIGMAS = None
        if dico['concomitant']:
            SIGMAS = problem['solution/PATH/SIGMAS']

        context['dicopath'] = dico_path

        plot_path(betas, SIGMAS, lambdas, output_dir, labels)

    if dico_ms['CV']:
        context['cv'] = True
        xGraph = problem['solution/CV/xGraph']
        yGraph = problem['solution/CV/yGraph']
        standard_error = problem['solution/CV/standard_error']
        context['tabs'].append({'title': 'Cross-Validation', 'url': 'cv.html'})
        dico_cv = {
            **problem['model_selection/CVparameters'].attrs.asdict(),
            **problem['solution/CV'].attrs.asdict()
        }
        dico_cv['lamin'] = min(xGraph)
        dico_cv['Nlam'] = len(xGraph)
        beta = pd.DataFrame(data={
            'label': problem['label'],
            'beta': problem['solution/CV/refit']
        })
        beta.to_csv(os.path.join(output_dir, 'CV-beta.csv'),
                    header=True,
                    index=False)
        # selected_param is used as a row mask: only the selected
        # coefficients are shown in the HTML table.
        selected_param = np.array(problem['solution/CV/selected_param'])
        beta_support = beta[selected_param]
        dico_cv['htmlbeta'] = q2templates.df_to_html(beta_support, index=False)

        context['dicocv'] = dico_cv

        # Report whichever lambda the CV rule selected.
        lam = (dico_cv['lambda_1SE'] if dico_cv['oneSE']
               else dico_cv['lambda_min'])

        plot_beta(
            np.array(problem['solution/CV/refit']), selected_param, output_dir,
            labels, 'cv-refit.png',
            r"Refitted coefficients of $\beta$ after CV model selection finds $\lambda$ = "
            + str(lam))
        plot_cv(xGraph, yGraph, dico_cv['index_1SE'], dico_cv['index_min'],
                standard_error, output_dir, 'cv-graph.png')

    if dico_ms['StabSel']:
        context['stabsel'] = True
        context['tabs'].append({
            'title': 'Stability Selection',
            'url': 'stabsel.html'
        })
        dico_stabsel = {
            **problem['model_selection/StabSelparameters'].attrs.asdict(),
            **problem['solution/StabSel'].attrs.asdict()
        }

        stability = pd.DataFrame(
            data={
                'label': problem['label'],
                'stability-probability':
                problem['solution/StabSel/distribution']
            })
        stability.to_csv(os.path.join(output_dir, 'StabSel-prob.csv'),
                         header=True,
                         index=False)
        selected_param = np.array(problem['solution/StabSel/selected_param'])
        stability_support = stability[selected_param]

        dico_stabsel['nsel'] = len(stability_support)
        dico_stabsel['htmlstab'] = q2templates.df_to_html(stability_support,
                                                          index=False)

        context['dicostabsel'] = dico_stabsel

        plot_beta(
            np.array(problem['solution/StabSel/refit']), selected_param,
            output_dir, labels, 'stabsel-refit.png',
            r"Refitted coefficients of $\beta$ after stability selection")
        plot_stability(problem['solution/StabSel/distribution'],
                       selected_param, dico_stabsel['threshold'],
                       dico_stabsel['method'], labels, output_dir,
                       'stabsel-graph.png')

    if dico_ms['LAMfixed']:
        context['lam'] = True
        context['tabs'].append({'title': 'LAM fixed', 'url': 'lam-fixed.html'})
        dico_lam = {
            **problem['model_selection/LAMfixedparameters'].attrs.asdict(),
            **problem['solution/LAMfixed'].attrs.asdict()
        }
        dico_lam['lamtype'] = problem[
            'model_selection/LAMfixedparameters'].attrs['lam']

        beta = pd.DataFrame(data={
            'label': problem['label'],
            'beta': problem['solution/LAMfixed/refit']
        })
        beta.to_csv(os.path.join(output_dir, 'LAM-beta.csv'),
                    header=True,
                    index=False)
        selected_param = np.array(problem['solution/LAMfixed/selected_param'])
        beta_support = beta[selected_param]
        dico_lam['htmlbeta'] = q2templates.df_to_html(beta_support,
                                                      index=False)

        context['dicolam'] = dico_lam

        # Two plots: the raw solution at the fixed lambda, then the refit.
        plot_beta(
            np.array(problem['solution/LAMfixed/beta']), None, output_dir,
            labels, 'lam-beta.png',
            r"Coefficients of $\beta$ at $\lambda$ = " + str(dico_lam['lam']))
        plot_beta(
            np.array(problem['solution/LAMfixed/refit']), selected_param,
            output_dir, labels, 'lam-refit.png',
            r"Refitted coefficients of $\beta$ at $\lambda$ = " +
            str(dico_lam['lam']))

    return context
Ejemplo n.º 30
0
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    """Write a feature-table summary visualization into ``output_dir``.

    Produces frequency histograms (when there is more than one sample /
    feature), ``sample-frequency-detail.csv``,
    ``feature-frequency-detail.csv``, the rendered templates, a copy of the
    template ``dist`` assets and ``data.jsonp``.

    Parameters
    ----------
    output_dir
        Destination directory for every generated file.
    table
        Feature table; ``table.shape`` is read as (features, samples).
    sample_metadata
        Optional sample metadata. When given, it is filtered to the
        table's sample IDs and embedded in ``data.jsonp``; otherwise an
        empty object is embedded.
    """
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        # Reset the current figure so the next histogram starts clean.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))
        # Clear here too: otherwise the log-scaled axes stay on pyplot's
        # current figure and leak into whatever is plotted next in this
        # process.
        plt.gcf().clear()

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    # The detail CSVs list the most frequent samples / features first.
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # From here on feature_frequencies is a display table of formatted
    # strings, not the numeric series.
    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a call to app.init(metadata, sample_frequencies).
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        # Test for presence explicitly rather than relying on the
        # truthiness of a metadata object.
        if sample_metadata is not None:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
Ejemplo n.º 31
0
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Run a two-sided Mantel test on two distance matrices and render it.

    Writes ``mantel-scatter.svg`` and the rendered template to
    ``output_dir``.

    Raises
    ------
    ValueError
        If the matrices' ID sets differ and ``intersect_ids`` is false.
    """
    alt_hypothesis = 'two-sided'
    statistic_symbols = {'spearman': 'rho', 'pearson': 'r'}

    # ID matching is done here rather than left to scikit-bio so that the
    # mismatched IDs can be named in the error (or shown as a warning in
    # the visualization when `intersect_ids` is set), and because the
    # filtered matrices are reused below for the scatter plot data.
    left_ids, right_ids = set(dm1.ids), set(dm2.ids)
    mismatched_ids = left_ids ^ right_ids

    if mismatched_ids:
        if not intersect_ids:
            raise ValueError(
                'The following ID(s) are not contained in both distance '
                'matrices. This sometimes occurs when mismatched files are '
                'passed. If this is not the case, you can use '
                '`intersect_ids` to discard these mismatches and apply the '
                'Mantel test to only those IDs that are found in both '
                'distance matrices.\n\n%s'
                % ', '.join(sorted(mismatched_ids)))
        # `strict` is safe: the shared IDs are in both matrices by
        # construction.
        shared_ids = left_ids & right_ids
        dm1 = dm1.filter(shared_ids, strict=True)
        dm2 = dm2.filter(shared_ids, strict=True)

    # `strict` again: the ID sets are identical at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    values = [method.title(), sample_size, permutations, alt_hypothesis,
              r, p]
    row_labels = ['Method', 'Sample size', 'Permutations',
                  'Alternative hypothesis',
                  '%s %s' % (method.title(), statistic_symbols[method]),
                  'p-value']
    result = pd.Series(values, index=row_labels, name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # Both matrices share one ID set, so either can drive the pairing.
    scatter_data = [(dm1[a, b], dm2[a, b])
                    for a, b in itertools.combinations(dm1.ids, 2)]

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    template = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(template, output_dir, context={
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids,
    })
Ejemplo n.º 32
0
def mantel(output_dir: str,
           dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix,
           method: str = 'spearman',
           permutations: int = 999,
           intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Compare two distance matrices with a two-sided Mantel test.

    The result table and a scatter plot of the paired distances
    (``mantel-scatter.svg``) are written to ``output_dir`` together with
    the rendered template.

    Raises
    ------
    ValueError
        When the two matrices do not hold the same IDs and
        ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # scikit-bio could validate or intersect the IDs itself, but its error
    # does not say *which* IDs differ. The difference is computed here so
    # it can be reported, and so the filtered matrices are available for
    # the scatter plot further down.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1.symmetric_difference(ids2)
    has_mismatch = bool(mismatched_ids)

    if has_mismatch and not intersect_ids:
        listing = ', '.join(sorted(mismatched_ids))
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s' % listing)

    if has_mismatch:
        # Every common ID exists in both matrices, so strict filtering
        # cannot fail on a missing ID.
        common_ids = ids1.intersection(ids2)
        dm1 = dm1.filter(common_ids, strict=True)
        dm2 = dm2.filter(common_ids, strict=True)

    # The ID sets now agree, so strict mode is appropriate here as well.
    mantel_kwargs = dict(method=method,
                         permutations=permutations,
                         alternative=alt_hypothesis,
                         strict=True)
    r, p, sample_size = skbio.stats.distance.mantel(dm1, dm2,
                                                    **mantel_kwargs)

    method_name = method.title()
    rows = [
        ('Method', method_name),
        ('Sample size', sample_size),
        ('Permutations', permutations),
        ('Alternative hypothesis', alt_hypothesis),
        ('%s %s' % (method_name, test_statistics[method]), r),
        ('p-value', p),
    ]
    result = pd.Series([value for _, value in rows],
                       index=[label for label, _ in rows],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # One matrix's IDs are enough to enumerate every pair, since both
    # matrices hold the same IDs.
    scatter_data = []
    for pair in itertools.combinations(dm1.ids, 2):
        scatter_data.append((dm1[pair], dm2[pair]))

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = dict(table=table_html,
                   sample_size=sample_size,
                   mismatched_ids=mismatched_ids)
    index = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
Ejemplo n.º 33
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test whether groups of samples differ and render the results.

    Writes one boxplot pair (``<group>-boxplots.png`` / ``.pdf``) per
    group, ``raw_data.tsv`` with the pairwise distances behind the plots,
    optionally ``<method>-pairwise.csv``, and the rendered template.

    Parameters
    ----------
    output_dir
        Destination directory for every generated file.
    distance_matrix
        Distances between samples; filtered to the samples that have a
        metadata value.
    metadata
        Categorical column assigning each sample to a group.
    method
        Key into ``_beta_group_significance_fns``.
    pairwise
        When true, also test every pair of groups and add FDR-corrected
        q-values.
    permutations
        Passed to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        known_methods = ', '.join(_beta_group_significance_fns)
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' % (method, known_methods))

    # Restrict the metadata to the distance matrix IDs, then drop samples
    # without a value.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # Apply the same sample filtering to the distance matrix and remember
    # both sizes so the user can see how many samples were kept.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Overall significance test.
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Distance boxplots: one figure per group, showing within- and
    # between-group distances. `groupings` maps group id -> sample ids in
    # natural sort order; that order drives both the x-axis and the layout
    # of the plots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    sns.set_style('white')
    groupings = collections.OrderedDict()
    for group_name, members in natsorted(metadata.groupby(metadata)):
        groupings[group_name] = list(members.index)

    pair_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                    'Distance']
    flier_style = {'marker': 'o', 'markeredgecolor': 'black',
                   'markeredgewidth': 0.5, 'alpha': 0.5}
    pairs_summary = pd.DataFrame(columns=pair_columns)
    for group_id in groupings:
        boxplot_data = _get_distance_boxplot_data(
            distance_matrix, group_id, groupings)
        group_distances, x_ticklabels, group_pairs_summary = boxplot_data

        group_pairs_summary = pd.DataFrame(group_pairs_summary,
                                           columns=pair_columns)
        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops=flier_style)
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # Boxes are drawn white.
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Group ids are URL-quoted to build the file names.
        file_stem = urllib.parse.quote_plus(str(group_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % file_stem))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % file_stem))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    pairwise_results_html = None
    if pairwise:
        records = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            stats = _get_pairwise_group_significance_stats(
                distance_matrix=distance_matrix,
                group1_id=group1_id,
                group2_id=group2_id,
                groupings=groupings,
                metadata=metadata,
                beta_group_significance_fn=beta_group_significance_fn,
                permutations=permutations)
            records.append([group1_id, group2_id, stats['sample size'],
                            permutations, stats['test statistic'],
                            stats['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(records, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg correction across all pairwise p-values.
        corrected = multipletests(pairwise_results['p-value'],
                                  method='fdr_bh')
        pairwise_results['q-value'] = corrected[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_results.to_csv(
            os.path.join(output_dir, '%s-pairwise.csv' % method))

        pairwise_results_html = q2templates.df_to_html(pairwise_results)

    # Lay the plots out in rows: start at three per row and shrink until
    # the group count divides evenly.
    group_ids = list(groupings.keys())
    group_count = len(group_ids)
    row_count = 3
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = []
    for start in range(0, group_count, row_count):
        group_rows.append(group_ids[start:start + row_count])

    context = {
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html,
    }
    template = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(template, output_dir, context=context)
Ejemplo n.º 34
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Render a beta-diversity group significance report into ``output_dir``.

    The significance function registered under ``method`` in
    ``_beta_group_significance_fns`` is run on the samples that have a
    non-empty metadata value. One distance boxplot (PNG and PDF) is saved per
    group, the test is optionally repeated for every pair of groups, and the
    ``beta_group_significance_assets/index.html`` template is rendered.

    Parameters
    ----------
    output_dir : str
        Directory receiving the plots, the optional pairwise CSV and the
        rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Category whose values define the sample groups.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        When true, also test each pair of groups and add ``fdr_bh``-corrected
        q-values.
    permutations : int
        Forwarded to the significance function(s).

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        known_methods = ', '.join(_beta_group_significance_fns)
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' % (method, known_methods))

    # Numeric casting (where possible) gives a nicer sort order in the
    # boxplots. Restrict the values to the samples in the distance matrix and
    # discard samples without a value, empty strings included.
    group_values = pd.to_numeric(metadata.to_series(), errors='ignore')
    group_values = group_values.loc[list(distance_matrix.ids)]
    group_values = group_values.replace(r'', numpy.nan).dropna()

    # Drop the samples that lost their metadata value from the distance
    # matrix as well; both sizes are reported to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(group_values.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Overall significance test.
    result = significance_fn(distance_matrix, group_values,
                             permutations=permutations)

    sns.set_style("white")

    # Ordered mapping of group id -> sample ids of that group. The order
    # drives both the x-axis of each boxplot and the layout of the plots in
    # the visualization.
    groupings = collections.OrderedDict()
    for group_name, members in sorted(group_values.groupby(group_values)):
        groupings[group_name] = list(members.index)

    outlier_style = {'marker': 'o', 'markeredgecolor': 'black',
                     'markeredgewidth': 0.5, 'alpha': 0.5}
    plot_patterns = ('%s-boxplots.png', '%s-boxplots.pdf')

    # One boxplot per group, showing the distances to that group.
    for group_id in groupings:
        group_distances, x_ticklabels = _get_distance_boxplot_data(
            distance_matrix, group_id, groupings)

        # A fresh dict per call so the shared style can never be altered.
        ax = sns.boxplot(data=group_distances,
                         flierprops=dict(outlier_style))
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # Boxes are drawn white instead of the default palette colors.
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()

        fig = ax.get_figure()
        # Group ids end up in file names, so they are URL-quoted first.
        quoted_group = urllib.parse.quote_plus(str(group_id))
        for pattern in plot_patterns:
            fig.savefig(os.path.join(output_dir, pattern % quoted_group))
        fig.clear()

    result_html = q2templates.df_to_html(result.to_frame())

    pairwise_results_html = None
    if pairwise:
        rows = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pair_stats = _get_pairwise_group_significance_stats(
                distance_matrix=distance_matrix,
                group1_id=group1_id,
                group2_id=group2_id,
                groupings=groupings,
                metadata=group_values,
                beta_group_significance_fn=significance_fn,
                permutations=permutations)
            rows.append([group1_id,
                         group2_id,
                         pair_stats['sample size'],
                         permutations,
                         pair_stats['test statistic'],
                         pair_stats['p-value']])

        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_table = pd.DataFrame(rows, columns=columns)
        pairwise_table = pairwise_table.set_index(['Group 1', 'Group 2'])
        # Correct the pairwise p-values for multiple comparisons.
        pairwise_table['q-value'] = multipletests(
            pairwise_table['p-value'], method='fdr_bh')[1]
        pairwise_table = pairwise_table.sort_index()
        pairwise_table.to_csv(
            os.path.join(output_dir, '%s-pairwise.csv' % method))

        pairwise_results_html = q2templates.df_to_html(pairwise_table)

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
Ejemplo n.º 35
0
def visualize_stats(output_dir: str, deblur_stats: pd.DataFrame) -> None:
    """Render the per-sample deblur statistics as a sortable HTML table.

    Three fraction columns are derived from the read counts, the table is
    sorted by ``fraction-artifact-with-minsize`` (descending), converted to
    HTML, its header cells are given tooltips, and the ``index.html``
    template is rendered into ``output_dir`` together with the
    ``tsorter.min.js`` asset.

    Parameters
    ----------
    output_dir : str
        Directory receiving the rendered page and the ``js`` assets.
    deblur_stats : pd.DataFrame
        Statistics table; must provide the ``reads-raw``,
        ``reads-hit-artifact``, ``reads-derep``, ``reads-missed-reference``,
        ``reads-deblur`` and ``reads-chimeric`` columns. It is not modified.

    Raises
    ------
    KeyError
        If a header label has no entry in ``STATS_DESCRIPTIONS`` or
        ``COMPUTED_DESCRIPTIONS``.
    """
    # Work on a private copy so the derived columns added below never leak
    # into the caller's DataFrame.
    deblur_stats = deblur_stats.copy()

    total_artifact = deblur_stats['reads-hit-artifact']
    total_input = deblur_stats['reads-raw']
    deblur_stats['fraction-artifact'] = total_artifact / total_input

    minsize_drop = total_input - deblur_stats['reads-derep']
    deblur_stats['fraction-artifact-with-minsize'] = \
        (total_artifact + minsize_drop) / total_input

    total_not_ref = deblur_stats['reads-missed-reference']
    total_deblur = deblur_stats['reads-deblur']
    total_chim = deblur_stats['reads-chimeric']
    deblur_stats['fraction-missed-reference'] = \
        total_not_ref / (total_deblur - total_chim)

    # The three computed columns were appended at the end; move them so they
    # directly follow the first column.
    columns = list(deblur_stats.columns)[:-3]
    columns.insert(1, 'fraction-missed-reference')
    columns.insert(1, 'fraction-artifact')
    columns.insert(1, 'fraction-artifact-with-minsize')

    # sort_values returns a new frame; sorting a column selection in place
    # is what triggers pandas' SettingWithCopy warning.
    deblur_stats = deblur_stats[columns].sort_values(
        'fraction-artifact-with-minsize', ascending=False)

    deblur_stats = deblur_stats.reset_index()
    html = q2templates.df_to_html(deblur_stats)
    html = html.replace('table-hover"', 'table-hover" id="stats"')

    # Inject tooltips by rewriting the <th> cells of the generated header.
    description_sources = STATS_DESCRIPTIONS.copy()
    description_sources.update(COMPUTED_DESCRIPTIONS)
    htmlparts = html.splitlines()
    headstart = None
    headend = None
    for idx, line in enumerate(htmlparts):
        if '<thead>' in line:
            headstart = idx
        elif '</thead>' in line:
            headend = idx

    regex = re.compile("<th>(.*?)</th>")
    new_header = []
    for entry in htmlparts[headstart:headend]:
        # Empty header cells (the unnamed index column) are kept untouched.
        if '<th>' in entry and entry.strip() != '<th></th>':
            label = regex.findall(entry)[0]
            desc = description_sources[label]
            entry = ('<th data-toggle="tooltip" '
                     'title="%s" '
                     'data-tsorter="numeric">%s</th>' % (desc, label))
        new_header.append(entry)
    htmlparts[headstart:headend] = new_header
    html = '\n'.join(htmlparts)

    index = os.path.join(TEMPLATES, 'index.html')

    context = {
        'result': html
    }

    js = os.path.join(TEMPLATES, 'js', 'tsorter.min.js')
    # exist_ok: a pre-existing js/ directory must not abort the rendering.
    os.makedirs(os.path.join(output_dir, 'js'), exist_ok=True)
    shutil.copy(js, os.path.join(output_dir, 'js', 'tsorter.min.js'))

    q2templates.render(index, output_dir, context=context)
Ejemplo n.º 36
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write the alpha-diversity group significance visualization.

    For every categorical metadata column that survives filtering, samples
    are grouped by the column's values. Kruskal-Wallis is run across all
    groups and for every pair of groups (pairwise p-values are corrected with
    ``fdr_bh``). Each column yields a ``column-<name>.jsonp`` data file and a
    ``kruskal-wallis-pairwise-<name>.csv`` file in ``output_dir``; the merged
    metadata is saved as ``metadata.tsv`` and the template plus its ``dist``
    assets are copied alongside.

    Parameters
    ----------
    output_dir : str
        Directory receiving all generated files.
    alpha_diversity : pd.Series
        Alpha diversity value per sample; its ``name`` is used as the metric
        name. Note that its index name is set to ``'id'``.
    metadata : qiime2.Metadata
        Sample metadata providing the grouping columns.

    Raises
    ------
    ValueError
        If no metadata column remains after filtering.
    """
    def _js_escape(text):
        """Escape ``text`` for embedding in a single-quoted JS string."""
        # Backslashes first; otherwise the backslashes introduced for the
        # quotes would be doubled as well.
        return str(text).replace('\\', '\\\\').replace("'", "\\'")

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # quote() leaves '/' alone by default; escape it too so the column
        # name can never introduce a path separator into the file name.
        escaped_column = quote(column).replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                except ValueError:
                    # Comparisons the test rejects are reported to the user
                    # instead of aborting the whole visualization.
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
                else:
                    kw_H_pairwise.append([names[j], names[i], H, p])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # The JSONP file is a single load_data(...) call whose arguments are
        # written piece by piece.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            # Column and metric names are user-controlled and are embedded
            # in single-quoted JS literals, so quotes/backslashes are escaped.
            fh.write("load_data('%s'," % _js_escape(column))
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _js_escape(metric_name)))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index,
        output_dir,
        context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
            ', '.join(sorted(non_categorical_columns)),
            'filtered_columns':
            ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Ejemplo n.º 37
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write the alpha-diversity group significance visualization.

    Every non-numeric metadata category is used to group the samples. For
    categories with more than one group and fewer groups than samples,
    Kruskal-Wallis is run across all groups and for every pair of groups
    (pairwise p-values are corrected with ``fdr_bh``); a
    ``category-<name>.jsonp`` and a ``kruskal-wallis-pairwise-<name>.csv``
    file are written per category. Skipped categories are listed in the
    rendered page.

    Parameters
    ----------
    output_dir : str
        Directory receiving all generated files.
    alpha_diversity : pd.Series
        Alpha diversity value per sample; its ``name`` is the metric name.
    metadata : qiime2.Metadata
        Sample metadata providing the grouping categories.

    Raises
    ------
    ValueError
        If the metadata holds no non-numeric category.
    """
    def _js_escape(text):
        """Escape ``text`` for embedding in a single-quoted JS string."""
        # Backslashes first; otherwise the backslashes introduced for the
        # quotes would be doubled as well.
        return str(text).replace('\\', '\\\\').replace("'", "\\'")

    # Numeric categories cannot define groups: cast what can be cast and
    # remember which categories get dropped so the user can be told.
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    # ``categories`` is a pandas Index, so its length is checked explicitly.
    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category.loc[alpha_diversity.index]
        # Samples without a value (empty strings included) are dropped.
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # A single group, or one group per sample, cannot be compared.
        if len(groups) <= 1 or len(groups) == len(data.index):
            filtered_categories.append(category)
            continue

        # quote() leaves '/' alone by default; escape it too so the category
        # name can never introduce a path separator into the file name.
        escaped_category = quote(category).replace('/', '%2F')
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                except ValueError:
                    # Comparisons the test rejects are reported to the user
                    # instead of aborting the whole visualization.
                    filtered_group_comparisons.append(
                        ['%s:%s' % (category, names[i]),
                         '%s:%s' % (category, names[j])])
                else:
                    kw_H_pairwise.append([names[j], names[i], H, p])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # The JSONP file is a single load_data(...) call whose arguments are
        # written piece by piece.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            df = pd.Series(groups, index=names)

            # Category and metric names are user-controlled and are embedded
            # in single-quoted JS literals, so quotes/backslashes are escaped.
            fh.write("load_data('%s'," % _js_escape(category))
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _js_escape(metric_name)))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        # Sorted so the message does not depend on set iteration order.
        'filtered_numeric_categories':
            ', '.join(sorted(filtered_numeric_categories)),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Ejemplo n.º 38
0
def _visualize(output_dir,
               title,
               running_title,
               results,
               false_negative_features=None,
               misclassifications=None,
               underclassifications=None,
               composition_regression=None,
               score_plot=None,
               mismatch_histogram=None,
               alignments=None):
    """Export the evaluation outputs and render ``index.html``.

    ``results`` is always written as TSV and shown as an HTML table. Each
    optional table is written as TSV and converted to HTML when it is not
    ``None``; each optional figure is saved as PNG and PDF when it is not
    ``None``. ``alignments`` is written as TSV and then replaced by the
    figure returned from ``_plot_alignments_as_heatmap``, which is saved
    like the other figures. Everything is handed to the template context.
    """
    def _table_to_html(frame, tsv_name):
        # Persist the table for download, then build its HTML rendering.
        frame.to_csv(join(output_dir, tsv_name), sep='\t')
        return q2templates.df_to_html(frame, index=True)

    def _save_figure(figure, *file_names):
        for file_name in file_names:
            figure.savefig(join(output_dir, file_name), bbox_inches='tight')

    pd.set_option('display.max_colwidth', -1)

    # The main results table is the only one rendered without its index.
    results.to_csv(join(output_dir, 'results.tsv'), sep='\t')
    results = q2templates.df_to_html(results, index=False)

    if false_negative_features is not None:
        false_negative_features = _table_to_html(
            false_negative_features, 'false_negative_features.tsv')

    if misclassifications is not None:
        misclassifications = _table_to_html(
            misclassifications, 'misclassifications.tsv')

    if underclassifications is not None:
        underclassifications = _table_to_html(
            underclassifications, 'underclassifications.tsv')

    optional_figures = (
        (composition_regression,
         'composition_regression.png', 'composition_regression.pdf'),
        (score_plot, 'score_plot.png', 'score_plot.pdf'),
        (mismatch_histogram,
         'mismatch_histogram.png', 'mismatch_histogram.pdf'),
    )
    for figure, png_name, pdf_name in optional_figures:
        if figure is not None:
            _save_figure(figure, png_name, pdf_name)

    if alignments is not None:
        alignments.to_csv(join(output_dir, 'alignments.tsv'), sep='\t')
        # From here on ``alignments`` is the heatmap, not the table.
        alignments = _plot_alignments_as_heatmap(alignments)
        _save_figure(alignments, 'alignments.png', 'alignments.pdf')

    context = {
        'title': title,
        'running_title': running_title,
        'results': results,
        'false_negative_features': false_negative_features,
        'misclassifications': misclassifications,
        'underclassifications': underclassifications,
        'composition_regression': composition_regression,
        'score_plot': score_plot,
        'mismatch_histogram': mismatch_histogram,
        'alignments': alignments,
    }
    q2templates.render(join(TEMPLATES, 'index.html'), output_dir,
                       context=context)
Ejemplo n.º 39
0
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write the alpha-diversity group significance visualization.

    For every categorical metadata column that survives filtering, samples
    are grouped by the column's values. Kruskal-Wallis is run across all
    groups and for every pair of groups (pairwise p-values are corrected with
    ``fdr_bh``). Each column yields a ``column-<name>.jsonp`` data file and a
    ``kruskal-wallis-pairwise-<name>.csv`` file in ``output_dir``; the merged
    metadata is saved as ``metadata.tsv`` and the template plus its ``dist``
    assets are copied alongside.

    Parameters
    ----------
    output_dir : str
        Directory receiving all generated files.
    alpha_diversity : pd.Series
        Alpha diversity value per sample; its ``name`` is used as the metric
        name. Note that its index name is set to ``'id'``.
    metadata : qiime2.Metadata
        Sample metadata providing the grouping columns.

    Raises
    ------
    ValueError
        If no metadata column remains after filtering.
    """
    def _as_js_string_body(value):
        """Return ``value`` escaped for a single-quoted JS string literal."""
        # Escape backslashes before quotes so the backslashes added for the
        # quotes are not escaped a second time.
        escaped = str(value).replace('\\', '\\\\')
        return escaped.replace("'", "\\'")

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        # quote() keeps '/' by default; encode it as well so a column name
        # cannot add a directory level to the generated file names.
        escaped_column = quote(column).replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                except ValueError:
                    # Record the pair the test rejected so it can be listed
                    # in the rendered page.
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
                else:
                    kw_H_pairwise.append([names[j], names[i], H, p])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # The JSONP payload is one load_data(...) call, emitted argument by
        # argument.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            # The column name comes from user metadata; without escaping, a
            # quote or backslash in it would corrupt the generated script.
            fh.write("load_data('%s'," % _as_js_string_body(column))
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn),
                                         _as_js_string_body(metric_name)))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Ejemplo n.º 40
0
def summarize(output_dir: str,
              table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    """Render the feature-table summary visualization into ``output_dir``.

    Writes frequency histograms (PDF and PNG) for samples and features when
    there is more than one of each, CSV files with the sorted per-sample and
    per-feature frequencies, the four ``summarize_assets`` templates, the
    ``dist`` assets and a ``data.jsonp`` file holding the optional sample
    metadata and the sample frequencies.

    Parameters
    ----------
    output_dir : str
        Directory receiving all generated files.
    table : biom.Table
        Feature table to summarize.
    sample_metadata : qiime2.Metadata, optional
        When given, it is filtered to the table's samples and embedded in
        ``data.jsonp``; otherwise an empty object is written.
    """
    assets_dir = os.path.join(TEMPLATES, 'summarize_assets')

    def _save_histogram(axes, pdf_name, png_name):
        # Each histogram is stored in both formats, PDF first.
        axes.get_figure().savefig(os.path.join(output_dir, pdf_name))
        axes.get_figure().savefig(os.path.join(output_dir, png_name))

    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Pick the number of bins, never going below 5.
        interquartile_range = (sample_summary['3rd quartile'] -
                               sample_summary['1st quartile'])
        if interquartile_range == 0.0:
            bin_count = 5
        else:
            # Freedman–Diaconis rule
            bin_width = ((2 * interquartile_range) /
                         (number_of_samples**(1 / 3)))
            frequency_span = (sample_summary['Maximum frequency'] -
                              sample_summary['Minimum frequency'])
            bin_count = max(frequency_span / bin_width, 5)

        sample_axes = sns.distplot(sample_frequencies,
                                   kde=False,
                                   rug=True,
                                   bins=int(round(bin_count)))
        # Show the x tick labels with thousands separators.
        thousands = matplotlib.ticker.FuncFormatter(
            lambda x, p: format(int(x), ','))
        sample_axes.get_xaxis().set_major_formatter(thousands)
        sample_axes.set_xlabel('Frequency per sample')
        sample_axes.set_ylabel('Number of samples')
        _save_histogram(sample_axes,
                        'sample-frequencies.pdf', 'sample-frequencies.png')
        # Empty the current figure before the feature histogram is drawn.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_axes = sns.distplot(feature_frequencies,
                                    kde=False,
                                    rug=False)
        feature_axes.set_xlabel('Frequency per feature')
        feature_axes.set_ylabel('Number of features')
        feature_axes.set_xscale('log')
        feature_axes.set_yscale('log')
        _save_histogram(feature_axes,
                        'feature-frequencies.pdf', 'feature-frequencies.png')

    thousands_format = '{:,}'.format
    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply(thousands_format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply(thousands_format).to_frame('Frequency'))

    index = os.path.join(assets_dir, 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)

    # The detail files list frequencies from highest to lowest.
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # From here on ``feature_frequencies`` is the formatted display table.
    feature_frequencies = feature_frequencies.astype(int)
    feature_frequencies = feature_frequencies.apply(thousands_format)
    feature_frequencies = feature_frequencies.to_frame('Frequency')
    observed_in = pd.Series(feature_qualitative_data).astype(int)
    feature_frequencies['# of Samples Observed In'] = \
        observed_in.apply(thousands_format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(assets_dir, 'overview.html')
    sample_frequency_template = os.path.join(
        assets_dir, 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        assets_dir, 'feature-frequency-detail.html')

    tabs = [
        {'url': 'overview.html', 'title': 'Overview'},
        {'url': 'sample-frequency-detail.html',
         'title': 'Interactive Sample Detail'},
        {'url': 'feature-frequency-detail.html', 'title': 'Feature Detail'},
    ]
    context.update({
        'max_count': sample_frequencies.max(),
        'feature_frequencies_table': feature_frequencies_table,
        'feature_qualitative_data': feature_qualitative_data,
        'tabs': tabs,
    })

    templates = [
        index, sample_frequency_template, feature_frequency_template,
        overview_template
    ]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(assets_dir, 'dist'),
                    os.path.join(output_dir, 'dist'))

    # data.jsonp is a single app.init(<metadata>, <sample frequencies>) call.
    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
Ejemplo n.º 41
0
def _volcano_vega_spec(records, x_field):
    """Build the Vega (v4) scatter-plot spec for the ANCOM volcano plot.

    Parameters
    ----------
    records : list of dict
        One record per plotted point. Each record is read through the keys
        ``'index'`` (tooltip title), ``x_field`` (x position) and ``'W'``
        (y position).
    x_field : str
        Record key plotted on the x axis; also used as the x-axis title and
        as the label of that value in the tooltip.

    Returns
    -------
    dict
        A JSON-serializable Vega specification.
    """
    # Vega expression that builds the tooltip object for the hovered point.
    tooltip_signal = ("{{'title': datum['index'], '{0}': "
                      "datum['{0}'], 'W': datum['W']}}".format(x_field))
    return {
        '$schema': 'https://vega.github.io/schema/vega/v4.json',
        'width': 300,
        'height': 300,
        'data': [
            {'name': 'values',
             'values': records}],
        'scales': [
            {'name': 'xScale',
             'domain': {'data': 'values',
                        'field': x_field},
             'range': 'width'},
            {'name': 'yScale',
             'domain': {'data': 'values', 'field': 'W'},
             'range': 'height'}],
        'axes': [
            {'scale': 'xScale', 'orient': 'bottom',
             'title': x_field},
            {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
        'marks': [
            {'type': 'symbol',
             'from': {'data': 'values'},
             'encode': {
                 'hover': {
                     'fill': {'value': '#FF0000'},
                     'opacity': {'value': 1}},
                 'enter': {
                     'x': {'scale': 'xScale',
                           'field': x_field},
                     'y': {'scale': 'yScale', 'field': 'W'}},
                 'update': {
                     'fill': {'value': 'black'},
                     'opacity': {'value': 0.3},
                     'tooltip': {'signal': tooltip_signal}}}}]}


def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on ``table`` and render the results into ``output_dir``.

    Writes ``ancom.tsv`` and ``percent-abundances.tsv``, copies the ``ancom``
    template assets and renders ``index.html``. When at least one per-feature
    difference statistic is non-null, a Vega volcano plot (difference
    statistic vs. ``W``) is added to the rendering context.

    Parameters
    ----------
    output_dir : str
        Directory that receives the TSV files and the rendered template.
    table : pd.DataFrame
        Feature table; its index is used to filter ``metadata`` and its
        columns are the features that get tested.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping of the rows of ``table``.
    transform_function : str
        Key into ``_transform_functions``; the selected function is applied
        to every row of ``table`` before the difference statistic is
        computed. The key is also used as the x-axis label of the plot.
    difference_function : str, optional
        Key into ``_difference_functions``. When ``None``,
        ``'mean_difference'`` is used for exactly two categories and
        ``'f_statistic'`` otherwise.

    Raises
    ------
    ValueError
        If ``metadata`` has missing values for any id left after filtering
        to ``table.index``.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    # Convert once; the same series feeds both the ANCOM test and the
    # per-category masks used for the volcano plot.
    grouping = metadata.to_series()

    ancom_results = skbio_ancom(table,
                                grouping,
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    # Sort the categories: iterating a bare set of strings yields an order
    # that can change between interpreter runs (hash randomization), which
    # made the sign of a two-group difference non-reproducible.
    cats = sorted(set(grouping))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # any other number of categories
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    # The masks do not depend on the feature being processed, so build them
    # once instead of once per column.
    category_masks = [grouping == c for c in cats]

    def diff_func(x):
        args = _d_func(*[x[mask] for mask in category_masks])
        # Some difference functions return (statistic, ...); keep only the
        # statistic.
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        # The tooltip reads datum['index']; pin the axis name so that
        # reset_index() always produces an 'index' column, even if the
        # feature axis happens to carry its own name.
        volcano_results = volcano_results.rename_axis('index')
        volcano_results = volcano_results.reset_index(drop=False)

        spec = _volcano_vega_spec(
            volcano_results.to_dict(orient='records'),
            transform_function_name)
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)