Beispiel #1
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result = result.to_frame().to_html(classes="table table-striped "
                                       "table-hover")
    result = result.replace('border="1"', 'border="0"')
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })
Beispiel #2
0
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str='spearman',
                     permutations: int=999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Orignal error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), n, permutations, alt_hypothesis,
                        metadata.name, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis', 'Metadata category',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append((distance_matrix[id1, id2],
                             metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    fig = sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False).get_figure()
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
Beispiel #3
0
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Orignal error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(distance_matrix,
                                          metadata_distances,
                                          method=method,
                                          permutations=permutations,
                                          alternative=alt_hypothesis,
                                          strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name, r, p],
        index=[
            'Method', 'Sample size', 'Permutations', 'Alternative hypothesis',
            'Metadata category',
            '%s %s' % (method.title(), test_statistics[method]), 'p-value'
        ],
        name='Mantel test results')
    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append(
            (distance_matrix[id1, id2], metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    plt.figure()
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_metadata_length': initial_metadata_length,
                           'filtered_metadata_length':
                           filtered_metadata_length,
                           'result': result_html
                       })
Beispiel #4
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })