def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str = 'permanova',
                            permutations: int = 999) -> None:
    """Render a group significance report for a distance matrix.

    Runs the test registered under ``method`` in
    ``_beta_group_significance_fns``, writes one distance boxplot per group
    (PNG and PDF) into ``output_dir`` and renders the
    ``beta_group_significance_assets`` index template there.

    Parameters
    ----------
    output_dir : str
        Directory receiving the boxplots and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples. Samples without a usable metadata value
        are filtered out before the test is run.
    metadata : qiime2.MetadataCategory
        Category whose values define the groups.
    method : str, optional
        Key into ``_beta_group_significance_fns``.
    permutations : int, optional
        Forwarded to the selected significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The message already names the bad key, so the KeyError context
        # would only add noise to the traceback.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns))
                         ) from None

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. ``errors='ignore'`` is deprecated in pandas, so the
    # "leave the values untouched when they are not numeric" behaviour is
    # spelled out explicitly.
    metadata = metadata.to_series()
    try:
        metadata = pd.to_numeric(metadata)
    except (ValueError, TypeError):
        pass

    # Restrict the metadata to the samples in the distance matrix, then drop
    # samples that have no data for this category, including those with
    # empty strings as values. ``reindex`` yields NaN for distance matrix
    # ids that are absent from the metadata (``.loc`` with a list raises
    # KeyError for them on current pandas), so they are dropped here too.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test.
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots.
    sns.set_style("white")
    # groupings is an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        (group_value, list(members.index))
        for group_value, members in sorted(metadata.groupby(metadata)))

    for group_id in groupings:
        group_distances, x_ticklabels = _get_distance_boxplot_data(
            distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # Change the color of the boxes to white.
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()

        fig = ax.get_figure()
        # Group ids may contain characters that are unsafe in file names.
        stem = urllib.parse.quote_plus(str(group_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % stem))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % stem))
        # Start the next group's plot from an empty figure.
        fig.clear()

    result = result.to_frame().to_html(
        classes="table table-striped table-hover")
    result = result.replace('border="1"', 'border="0"')

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    """Render a Mantel test report relating distances to numeric metadata.

    Builds a distance matrix from the metadata values with
    ``_metadata_distance``, runs ``skbio.stats.distance.mantel`` against
    ``distance_matrix``, saves a scatter plot of the paired distances (PNG
    and PDF) and renders the ``beta_correlation_assets`` index template into
    ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving the scatter plot and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Numeric category to correlate with the distances.
    method : str, optional
        Correlation method passed to ``mantel``; ``'spearman'`` and
        ``'pearson'`` have a test statistic label defined here.
    permutations : int, optional
        Number of permutations passed to ``mantel``.

    Raises
    ------
    ValueError
        If the metadata cannot be converted to numeric values, or if any
        sample in the distance matrix is missing from the metadata or has
        no data.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Orignal error message follows:\n%s' %
                         str(e)) from e

    initial_metadata_length = len(metadata)
    # ``reindex`` yields NaN for distance matrix ids that are absent from
    # the metadata, so the descriptive error below is reached; ``.loc`` with
    # a list raises a bare KeyError for them on current pandas.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if ids_with_missing_metadata:
        # Sorted so the message is identical from run to run.
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(sorted(ids_with_missing_metadata)))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name,
         r, p],
        index=['Method', 'Sample size', 'Permutations',
               'Alternative hypothesis', 'Metadata category',
               '%s %s' % (method.title(), test_statistics[method]),
               'p-value'],
        name='Mantel test results')
    result_html = result.to_frame().to_html(
        classes="table table-striped table-hover")
    result_html = result_html.replace('border="1"', 'border="0"')

    # One point per unordered pair of samples.
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(
        [(distance_matrix[id1, id2], metadata_distances[id1, id2])
         for id1, id2 in itertools.combinations(distance_matrix.ids, 2)],
        columns=[x, y])

    # Draw on a dedicated figure rather than pyplot's current axes, so the
    # plot cannot pick up state left behind by earlier plotting, and close
    # the figure afterwards so it does not linger in pyplot's registry.
    fig, ax = plt.subplots()
    try:
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False, ax=ax)
        fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
        fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))
    finally:
        plt.close(fig)

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    """Render a Mantel test report relating distances to numeric metadata.

    Builds a distance matrix from the metadata values with
    ``_metadata_distance``, runs ``skbio.stats.distance.mantel`` against
    ``distance_matrix``, saves a scatter plot of the paired distances (PNG
    and PDF) and renders the ``beta_correlation_assets`` index template into
    ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving the scatter plot and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Numeric category to correlate with the distances.
    method : str, optional
        Correlation method passed to ``mantel``; ``'spearman'`` and
        ``'pearson'`` have a test statistic label defined here.
    permutations : int, optional
        Number of permutations passed to ``mantel``.

    Raises
    ------
    ValueError
        If the metadata cannot be converted to numeric values, or if any
        sample in the distance matrix is missing from the metadata or has
        no data.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Orignal error message follows:\n%s' %
                         str(e)) from e

    initial_metadata_length = len(metadata)
    # ``reindex`` yields NaN for distance matrix ids that are absent from
    # the metadata, so the descriptive error below is reached; ``.loc`` with
    # a list raises a bare KeyError for them on current pandas.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if ids_with_missing_metadata:
        # Sorted so the message is identical from run to run.
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(sorted(ids_with_missing_metadata)))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name,
         r, p],
        index=['Method', 'Sample size', 'Permutations',
               'Alternative hypothesis', 'Metadata category',
               '%s %s' % (method.title(), test_statistics[method]),
               'p-value'],
        name='Mantel test results')
    result_html = result.to_frame().to_html(
        classes="table table-striped table-hover")
    result_html = result_html.replace('border="1"', 'border="0"')

    # One point per unordered pair of samples.
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(
        [(distance_matrix[id1, id2], metadata_distances[id1, id2])
         for id1, id2 in itertools.combinations(distance_matrix.ids, 2)],
        columns=[x, y])

    # A fresh figure keeps the plot independent of pyplot's current state;
    # holding on to it explicitly lets it be closed once saved, so repeated
    # calls do not accumulate open figures.
    fig = plt.figure()
    try:
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
        fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
        fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))
    finally:
        plt.close(fig)

    index = os.path.join(TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Render a group significance report for a distance matrix.

    Runs the test registered under ``method`` in
    ``_beta_group_significance_fns``, writes one distance boxplot per group
    (PNG and PDF) into ``output_dir``, optionally runs the same test on
    every pair of groups, and renders the ``beta_group_significance_assets``
    index template there.

    Parameters
    ----------
    output_dir : str
        Directory receiving the boxplots, the optional pairwise CSV and the
        rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples. Samples without a usable metadata value
        are filtered out before the test is run.
    metadata : qiime2.MetadataCategory
        Category whose values define the groups.
    method : str, optional
        Key into ``_beta_group_significance_fns``.
    pairwise : bool, optional
        When true, also test every pair of groups, add a ``q-value`` column
        computed with ``multipletests(..., method='fdr_bh')`` and write the
        table to ``<method>-pairwise.csv``.
    permutations : int, optional
        Forwarded to the selected significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The message already names the bad key, so the KeyError context
        # would only add noise to the traceback.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns))
                         ) from None

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. ``errors='ignore'`` is deprecated in pandas, so the
    # "leave the values untouched when they are not numeric" behaviour is
    # spelled out explicitly.
    metadata = metadata.to_series()
    try:
        metadata = pd.to_numeric(metadata)
    except (ValueError, TypeError):
        pass

    # Restrict the metadata to the samples in the distance matrix, then drop
    # samples that have no data for this category, including those with
    # empty strings as values. ``reindex`` yields NaN for distance matrix
    # ids that are absent from the metadata (``.loc`` with a list raises
    # KeyError for them on current pandas), so they are dropped here too.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test.
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots.
    sns.set_style("white")
    # groupings is an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        (group_value, list(members.index))
        for group_value, members in sorted(metadata.groupby(metadata)))

    for group_id in groupings:
        group_distances, x_ticklabels = _get_distance_boxplot_data(
            distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # Change the color of the boxes to white.
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()

        fig = ax.get_figure()
        # Group ids may contain characters that are unsafe in file names.
        stem = urllib.parse.quote_plus(str(group_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % stem))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % stem))
        # Start the next group's plot from an empty figure.
        fig.clear()

    result_html = result.to_frame().to_html(
        classes="table table-striped table-hover")
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = _get_pairwise_group_significance_stats(
                distance_matrix=distance_matrix,
                group1_id=group1_id,
                group2_id=group2_id,
                groupings=groupings,
                metadata=metadata,
                beta_group_significance_fn=beta_group_significance_fn,
                permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        # The statistic column is labelled with the name reported by the
        # overall test result.
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # multipletests returns the corrected p-values as its second item.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)

        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes="table table-striped table-hover")
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })