def test_format_correlation_vector(self):
    """Check that correlation vectors are rendered as tab-separated text.

    Covers three cases: a single row without permutations, a row whose
    confidence interval is undefined, and several rows with 999
    permutations.
    """
    # Column titles shared by every expected table below.
    columns = ['Sample ID', 'Sample ID', 'Correlation coefficient',
               'Parametric p-value',
               'Parametric p-value (Bonferroni-corrected)',
               'Nonparametric p-value',
               'Nonparametric p-value (Bonferroni-corrected)',
               'CI (lower)', 'CI (upper)']
    title_line = '\t'.join(columns) + '\n'

    def as_table(rows):
        # Render the title line followed by one tab-separated line per row.
        return title_line + ''.join('\t'.join(row) + '\n' for row in rows)

    # One row, zero permutations.
    expected = as_table([
        ['S1', 'T1', '0.7778', '0.0000', '0.0000', 'N/A', 'N/A',
         '0.5000', '1.0000'],
    ])
    observed = format_correlation_vector(self.corr_vec1, 0)
    self.assertEqual(observed, expected)

    # Undefined confidence interval.
    expected = as_table([
        ['S1', 'T1', '0.7778', '0.0000', '0.0000', 'N/A', 'N/A',
         'N/A', 'N/A'],
    ])
    observed = format_correlation_vector(self.corr_vec3, 0)
    self.assertEqual(observed, expected)

    # Multiple rows, 999 permutations.
    expected = as_table([
        ['S1', 'T1', '0.7778', '0.0000', '0.0000', '0.000', '0.000',
         '0.5000', '1.0000'],
        ['S2', 'T2', '0.1000', '0.0500', '0.1500', '0.040', '0.120',
         '-0.1000', '0.2000'],
        ['S3', 'T3', '100.6800', '0.9000', '1.0000', '1.000', '1.000',
         '-0.4000', '-0.2000'],
    ])
    observed = format_correlation_vector(self.corr_vec2, 999)
    self.assertEqual(observed, expected)
def test_format_correlation_vector_with_header(self):
    """Check that a supplied header is emitted before the column titles."""
    columns = ['Sample ID', 'Sample ID', 'Correlation coefficient',
               'Parametric p-value',
               'Parametric p-value (Bonferroni-corrected)',
               'Nonparametric p-value',
               'Nonparametric p-value (Bonferroni-corrected)',
               'CI (lower)', 'CI (upper)']
    row = ['S1', 'T1', '0.7778', '0.0000', '0.0000', 'N/A', 'N/A',
           '0.5000', '1.0000']

    # The header goes on its own line ahead of the table.
    expected_lines = ['#foo', '\t'.join(columns), '\t'.join(row)]
    expected = '\n'.join(expected_lines) + '\n'

    observed = format_correlation_vector(self.corr_vec1, 0, '#foo')
    self.assertEqual(observed, expected)
def test_format_correlation_vector_small_num_permutations(self):
    """Check the placeholder text used when there are too few permutations."""
    columns = ['Sample ID', 'Sample ID', 'Correlation coefficient',
               'Parametric p-value',
               'Parametric p-value (Bonferroni-corrected)',
               'Nonparametric p-value',
               'Nonparametric p-value (Bonferroni-corrected)',
               'CI (lower)', 'CI (upper)']
    # Both nonparametric p-value columns carry the same message.
    too_few = 'Too few iters to compute p-value (num_iters=2)'
    row = ['S1', 'T1', '0.7778', '0.0000', '0.0000', too_few, too_few,
           '0.5000', '1.0000']
    expected = '\t'.join(columns) + '\n' + '\t'.join(row) + '\n'

    observed = format_correlation_vector(self.corr_vec1, 2)
    self.assertEqual(observed, expected)
def compare_taxa_summaries(taxa_summary1, taxa_summary2, comparison_mode,
                           correlation_type='pearson', tail_type='two-sided',
                           num_permutations=999, confidence_level=0.95,
                           perform_detailed_comparisons=False,
                           sample_id_map=None, expected_sample_id=None):
    """Compares two taxa summaries using the specified comparison mode.

    Taxa summaries are compared by computing the correlation coefficient
    between samples in the two taxa summaries based on the abundance of
    various taxa. The two taxa summaries are sorted and filled such that taxa
    that are missing in one summary but are present in the other are
    represented with zero abundance.

    Returns a four-element tuple containing the following values: the sorted
    and filled taxa summaries (in a format ready to be written to a file) and a
    report detailing the correlation between the taxa summaries (also in a
    format ready to be written to a file), including the correlation
    coefficient, parametric and nonparametric p-values, and confidence interval
    for the overall comparison.

    If perform_detailed_comparisons is True, a correlation vector is returned
    (also in a format ready to be written to a file) where each line shows the
    samples that were compared and the associated correlation coefficient,
    parametric and nonparametric p-values (uncorrected and
    Bonferroni-corrected), and the confidence interval. If
    perform_detailed_comparisons is False, None will be returned for this
    value.

    Raises a ValueError if correlation_type, tail_type, num_permutations,
    confidence_level, or comparison_mode is invalid. All of these are checked
    before the taxa summaries are sorted and filled.

    Arguments:
        taxa_summary1 - the first taxa summary to compare. This should be a
            tuple containing the sample IDs, taxa, and taxonomic data (i.e. the
            output of qiime.parse.parse_taxa_summary_table)
        taxa_summary2 - the second taxa summary to compare
        comparison_mode - the type of comparison to perform on the two taxa
            summaries. Can be either 'paired' or 'expected'. If 'paired', the
            samples that match between the two taxa summaries will be compared,
            unless a sample_id_map is specified. If 'expected', each sample in
            the first taxa summary will be compared to an 'expected' sample in
            the second taxa summary. If 'expected', the second taxa summary
            must only contain a single sample unless expected_sample_id is
            provided
        correlation_type - the type of correlation coefficient to calculate
            when comparing samples in the taxa summaries. Can be either
            'pearson' or 'spearman'
        tail_type - if 'two-sided', a two-sided test is performed. 'high'
            for a one-tailed test for positive association, or 'low' for a
            one-tailed test for negative association. This parameter affects
            both the parametric and nonparametric tests, but the confidence
            interval will always be two-sided
        num_permutations - the number of permutations to use in the
            nonparametric test. Must be a number greater than or equal to 0. If
            0, the nonparametric test will not be performed. In this case, the
            nonparametric p-value will be 'N/A' in the formatted results string
        confidence_level - the confidence level to use when constructing the
            confidence interval. Must be between 0 and 1 (exclusive)
        perform_detailed_comparisons - if True, computes the correlation
            between pairs of samples in addition to computing the overall
            correlation between taxa summaries
        sample_id_map - a dictionary mapping original sample IDs to new sample
            IDs. New sample IDs that match will be compared. All original
            sample IDs must be mapped. This argument is only used if the
            comparison mode is 'paired'. If not provided, only matching sample
            IDs between the two taxa summaries will be compared
        expected_sample_id - the sample ID in taxa_summary2 to compare all
            samples in taxa_summary1 to. This argument is only used if the
            comparison mode is 'expected'. If not provided, taxa_summary2 must
            only contain a single sample, and all samples in taxa_summary1 will
            be compared to it
    """
    # Perform all argument validation before getting into the heavy
    # processing.
    if correlation_type not in correlation_types:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, correlation_types))
    if tail_type not in tail_types:
        raise ValueError("Invalid tail type '%s'. Must be one of %r." %
                         (tail_type, tail_types))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)
    if confidence_level <= 0 or confidence_level >= 1:
        raise ValueError("Invalid confidence level: %.4f. Must be between "
                         "zero and one (exclusive)." % confidence_level)
    # Checked last so that the precedence among the argument errors above is
    # unchanged, but still before any sorting/filling work is done.
    if comparison_mode not in comparison_modes:
        raise ValueError("Invalid comparison mode '%s'. Must be one of %r." %
                         (comparison_mode, comparison_modes))

    # Human-readable description of the test's tail. Anything that is not a
    # one-sided test is described by its own name (e.g. 'two-sided'), so the
    # description is always defined.
    one_sided_descs = {
        'high': "one-sided (positive association)",
        'low': "one-sided (negative association)",
    }
    tail_type_desc = one_sided_descs.get(tail_type, tail_type)

    # Define some comments to be put in the result strings.
    header = "# Correlation coefficient: %s.\n" % correlation_type
    header += ("# The parametric p-value(s) were calculated using a " +
               tail_type_desc +
               " test of significance using a t-distribution.\n")
    if num_permutations > 0:
        header += ("# The nonparametric p-value(s) were calculated using a " +
                   tail_type_desc + " permutation test with " +
                   str(num_permutations) + " permutations.\n")
    header += ("# The confidence interval(s) were constructed at a "
               "confidence level of " + str(confidence_level * 100) +
               "% using Fisher's z-transformation (see Sokal and Rohlf "
               "3rd edition pg. 575). The confidence interval(s) are "
               "two-sided.")

    spearman_warning_suffix = ("not accurate when using the "
                               "t-distribution. Please see Biometry (Sokal "
                               "and Rohlf, 3rd edition) page 600 for more "
                               "details.")
    spearman_overall_warning = ("# Since there were 10 or fewer "
                                "observations when calculating Spearman's "
                                "rank correlation coefficient, the "
                                "parametric p-value is " +
                                spearman_warning_suffix)
    spearman_detailed_warning = ("# Since there were 10 or fewer taxa in "
                                 "the sorted and filled taxa summary files, "
                                 "the parametric p-values and "
                                 "Bonferroni-corrected parametric p-values "
                                 "are " + spearman_warning_suffix)

    # Sort and fill the taxa summaries so that we can compare them.
    filled_ts1, filled_ts2 = _sort_and_fill_taxa_summaries([taxa_summary1,
                                                            taxa_summary2])

    if comparison_mode == 'paired':
        # Make sure that each sample is paired up to the sample it needs to be
        # compared against according to the sample ID map.
        compatible_ts1, compatible_ts2 = _make_compatible_taxa_summaries(
            filled_ts1, filled_ts2, sample_id_map)
        overall_corr, corr_vec = _compute_correlation(
            compatible_ts1, compatible_ts2, comparison_mode,
            correlation_type, tail_type, num_permutations, confidence_level,
            perform_detailed_comparisons)

        # Length of the vectors that the overall correlation was computed
        # from.
        num_overall_observations = (len(compatible_ts1[0]) *
                                    len(compatible_ts1[1]))

        # Report the number of samples that matched.
        header += ("\n# Number of samples that matched between the taxa "
                   "summary files: %d" % len(compatible_ts1[0]))
    elif comparison_mode == 'expected':
        overall_corr, corr_vec = _compute_correlation(
            filled_ts1, filled_ts2, comparison_mode, correlation_type,
            tail_type, num_permutations, confidence_level,
            perform_detailed_comparisons, expected_sample_id)
        num_overall_observations = len(filled_ts1[0]) * len(filled_ts1[1])
    else:
        # Defensive: a mode listed in comparison_modes but not handled here.
        raise ValueError("Invalid comparison mode '%s'. Must be one of %r." %
                         (comparison_mode, comparison_modes))

    # Format the overall correlation into a string that is writable to a file.
    # Include a warning in the header if the correlation coefficient was
    # spearman and the number of observations was <= 10.
    overall_corr_str_header = header
    if correlation_type == 'spearman' and num_overall_observations <= 10:
        overall_corr_str_header += '\n' + spearman_overall_warning
    overall_corr_str = format_correlation_info(
        overall_corr[0], overall_corr[1], overall_corr[2], overall_corr[3],
        num_permutations, overall_corr_str_header)

    # Format the correlation vector (only when detailed comparisons were
    # requested; otherwise None is returned in its place).
    corr_vec_str = None
    if perform_detailed_comparisons:
        detailed_header = header
        # The detailed warning is keyed on the number of taxa in the filled
        # summary.
        if correlation_type == 'spearman' and len(filled_ts1[1]) <= 10:
            detailed_header += '\n' + spearman_detailed_warning
        corr_vec_str = format_correlation_vector(corr_vec, num_permutations,
                                                 detailed_header)

    return (format_taxa_summary(filled_ts1), format_taxa_summary(filled_ts2),
            overall_corr_str, corr_vec_str)