コード例 #1
0
ファイル: test_format.py プロジェクト: Gaby1212/qiime
    def test_format_taxa_summary(self):
        """Check that taxa summaries are rendered as tab-delimited text."""
        # Case 1: a single taxon measured across two samples.
        sample_ids = ["Even7", "Even8"]
        taxa = ["Eukarya"]
        abundances = array([[1.0, 1.0]])
        expected = "Taxon\tEven7\tEven8\nEukarya\t1.0\t1.0\n"
        observed = format_taxa_summary((sample_ids, taxa, abundances))
        self.assertEqual(observed, expected)

        # Case 2: a single sample containing three taxa (one row per taxon).
        sample_ids = ["Expected"]
        taxa = ["Eukarya", "Bacteria", "Archaea"]
        abundances = array([[0.5], [0.6], [0.4]])
        expected = "Taxon\tExpected\nEukarya\t0.5\nBacteria\t0.6\nArchaea\t0.4\n"
        observed = format_taxa_summary((sample_ids, taxa, abundances))
        self.assertEqual(observed, expected)
コード例 #2
0
ファイル: test_format.py プロジェクト: oscaredd/qiime
    def test_format_taxa_summary(self):
        """Verify format_taxa_summary output for two small taxa summaries."""
        # Each entry pairs a (sample IDs, taxa, data) tuple with the exact
        # string the formatter is expected to produce for it.
        cases = [
            # Two samples sharing a single taxon.
            ((['Even7','Even8'], ['Eukarya'], array([[1.0, 1.0]])),
             'Taxon\tEven7\tEven8\nEukarya\t1.0\t1.0\n'),
            # One sample with three taxa.
            ((['Expected'], ['Eukarya', 'Bacteria', 'Archaea'],
              array([[0.5], [0.6], [0.4]])),
             'Taxon\tExpected\nEukarya\t0.5\nBacteria\t0.6\nArchaea\t0.4\n'),
        ]

        for summary, expected in cases:
            self.assertEqual(format_taxa_summary(summary), expected)
コード例 #3
0
    def test_format_taxa_summary(self):
        """Ensure taxa summaries are formatted into the expected strings."""
        # Multiple samples, one taxon: the header row lists every sample ID.
        multi_sample_ts = (['Even7', 'Even8'], ['Eukarya'],
                           array([[1.0, 1.0]]))
        self.assertEqual(
            format_taxa_summary(multi_sample_ts),
            'Taxon\tEven7\tEven8\nEukarya\t1.0\t1.0\n')

        # One sample, multiple taxa: each taxon gets its own line.
        multi_taxon_ts = (['Expected'], ['Eukarya', 'Bacteria', 'Archaea'],
                          array([[0.5], [0.6], [0.4]]))
        self.assertEqual(
            format_taxa_summary(multi_taxon_ts),
            'Taxon\tExpected\nEukarya\t0.5\nBacteria\t0.6\nArchaea\t0.4\n')
コード例 #4
0
def compare_taxa_summaries(taxa_summary1, taxa_summary2, comparison_mode,
                           correlation_type='pearson', tail_type='two-sided',
                           num_permutations=999, confidence_level=0.95,
                           perform_detailed_comparisons=False,
                           sample_id_map=None, expected_sample_id=None):
    """Compares two taxa summaries using the specified comparison mode.

    Taxa summaries are compared by computing the correlation coefficient
    between samples in the two taxa summaries based on the abundance of various
    taxa. The two taxa summaries are sorted and filled such that taxa that are
    missing in one summary but are present in the other are represented with
    zero abundance.

    Returns a four-element tuple containing, in order:
        1. the first sorted and filled taxa summary, formatted as a string
           ready to be written to a file
        2. the second sorted and filled taxa summary, formatted the same way
        3. a report (string) detailing the overall correlation between the
           taxa summaries, including the correlation coefficient, parametric
           and nonparametric p-values, and confidence interval
        4. if perform_detailed_comparisons is True, a formatted correlation
           vector (string) where each line shows the samples that were
           compared and the associated correlation coefficient, parametric
           and nonparametric p-values (uncorrected and Bonferroni-corrected),
           and the confidence interval; otherwise None

    Raises ValueError if correlation_type, tail_type, num_permutations,
    confidence_level, or comparison_mode is invalid. All of these checks are
    performed before any taxa summary processing takes place.

    Arguments:
        taxa_summary1 - the first taxa summary to compare. This should be a
            tuple containing the sample IDs, taxa, and taxonomic data (i.e. the
            output of qiime.parse.parse_taxa_summary_table)
        taxa_summary2 - the second taxa summary to compare
        comparison_mode - the type of comparison to perform on the two taxa
            summaries. Can be either 'paired' or 'expected'. If 'paired', the
            samples that match between the two taxa summaries will be compared,
            unless a sample_id_map is specified. If 'expected', each sample in
            the first taxa summary will be compared to an 'expected' sample in
            the second taxa summary. If 'expected', the second taxa summary
            must only contain a single sample unless expected_sample_id is
            provided
        correlation_type - the type of correlation coefficient to calculate
            when comparing samples in the taxa summaries. Can be either
            'pearson' or 'spearman'
        tail_type - if 'two-sided', a two-sided test is performed. 'high'
            for a one-tailed test for positive association, or 'low' for a
            one-tailed test for negative association. This parameter affects
            both the parametric and nonparametric tests, but the confidence
            interval will always be two-sided
        num_permutations - the number of permutations to use in the
            nonparametric test. Must be a number greater than or equal to 0. If
            0, the nonparametric test will not be performed. In this case, the
            nonparametric p-value will be 'N/A' in the formatted results string
        confidence_level - the confidence level to use when constructing the
            confidence interval. Must be between 0 and 1 (exclusive)
        perform_detailed_comparisons - if True, computes the correlation
            between pairs of samples in addition to computing the overall
            correlation between taxa summaries
        sample_id_map - a dictionary mapping original sample IDs to new sample
            IDs. New sample IDs that match will be compared. All original
            sample IDs must be mapped. This argument is only used if the
            comparison mode is 'paired'. If not provided, only matching sample
            IDs between the two taxa summaries will be compared
        expected_sample_id - the sample ID in taxa_summary2 to compare all
            samples in taxa_summary1 to. This argument is only used if the
            comparison mode is 'expected'. If not provided, taxa_summary2 must
            only contain a single sample, and all samples in taxa_summary1 will
            be compared to it
    """
    # Perform all argument validation before getting into the heavy
    # processing, so that a bad argument fails fast.
    if correlation_type not in correlation_types:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, correlation_types))
    if tail_type not in tail_types:
        raise ValueError("Invalid tail type '%s'. Must be one of %r." %
                         (tail_type, tail_types))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)
    if confidence_level <= 0 or confidence_level >= 1:
        raise ValueError("Invalid confidence level: %.4f. Must be between "
                         "zero and one (exclusive)." % confidence_level)
    # The comparison mode is checked here (rather than after sorting and
    # filling the taxa summaries) so an invalid mode is reported without
    # first doing work that would be thrown away.
    if comparison_mode not in ('paired', 'expected'):
        raise ValueError("Invalid comparison mode '%s'. Must be one of %r." %
                         (comparison_mode, comparison_modes))

    # Define some comments to be put in the result strings.
    header = "# Correlation coefficient: %s.\n" % correlation_type
    header += "# The parametric p-value(s) were calculated using a "

    # Human-readable description of the tail type for the report header.
    if tail_type == 'two-sided':
        tail_type_desc = tail_type
    elif tail_type == 'high':
        tail_type_desc = "one-sided (positive association)"
    elif tail_type == 'low':
        tail_type_desc = "one-sided (negative association)"

    header += (tail_type_desc +
               " test of significance using a t-distribution.\n")

    # The nonparametric test is skipped entirely when no permutations are
    # requested, so only mention it when it was actually performed.
    if num_permutations > 0:
        header += ("# The nonparametric p-value(s) were calculated using a " +
                   tail_type_desc + " permutation test with " +
                   str(num_permutations) + " permutations.\n")

    header += ("# The confidence interval(s) were constructed at a "
               "confidence level of " + str(confidence_level * 100) +
               "% using Fisher's z-transformation (see Sokal and Rohlf "
               "3rd edition pg. 575). The confidence interval(s) are "
               "two-sided.")

    # Warnings that are appended to the headers when Spearman's correlation
    # is computed on 10 or fewer observations/taxa.
    spearman_warning_suffix = ("not accurate when using the "
                               "t-distribution. Please see Biometry (Sokal "
                               "and Rohlf, 3rd edition) page 600 for more "
                               "details.")
    spearman_overall_warning = ("# Since there were 10 or fewer "
                                "observations when calculating Spearman's "
                                "rank correlation coefficient, the "
                                "parametric p-value is " +
                                spearman_warning_suffix)
    spearman_detailed_warning = ("# Since there were 10 or fewer taxa in "
                                 "the sorted and filled taxa summary files, "
                                 "the parametric p-values and "
                                 "Bonferroni-corrected parametric p-values "
                                 "are " + spearman_warning_suffix)

    # Sort and fill the taxa summaries so that we can compare them.
    filled_ts1, filled_ts2 = _sort_and_fill_taxa_summaries([taxa_summary1,
                                                            taxa_summary2])
    if comparison_mode == 'paired':
        # Make sure that each sample is paired up to the sample it needs to be
        # compared against according to the sample ID map.
        compatible_ts1, compatible_ts2 = _make_compatible_taxa_summaries(
                filled_ts1, filled_ts2, sample_id_map)
        overall_corr, corr_vec = _compute_correlation(compatible_ts1,
                compatible_ts2, comparison_mode, correlation_type, tail_type,
                num_permutations, confidence_level,
                perform_detailed_comparisons)

        # Calculate the length of the vectors that the overall correlation
        # was computed on (number of samples times number of taxa).
        num_overall_observations = (len(compatible_ts1[0]) *
                                    len(compatible_ts1[1]))

        # Report the number of samples that matched.
        header += ("\n# Number of samples that matched between the taxa "
                   "summary files: %d" % len(compatible_ts1[0]))
    else:
        # comparison_mode is 'expected' here; any other value was rejected
        # by the validation at the top of the function.
        overall_corr, corr_vec = _compute_correlation(filled_ts1, filled_ts2,
                comparison_mode, correlation_type, tail_type, num_permutations,
                confidence_level, perform_detailed_comparisons,
                expected_sample_id)
        num_overall_observations = len(filled_ts1[0]) * len(filled_ts1[1])

    # Format the overall correlation into a string that is writable to a file.
    # Include a warning in the header if the correlation coefficient was
    # spearman and the number of observations was <= 10.
    overall_corr_str_header = header
    if correlation_type == 'spearman' and num_overall_observations <= 10:
        overall_corr_str_header += '\n' + spearman_overall_warning
    overall_corr_str = format_correlation_info(overall_corr[0],
            overall_corr[1], overall_corr[2], overall_corr[3],
            num_permutations, overall_corr_str_header)

    # Format the correlation vector (only when detailed comparisons were
    # requested; otherwise None is returned in its place).
    corr_vec_str = None
    if perform_detailed_comparisons:
        detailed_header = header
        if correlation_type == 'spearman' and len(filled_ts1[1]) <= 10:
            detailed_header += '\n' + spearman_detailed_warning
        corr_vec_str = format_correlation_vector(corr_vec, num_permutations,
                                                 detailed_header)

    return (format_taxa_summary(filled_ts1), format_taxa_summary(filled_ts2),
            overall_corr_str, corr_vec_str)