def compute_ordination_correlation(map_f, coord_f, category, axis=1,
                                   correlation_type='pearson',
                                   num_permutations=999):
    """Correlates one ordination axis with a numeric mapping file category.

    The coordinates are parsed first, then the mapping data. For every sample
    ID found in the coordinates (in that order), the value of category is
    looked up in the mapping data and converted to a float. The resulting
    vector is then correlated against the chosen axis using
    correlation_test().

    Returns a three-element tuple: the correlation coefficient, the
    parametric p-value, and the nonparametric p-value. The nonparametric
    p-value is passed through format_p_value_for_num_iters() when
    num_permutations is greater than zero; otherwise it is the string 'N/A'.

    Raises a ValueError if correlation_type is not in CORRELATION_TYPES, if
    num_permutations is negative, if axis is out of range, if category is
    missing for a sample, or if a category value cannot be converted to a
    float.

    Arguments:
        map_f - the mapping file input handed to parse_mapping_file_to_dict()
        coord_f - the coordinates input handed to parse_coords()
        category - the mapping file category (column) holding the gradient
            values. Every value must be convertible to a float
        axis - the 1-based axis number to pull from the coordinates
        correlation_type - the correlation method, forwarded to
            correlation_test() as its 'method' argument. Must be one of
            CORRELATION_TYPES
        num_permutations - the number of permutations forwarded to
            correlation_test(). Must be greater than or equal to zero
    """
    # Cheap argument validation comes first so nothing is parsed needlessly.
    if correlation_type not in CORRELATION_TYPES:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, CORRELATION_TYPES))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)

    # Only the sample IDs and the coordinate matrix are needed from the parse.
    sample_ids, coord_matrix, _, _ = parse_coords(coord_f)
    axis_count = len(coord_matrix[0])
    if axis < 1 or axis > axis_count:
        raise ValueError("Invalid axis number %d. Must be greater than zero "
                         "and less than or equal to the number of axes in the "
                         "input coordinates file (found %d axes)." %
                         (axis, axis_count))
    # The axis argument is 1-based; the matrix columns are 0-based.
    ordination_values = coord_matrix[:, axis - 1]

    mapping_data, _ = parse_mapping_file_to_dict(map_f)

    # Build the gradient vector in the same sample order as the coordinates so
    # that the two vectors line up element by element.
    # NOTE(review): a sample ID present in the coordinates but absent from the
    # mapping data is not handled here; the lookup below would fail.
    gradient_values = []
    for sample_id in sample_ids:
        sample_metadata = mapping_data[sample_id]
        if category not in sample_metadata:
            raise ValueError("Category '%s' does not exist in the input "
                             "mapping file." % category)

        raw_state = sample_metadata[category]
        try:
            numeric_state = float(raw_state)
        except ValueError:
            raise ValueError("The category state '%s' could not be converted "
                             "to a number. All states in the '%s' category "
                             "must be numeric." % (raw_state, category))
        gradient_values.append(numeric_state)

    # The third and fifth results of correlation_test() are not used here.
    coefficient, parametric_p, _, nonparametric_p, _ = correlation_test(
        ordination_values, gradient_values, method=correlation_type,
        permutations=num_permutations)

    nonparametric_p = (
        format_p_value_for_num_iters(nonparametric_p, num_permutations)
        if num_permutations > 0 else 'N/A')

    return coefficient, parametric_p, nonparametric_p


# Example 2
def _compute_correlation(ts1, ts2, comparison_mode, correlation_type,
                         tail_type, num_permutations, confidence_level,
                         perform_detailed_comparisons=False,
                         expected_sample_id=None):
    """Computes the correlation between two taxa summary files.

    The input taxa summaries MUST already be sorted and filled (see
    _sort_and_fill_taxa_summaries) so that the various taxa line up and contain
    the same number of taxa (e.g. the first taxon in both files is 'Bacteria'
    and not mismatched).

    Additionally, if comparison_mode is 'paired', the input taxa summaries must
    already have been made compatible, meaning the number of samples match
    between the two taxa summaries. This is very important as the first sample
    in ts1 will be compared to the first sample in ts2, the second sample in
    ts1 will be compared to the second sample in ts2, and so on. The sample IDs
    are not checked by this function to ensure they are in the correct order or
    mapping (this is the job of _make_compatible_taxa_summaries).

    Returns a two-element tuple: the first element is a four-element tuple
    containing the correlation coefficient, parametric p-value, nonparametric
    p-value, and a tuple for the confidence interval for the overall
    comparison.

    If perform_detailed_comparisons is True, the second element is a
    correlation vector, which is a list of 8-element tuples, where the first
    element is a sample ID from ts1, the second element is a sample ID from
    ts2, the third element is the correlation coefficient computed between the
    two samples (a double), the fourth element is the parametric p-value, the
    fifth element is the Bonferroni-corrected parametric p-value, the sixth
    element is the nonparametric p-value, the seventh element is the
    Bonferroni-corrected nonparametric p-value, and the eighth element is a
    tuple containing the low and high ends of the confidence interval. If
    perform_detailed_comparisons is False, None will be returned for the second
    element.

    Arguments:
        ts1 - the first taxa summary to be compared
        ts2 - the second taxa summary to be compared
        comparison_mode - the type of comparison to perform on the two taxa
            summaries. Can be either 'paired' or 'expected'. If 'paired', each
            positional pair of samples between the two taxa summaries will be
            compared. If 'expected', each sample in ts1 will be compared to an
            'expected' sample in ts2. If 'expected', ts2 must only contain a
            single sample unless expected_sample_id is provided
        correlation_type - the type of correlation coefficient to calculate
            when comparing samples in the taxa summaries. Can be either
            'pearson' or 'spearman'
        tail_type - if 'two-sided', a two-sided test is performed. 'high'
            for a one-tailed test for positive association, or 'low' for a
            one-tailed test for negative association. This parameter affects
            both the parametric and nonparametric tests, but the confidence
            interval will always be two-sided
        num_permutations - the number of permutations to use in the
            nonparametric test. Must be a number greater than or equal to 0. If
            0, the nonparametric test will not be performed. In this case, the
            nonparametric p-values will be None
        confidence_level - the confidence level to use when constructing the
            confidence interval. Must be between 0 and 1 (exclusive)
        perform_detailed_comparisons - if True, computes the correlation
            between each pair of samples in addition to computing the overall
            correlation between taxa summaries
        expected_sample_id - the sample ID in ts2 to compare all samples in ts1
            to. If not provided, ts2 must only contain a single sample, and all
            samples in ts1 will be compared to it
    """
    # Convert our notion of tail type into the format expected by PyCogent's
    # correlation_test().
    if tail_type == 'two-sided':
        tail_type = None

    if comparison_mode != 'paired' and comparison_mode != 'expected':
        raise ValueError("Invalid comparison mode '%s'. Must be one of %r." %
                         (comparison_mode, comparison_modes))

    # Make sure that the second taxa summary has only one sample if we weren't
    # provided an expected sample ID to compare against.
    if (comparison_mode == 'expected' and expected_sample_id is None and
        len(ts2[0]) != 1):
        raise ValueError("The second taxa summary file must contain a single "
                "sample (column) to compare all samples in the first taxa "
                "summary file against when the comparison mode is 'expected' "
                "and an expected sample ID is not provided. You provided a "
                "file with %d samples."
                % len(ts2[0]))

    if comparison_mode == 'paired':
        # Make sure the number of samples match between the two files (the IDs
        # do not have to match because of the sample ID map).
        if len(ts1[0]) != len(ts2[0]):
            raise ValueError("The two taxa summaries are incompatible because "
                             "they do not have the same number of sample IDs. "
                             "The taxa summaries must be made compatible "
                             "before attempting to perform "
                             "pairwise-comparisons between samples.")

    # Make sure the taxa information is the same (i.e. the summaries have been
    # sorted and filled).
    if ts1[1] != ts2[1]:
        raise ValueError("The taxa do not match exactly between the two taxa "
                         "summary files. The taxa must be sorted and filled "
                         "before attempting to compare them.")

    # Find the index of the expected sample ID.
    if comparison_mode == 'expected':
        if expected_sample_id:
            try:
                expected_idx = ts2[0].index(expected_sample_id)
            except ValueError:
                raise ValueError("The expected sample ID '%s' is not in the "
                                 "taxa summary file." % expected_sample_id)
        else:
            # We know the 'expected' taxa summary has a single sample in it, so
            # this is the only possible index.
            expected_idx = 0

    # Compute the overall correlation between each sample and the expected
    # sample, or each of the paired samples, and optionally the correlation
    # between each pair of samples individually.
    corr_vec = None
    if perform_detailed_comparisons:
        corr_vec = []
        num_comparisons = len(ts1[0])

    all_ts1_data = []
    all_ts2_data = []
    for samp_idx, samp_id in enumerate(ts1[0]):
        if comparison_mode == 'paired':
            paired_idx = samp_idx
        elif comparison_mode == 'expected':
            paired_idx = expected_idx
        else:
            # Redundant check, but here for safety in case the one above is
            # changed or removed.
            raise ValueError("Invalid comparison mode '%s'. Must be one of "
                             "%r." % (comparison_mode, comparison_modes))

        # Grab the columns of data for the current sample and its pair.
        ts1_data = ts1[2].T[samp_idx]
        ts2_data = ts2[2].T[paired_idx]
        all_ts1_data.extend(ts1_data)
        all_ts2_data.extend(ts2_data)

        if perform_detailed_comparisons:
            # Compare the current sample and its pair.
            corr_coeff, param_p_val, unused, nonparam_p_val, conf_interval = \
                    correlation_test(ts1_data, ts2_data,
                                     method=correlation_type,
                                     tails=tail_type,
                                     permutations=num_permutations,
                                     confidence_level=confidence_level)

            # Compute the Bonferroni-corrected p-values.
            param_p_val_corr = min(param_p_val * num_comparisons, 1)
            nonparam_p_val_corr = None if nonparam_p_val is None else \
                                  min(nonparam_p_val * num_comparisons, 1)

            corr_vec.append((samp_id, ts2[0][paired_idx], corr_coeff,
                param_p_val, param_p_val_corr, nonparam_p_val,
                nonparam_p_val_corr, conf_interval))

    # Compare all paired samples at once.
    results = correlation_test(all_ts1_data, all_ts2_data,
                               method=correlation_type, tails=tail_type,
                               permutations=num_permutations,
                               confidence_level=confidence_level)
    # We don't need to return all of the permuted correlation coefficients.
    overall_corr = (results[0], results[1], results[3], results[4])
    return overall_corr, corr_vec