def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
       
    sample_id_map_fp = opts.sample_id_map_fp
    if sample_id_map_fp:
        sample_id_map = dict([(k,v[0]) \
         for k,v in fields_to_dict(open(sample_id_map_fp, "U")).items()])
    else:
        sample_id_map = None
    
    input_dm_fps = opts.input_dms.split(',')
    output_f = open(opts.output_fp,'w')
    output_f.write(comment)
    output_f.write('DM1\tDM2\tNumber of entries\tMantel p-value\n')
    num_iterations = opts.num_iterations
    for i,fp1 in enumerate(input_dm_fps):
        for fp2 in input_dm_fps[i+1:]:
            (dm1_labels, dm1), (dm2_labels, dm2) =\
             make_compatible_distance_matrices(parse_distmat(open(fp1,'U')),
                                               parse_distmat(open(fp2,'U')),
                                               lookup=sample_id_map)
            if len(dm1_labels) < 2:
                output_f.write('%s\t%s\t%d\tToo few samples\n' % (fp1,fp2,len(dm1_labels)))
                continue
            p = mantel(dm1,dm2,n=num_iterations)
            p_str = format_p_value_for_num_iters(p,num_iterations)
            output_f.write('%s\t%s\t%d\t%s\n' % (fp1,fp2,len(dm1_labels),p_str))
    output_f.close()
def compute_ordination_correlation(map_f, coord_f, category, axis=1,
                                   correlation_type='pearson',
                                   num_permutations=999):
    if correlation_type not in CORRELATION_TYPES:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, CORRELATION_TYPES))
    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)

    coords_samp_ids, coords, _, _ = parse_coords(coord_f)
    num_axes = len(coords[0])
    if axis < 1 or axis > num_axes:
        raise ValueError("Invalid axis number %d. Must be greater than zero "
                         "and less than or equal to the number of axes in the "
                         "input coordinates file (found %d axes)." %
                         (axis, num_axes))
    axis_data = coords[:, axis - 1]

    mdm, _ = parse_mapping_file_to_dict(map_f)
    gradient_data = []
    for samp_id in coords_samp_ids:
        if category not in mdm[samp_id]:
            raise ValueError("Category '%s' does not exist in the input "
                             "mapping file." % category)

        md_value = mdm[samp_id][category]
        try:
            md_value = float(md_value)
        except ValueError:
            raise ValueError("The category state '%s' could not be converted "
                             "to a number. All states in the '%s' category "
                             "must be numeric." % (md_value, category))
        gradient_data.append(md_value)

    corr_coeff, param_p_val, _, nonparam_p_val, _ = \
            correlation_test(axis_data, gradient_data, method=correlation_type,
                             permutations=num_permutations)

    if num_permutations > 0:
        nonparam_p_val = format_p_value_for_num_iters(nonparam_p_val,
                                                      num_permutations)
    else:
        nonparam_p_val = 'N/A'

    return corr_coeff, param_p_val, nonparam_p_val
Ejemplo n.º 3
0
 def test_format_p_value_for_num_iters(self):
     """ format_p_value_for_num_iters functions as expected """
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 100),
                      "0.12")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 250),
                      "0.12")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 1000),
                      "0.119")
     # test num_iters too low still returns a string (this can
     # be the last step of a long process, so we don't want to fail)
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 9),
                      "Too few iters to compute p-value (num_iters=9)")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 1),
                      "Too few iters to compute p-value (num_iters=1)")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 0),
                      "Too few iters to compute p-value (num_iters=0)")
Ejemplo n.º 4
0
 def test_format_p_value_for_num_iters(self):
     """ format_p_value_for_num_iters functions as expected """
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 100), "0.12")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 250), "0.12")
     self.assertEqual(format_p_value_for_num_iters(0.119123123123, 1000), "0.119")
     # test num_iters too low still returns a string (this can
     # be the last step of a long process, so we don't want to fail)
     self.assertEqual(
         format_p_value_for_num_iters(0.119123123123, 9), "Too few iters to compute p-value (num_iters=9)"
     )
     self.assertEqual(
         format_p_value_for_num_iters(0.119123123123, 1), "Too few iters to compute p-value (num_iters=1)"
     )
     self.assertEqual(
         format_p_value_for_num_iters(0.119123123123, 0), "Too few iters to compute p-value (num_iters=0)"
     )
Ejemplo n.º 5
0
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Notes:
     Returns a defaultdict which as keys has the pairs of treatments being
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
     the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, and mc
        # using mc method, will error (e.g. mc_t_two_sample([1],[1]).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # add alpha diversity averages and standard deviations. since their
            # is only a single sample if we are in this part of the loop, we can
            # just record the sample value as the avg and 0 as the std.
            ad_avgs[treatment_pair[0]] = (sid_pair[0][0], 0.)
            ad_avgs[treatment_pair[1]] = (sid_pair[1][0], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # conduct tests
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                elif p_val is None:  # None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
Ejemplo n.º 6
0
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes: 
     Returns a defaultdict which as keys has the pairs of treatments being 
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.     
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use 
     the deepest available in the file. 
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth == None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills signifigance.
    rare_mat = (rare_mat.sum(0) /
                rare_mat.shape[0])[2:]  #remove depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, and mc
        # using mc method, will error (e.g. mc_t_two_sample([1],[1]).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
        else:
            pair0_indices = [sids.index(i) for i in sid_pair[0]]
            pair1_indices = [sids.index(i) for i in sid_pair[1]]
            i = rare_mat.take(pair0_indices)
            j = rare_mat.take(pair1_indices)
            # found discussion of how to quickly check an array for nan here:
            # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val != None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                elif p_val == None:  #None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)
    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # calculate the alpha diversity average, std vals. choosing only first
        # treatment pair doesn't guarantees full covering, must look at both
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # check if already computed and added
            if not treatment_str in alphadiv_avgs.keys():
                alphadiv_vals = \
                    rare_mat.take([sids.index(i) for i in sid_list])
                ad_mean = alphadiv_vals.mean()
                ad_std = alphadiv_vals.std()
                alphadiv_avgs[treatment_str] = (ad_mean, ad_std)
    return ttest_results, alphadiv_avgs
Ejemplo n.º 7
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials != None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')
        
    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions
    
    if not exists(output_dir): 
        makedirs(output_dir)
    
    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None
  
    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    input_fp1_dir, input_fn1 = split(input_fp1)
    input_fp1_basename, input_fp1_ext = splitext(input_fn1)
    input_fp2_dir, input_fn2 = split(input_fp2)
    input_fp2_basename, input_fp2_ext = splitext(input_fn2)
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
     (output_dir,input_fp1_basename,input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir
    
    if sample_id_map_fp:
        sample_id_map = dict([(k,v[0]) \
         for k,v in fields_to_dict(open(sample_id_map_fp, "U")).items()])
    else:
        sample_id_map = None
    
    transformed_coords1, transformed_coords2, m_squared, randomized_coords2 =\
      get_procrustes_results(open(input_fp1,'U'),\
                             open(input_fp2,'U'),\
                             sample_id_map=sample_id_map,\
                             randomize=False,
                             max_dimensions=num_dimensions)
    output_matrix1_f = open(output_matrix1_fp,'w')
    output_matrix1_f.write(transformed_coords1)
    output_matrix1_f.close()
    output_matrix2_f = open(output_matrix2_fp,'w')
    output_matrix2_f.write(transformed_coords2)
    output_matrix2_f.close()
    
    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        coords_f1 = list(open(input_fp1,'U'))
        coords_f2 = list(open(input_fp2,'U'))
        actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
         procrustes_monte_carlo(coords_f1,\
                                coords_f2,\
                                trials=random_trials,\
                                max_dimensions=num_dimensions,
                                sample_id_map=sample_id_map,
                                trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append('%s %s %s %s %d %1.3f' %\
         (input_fp1, input_fp2, str(max_dims_str), mc_p_value_str,\
          count_better, actual_m_squared))
        f = open(output_summary_fp,'w')
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
        f.close()
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category, 
    depth=None, test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes: 
     Returns a defaultdict which as keys has the pairs of treatments being 
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.     
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use 
     the deepest available in the file. 
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)
    
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data, 
        rarefaction_data, category)
    
    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth == None:
        depth = array(rarefaction_data[3])[:,0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])
    
    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills signifigance. 
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    
    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, and mc
        # using mc method, will error (e.g. mc_t_two_sample([1],[1]).
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            ttest_results[treatment_pair]= (None,None)
        else:
            pair0_indices = [sids.index(i) for i in sid_pair[0]]
            pair1_indices = [sids.index(i) for i in sid_pair[1]]
            i = rare_mat.take(pair0_indices)
            j = rare_mat.take(pair1_indices)
            # found discussion of how to quickly check an array for nan here:
            # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair]= (None,None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i,j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(i,j, 
                    permutations=num_permutations)
                if p_val != None: 
                    p_val = float(format_p_value_for_num_iters(p_val, 
                        num_iters=num_permutations))
                elif p_val ==  None: #None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair]= (obs_t,p_val)
    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # calculate the alpha diversity average, std vals. choosing only first
        # treatment pair doesn't guarantees full covering, must look at both
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # check if already computed and added
            if not treatment_str in alphadiv_avgs.keys():
                alphadiv_vals = \
                    rare_mat.take([sids.index(i) for i in sid_list])
                ad_mean = alphadiv_vals.mean()
                ad_std = alphadiv_vals.std()
                alphadiv_avgs[treatment_str] = (ad_mean, ad_std) 
    return ttest_results, alphadiv_avgs
Ejemplo n.º 9
0
def run_mantel_correlogram(fps,
                           distmats,
                           num_perms,
                           comment,
                           alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment + 'DM1\tDM2\tNumber of entries\t' + \
                       'Number of permutations\tClass index\t' + \
                       'Number of distances\tMantel r statistic\t' + \
                       'p-value\tp-value (Bonferroni corrected)\tTail type\n'
    correlogram_fps = []
    correlograms = []

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it with
            # the specified number of permutations.
            mc = MantelCorrelogram(
                dm1,
                dm2,
                alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have based
                # on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    result += '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        fp1, fp2, len(dm1_labels), num_perms, class_idx,
                        num_dist, r, p_str, p_corr_str, tail_type)
                    first_time = False
                else:
                    result += '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        class_idx, num_dist, r, p_str, p_corr_str, tail_type)
    return result, correlogram_fps, correlograms
Ejemplo n.º 10
0
def run_mantel_test(method, fps, distmats, num_perms, tail_type, comment,
                    control_dm_fp=None, control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment

    if method == 'mantel':
        result += 'DM1\tDM2\tNumber of entries\tMantel r statistic\t' + \
                  'p-value\tNumber of permutations\tTail type\n'
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        result += 'DM1\tDM2\tCDM\tNumber of entries\t' + \
            'Mantel r statistic\tp-value\tNumber of permutations\t' +\
            'Tail type\n'
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    result += '%s\t%s\t%s\t%d\tToo few samples\n' % (fp1,
                                                                     fp2, control_dm_fp, len(dm1_labels))
                    continue
            elif len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                result += "%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (fp1, fp2,
                                                              len(dm1_labels), results[
                                                                  'r_value'], p_str,
                                                              num_perms, tail_type)
            elif method == 'partial_mantel':
                cdm = SymmetricDistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                result += "%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (fp1, fp2,
                                                                  control_dm_fp, len(
                                                                      dm1_labels),
                                                                  results['mantel_r'], p_str, num_perms, 'greater')
    return result
Ejemplo n.º 11
0
def run_mantel_correlogram(fps, distmats, num_perms, comment, alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment + 'DM1\tDM2\tNumber of entries\t' + \
                       'Number of permutations\tClass index\t' + \
                       'Number of distances\tMantel r statistic\t' + \
                       'p-value\tp-value (Bonferroni corrected)\tTail type\n'
    correlogram_fps = []
    correlograms = []

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it with
            # the specified number of permutations.
            mc = MantelCorrelogram(dm1, dm2, alpha=alpha,
                                   variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have based
                # on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(p_corr,
                                                              num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    result += '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        fp1, fp2, len(dm1_labels), num_perms, class_idx,
                        num_dist, r, p_str, p_corr_str, tail_type)
                    first_time = False
                else:
                    result += '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (class_idx,
                                                                    num_dist, r, p_str, p_corr_str, tail_type)
    return result, correlogram_fps, correlograms
Ejemplo n.º 12
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Notes:
     Returns a defaultdict which as keys has the pairs of treatments being
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
     the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, and mc
        # using mc method, will error (e.g. mc_t_two_sample([1],[1]).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # add alpha diversity averages and standard deviations. since their
            # is only a single sample if we are in this part of the loop, we can
            # just record the sample value as the avg and 0 as the std.
            ad_avgs[treatment_pair[0]] = (sid_pair[0][0], 0.)
            ad_avgs[treatment_pair[1]] = (sid_pair[1][0], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # conduct tests
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                     permutations=num_permutations)
                if p_val is not None:
                    p_val = float(format_p_value_for_num_iters(p_val,
                                                               num_iters=num_permutations))
                elif p_val is None:  # None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
Ejemplo n.º 13
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the mapping file and distance matrix.
    md_map = MetadataMap.parseMetadataMap(open(opts.mapping_file,'U'))
    dm = DistanceMatrix.parseDistanceMatrix(open(opts.input_dm,'U'))

    # Separate all categories into a list, then grab the first category.
    categories = opts.categories.split(',')

    # Cursory check to make sure all categories passed in are in mapping file.
    maps = parse_mapping_file(open(opts.mapping_file,'U').readlines())
    for category in categories:
        if not category in maps[1][1:]:
            option_parser.error("Category '%s' not found in mapping file "
                                "columns:" % category)

    # Make sure the input distance matrix is symmetric and hollow. Must check
    # here before allowing R to use it, as R will silently ignore the diagonal
    # and upper triangle of the distance matrix.
    if not dm.is_symmetric_and_hollow():
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    # Figure out which method we need to run.
    if opts.method == 'adonis':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "adonis.r", output_dir=opts.output_dir)
    elif opts.method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(opts.num_permutations)

        output_file = open(opts.output_dir + "/" + opts.method + \
            "_results.txt", "w+")
        output_file.write("Method Name\tR-value\tP-value")
        output_file.write("\n")
        output_file.write(anosim_results["method_name"]+"\t"+\
            str(anosim_results["r_value"])+"\t"+\
            str(anosim_results["p_value"])+"\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'best':
        bioenv = BioEnv(dm, md_map, categories)
        bioenv_results = bioenv()

        output_file = open(opts.output_dir+"/best_results.txt", 'w+')
        output_file.write("Method Name:\tNum_Vars:\t")
        output_file.write("\n")
        output_file.write(bioenv_results["method_name"]+"\t"+\
            str(bioenv_results["num_vars"]) + "\t")
        output_file.write("\n")
        output_file.write("Variables:\t")
        output_file.write("\n")
        for variable in bioenv_results["vars"]:
            output_file.write(str(variable) + "\t")
        output_file.write("\n")
        output_file.write("RHO_Values:\t")
        output_file.write("\n")
        for rho_val in bioenv_results["bioenv_rho_vals"]:
            output_file.write(str(rho_val) + "\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'morans_i':
        command_args = ["-i " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir]
        rex = RExecutor()
        rex(command_args, "morans_i.r", output_dir=opts.output_dir)
    elif opts.method == 'mrpp':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + \
            " -n " + str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "mrpp.r", output_dir=opts.output_dir)
    elif opts.method == 'permanova':
        permanova_plain = Permanova(md_map, dm, categories[0])
        permanova_results = permanova_plain(opts.num_permutations)

        output_file = open(opts.output_dir+"/permanova_results.txt", 'w+')
        output_file.write("Method Name\tF-value\tP-value")
        output_file.write("\n")
        output_file.write(permanova_results["method_name"]+"\t"+\
            str(permanova_results["f_value"]) + "\t" + \
            format_p_value_for_num_iters(permanova_results["p_value"], \
            opts.num_permutations)+"\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'permdisp':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "permdisp.r", output_dir=opts.output_dir)
    elif opts.method == 'dbrda':
        command_args = ["-i " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "dbrda.r", output_dir=opts.output_dir)
Ejemplo n.º 14
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category, depth,
    test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes: 
     Returns a defaultdict which as keys has the pairs of treatments being 
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.     
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)
     
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data, 
        rarefaction_data, category)
    
    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])
    
    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills signifigance. 
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, and mc
        # using mc method, will error (e.g. mc_t_two_sample([1],[1]).
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
            results[t_key]= (None,None)
        else:
            pair0_indices = [sids.index(i) for i in sid_pair[0]]
            pair1_indices = [sids.index(i) for i in sid_pair[1]]
            t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
            i = rare_mat.take(pair0_indices)
            j = rare_mat.take(pair1_indices)
            # found discussion of how to quickly check an array for nan here:
            # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
            if isnan(np_min(i)) or isnan(np_min(j)):
                results[t_key]= (None,None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i,j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(i,j, 
                    permutations=num_permutations)
                if p_val != None: 
                    p_val = float(format_p_value_for_num_iters(p_val, 
                        num_iters=num_permutations))
                elif p_val ==  None: #None will error in format_p_val
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            results[t_key]= (obs_t,p_val)
    return results
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials
    
    if random_trials != None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')
    
    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
       option_parser.error('If providing sample id maps, there must be exactly'
                           ' one fewer sample id maps than input coordinate'
                           ' matrices.')
    
    if not exists(output_dir): 
        makedirs(output_dir)
  
    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir,'procrustes_results.txt')
    summary_file_lines = \
     ['#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
      '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.']
    
    for i,query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(output_dir,
         '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(output_dir,\
         '%s_transformed_q%d.txt' % (query_input_fp_basename, i+1))
        
        if sample_id_map_fps:
            sample_id_map = dict([(k,v[0]) \
             for k,v in fields_to_dict(open(sample_id_map_fps[i], "U")).items()])
        else:
            sample_id_map = None
        
        transformed_coords1, transformed_coords2, m_squared, randomized_coords2 =\
          get_procrustes_results(open(reference_input_fp,'U'),\
                                 open(query_input_fp,'U'),\
                                 sample_id_map=sample_id_map,\
                                 randomize=False,
                                 max_dimensions=num_dimensions)
        
        output_matrix1_f = open(output_matrix1_fp,'w')
        output_matrix1_f.write(transformed_coords1)
        output_matrix1_f.close()
        output_matrix2_f = open(output_matrix2_fp,'w')
        output_matrix2_f.write(transformed_coords2)
        output_matrix2_f.close()
        
        if random_trials:
            if opts.store_trial_details:
                trial_output_dir = join(output_dir,'trial_details_%d' % i+2)
            else:
                trial_output_dir = None
            coords_f1 = list(open(reference_input_fp,'U'))
            coords_f2 = list(open(query_input_fp,'U'))
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
             procrustes_monte_carlo(coords_f1,
                                    coords_f2,
                                    trials=random_trials,
                                    max_dimensions=num_dimensions,
                                    sample_id_map=sample_id_map,
                                    trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
            summary_file_lines.append('%s\t%s\t%s\t%s\t%d\t%1.3f' %\
             (reference_input_fp, query_input_fp, max_dims_str, mc_p_value_str,\
              count_better, actual_m_squared))
        else:
            summary_file_lines.append('%s\t%s\t%s\tNA\tNA\t%1.3f' %\
             (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    f = open(output_summary_fp,'w')
    f.write('\n'.join(summary_file_lines))
    f.write('\n')
    f.close()
Ejemplo n.º 16
0
def run_mantel_test(method,
                    fps,
                    distmats,
                    num_perms,
                    tail_type,
                    comment,
                    control_dm_fp=None,
                    control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment

    if method == 'mantel':
        result += 'DM1\tDM2\tNumber of entries\tMantel r statistic\t' + \
                  'p-value\tNumber of permutations\tTail type\n'
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        result += 'DM1\tDM2\tCDM\tNumber of entries\t' + \
            'Mantel r statistic\tp-value\tNumber of permutations\t' +\
            'Tail type\n'
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    result += '%s\t%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, control_dm_fp, len(dm1_labels))
                    continue
            elif len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                result += "%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, len(dm1_labels), results['r_value'], p_str,
                    num_perms, tail_type)
            elif method == 'partial_mantel':
                cdm = SymmetricDistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                result += "%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, control_dm_fp, len(dm1_labels),
                    results['mantel_r'], p_str, num_perms, 'greater')
    return result
Ejemplo n.º 17
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict([
                    (k, v[0]) for k, v in fields_to_dict(f).iteritems()
                ])
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                trial_output_dir = join(output_dir, 'trial_details_%d' % i + 2)
            else:
                trial_output_dir = None
            coords_f1 = open(reference_input_fp, 'U')
            coords_f2 = open(query_input_fp, 'U')
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
                procrustes_monte_carlo(coords_f1,
                                       coords_f2,
                                       trials=random_trials,
                                       max_dimensions=num_dimensions,
                                       sample_id_map=sample_id_map,
                                       trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
Ejemplo n.º 18
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, 
                              category, depth, test_type='nonparametric',
                              num_permutations=999):
    """compares alpha diversities
    
    inputs:
        rarefaction_file - rarefaction file which gives scores for 
        various rarefactions and depths
        
        mapping_file - file that has ID's and categories that the ID's
        fall in
        
        category - the category to be compared, is a string
        
        depth - the depth of the rarefaction_file to use, is an integer

        test_type - the type of t-test to perform, is a string. Must be either
        'parametric' or 'nonparametric'

        num_permutations - the number of Monte Carlo permutations to use if
        test_type is 'nonparametric', is an integer
    
    outputs:
        results - a nested dictionary which specifies the category as
        the top level key, and as its value, dictionaries which give the
        results of the t_two_sample test for all unique pairs of values
        in the specified category
    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)
     
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)
    
    category_values_Ids = make_category_values_Id_dict(mapping_data, 
                                                       category)
    
    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                    category_values_Ids)
    
    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
                                                       rarefaction_data)
    
    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
                                                       rarefaction_data)
    
    results = {category:{}}
    
    for pair in range(len(SampleId_pairs)):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][0],
                                                reduced_rarefaction_mtx,
                                                map_from_Id_to_col)).flatten()
        
        j=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][1],
                                                reduced_rarefaction_mtx,
                                                map_from_Id_to_col)).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i,j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i,j,
                                                 permutations=num_permutations)
            p_val = format_p_value_for_num_iters(p_val, num_permutations)
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        results[category][(str(value_pairs[pair][0]),
                           str(value_pairs[pair][1]))] = obs_t, p_val
    return results