Example #1
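The functions in these examples appear to be excerpted from the QIIME 1.x codebase and are shown without their imports. A minimal sketch of the imports they would rely on, assuming the QIIME 1.x / PyCogent module layout (the module paths are assumptions, not confirmed by the snippets; helpers such as distances_by_groups, group_by_field, within_category_distances_grouped, get_random_dists, sampleId_pairs and get_per_sample_average_diversities are defined alongside these functions in their own QIIME modules and are not imported here):

# Assumed imports for the examples below (QIIME 1.x / PyCogent layout; best guess).
from os import path, mkdir
from numpy import array, average, isnan, min as np_min
from cogent.maths.stats.test import t_two_sample, mc_t_two_sample
from qiime.parse import (parse_mapping_file, parse_mapping_file_to_dict,
                         parse_distmat, parse_rarefaction)
from qiime.format import format_p_value_for_num_iters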
def monte_carlo_group_distances_within_between(
        single_field, paired_field, dmat, dir_prefix='',
        subdir_prefix='monte_carlo_group_distances', num_iters=10):
    """Calculate Monte Carlo stats within and between fields.
    
    Specifically:
    - find the groups for each specified column (or combination of columns)
    - do a t-test between each pair of groups
    - randomize the matrix n times and find the empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """

    path_prefix = path.join(dir_prefix, subdir_prefix)
    # create the output directory if it doesn't already exist
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    real_dists = []
    within_category_distances = \
        within_category_distances_grouped(single_field, label_suffix='')
    real_dists.extend([['Within', field, distances]
                       for field, distances in within_category_distances.items()])

    between_category_distances = \
        between_category_distances_grouped(single_field, label_suffix='')
    real_dists.extend([['Between', field, distances]
                       for field, distances in between_category_distances.items()])

    within_and_between = within_and_between_fields(paired_field)
    real_dists.extend([[field.split('_', 1)[0], field.split('_', 1)[1], distances]
                       for field, distances in within_and_between.items()])

    outfile = open(
        path.join(path_prefix, 'group_distances_within_and_between.txt'), 'w')
    outfile.write('\t'.join(['Comparison', 'Category_1', 'Avg',
                             'Comparison', 'Category_2', 'Avg', 't', 'p',
                             'p_greater', 'p_less', 'Iterations\n']))

    rand_distances = get_random_dists(real_dists, dmat, num_iters)

    # iterate over the groups
    for i, (first_g1, second_g1, distances_g1) in enumerate(real_dists[:-1]):
        real_dist_1 = average(distances_g1)
        rand_dists_1 = [rand_distances[n][i][-1] for n in range(num_iters)]
        # then compare against each later group (never the same group twice)
        for j in range(i + 1, len(real_dists)):
            first_g2, second_g2, distances_g2 = real_dists[j]
            real_dist_2 = average(distances_g2)
            rand_dists_2 = [rand_distances[n][j][-1]
                            for n in range(num_iters)]
            ttests = [t_two_sample(rand_dists_1[n], rand_dists_2[n])[0]
                      for n in range(num_iters)]
            real_ttest = t_two_sample(distances_g1, distances_g2)
            curr_line = [first_g1, second_g1, real_dist_1,
                         first_g2, second_g2, real_dist_2]
            curr_line.extend([real_ttest[0], real_ttest[1],
                              (array(ttests) > real_ttest[0]).sum() / float(num_iters),
                              (array(ttests) < real_ttest[0]).sum() / float(num_iters),
                              num_iters])
            outfile.write('\t'.join(map(str, curr_line)))
            outfile.write('\n')
    outfile.close()
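The p_greater and p_less columns written above are one-sided empirical p-values: the fraction of the num_iters randomized t statistics that fall above (or below) the observed t. A minimal sketch of that step in isolation, using made-up numbers:

from numpy import array

real_t = 2.1                 # t statistic from the real grouping (made up)
rand_ts = array([0.3, -1.2, 2.5, 0.8, -0.4, 1.9, 2.7, 0.1, -2.0, 1.1])
num_iters = len(rand_ts)

p_greater = (rand_ts > real_t).sum() / float(num_iters)  # randomized t above the real t
p_less = (rand_ts < real_t).sum() / float(num_iters)     # randomized t below the real t
print(p_greater, p_less)  # 0.2 0.8 for these numbers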
Example #2
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes: 
     Returns a defaultdict which as keys has the pairs of treatments being 
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.     
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use 
     the deepest available in the file. 
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # Extract only the rows of the rarefaction data at the given depth.
    # If depth is not given, default to the deepest rarefaction available
    # (the rarefaction file is not guaranteed to be ordered by depth).
    if depth is None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each column of the rarefaction matrix: the t-test is computed on
    # the average over all iterations, which avoids extra comparisons that
    # would hurt significance.
    rare_mat = (rare_mat.sum(0) /
                rare_mat.shape[0])[2:]  # remove depth, iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # If there is only one sample for each treatment in a comparison, the
        # MC method would error (e.g. mc_t_two_sample([1], [1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
        else:
            pair0_indices = [sids.index(i) for i in sid_pair[0]]
            pair1_indices = [sids.index(i) for i in sid_pair[1]]
            i = rare_mat.take(pair0_indices)
            j = rare_mat.take(pair1_indices)
            # found discussion of how to quickly check an array for nan here:
            # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                else:  # a p-value of None would error in format_p_value_for_num_iters
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)
    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # Calculate the alpha diversity average and std. Looking only at the
        # first treatment of each pair does not guarantee full coverage, so
        # examine both treatments.
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # skip treatments that have already been computed and added
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(i) for i in sid_list])
                ad_mean = alphadiv_vals.mean()
                ad_std = alphadiv_vals.std()
                alphadiv_avgs[treatment_str] = (ad_mean, ad_std)
    return ttest_results, alphadiv_avgs
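A hedged usage sketch for this function, assuming a collated rarefaction file and a QIIME mapping file on disk (the file names, the 'Treatment' category, and the tuple structure of the result keys are placeholders/assumptions):

rarefaction_lines = open('alpha_rarefaction/observed_species.txt', 'U').readlines()
mapping_lines = open('map.txt', 'U').readlines()

ttest_results, alphadiv_avgs = compare_alpha_diversities(
    rarefaction_lines, mapping_lines,
    category='Treatment',        # mapping column to compare
    depth=None,                  # None -> deepest rarefaction depth in the file
    test_type='nonparametric',
    num_permutations=999)

# keys are assumed to be (treatment_1, treatment_2) pairs, values (t, p) tuples
for (t1, t2), (t_stat, p_val) in ttest_results.items():
    print('%s vs %s: t=%s, p=%s' % (t1, t2, t_stat, p_val))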
Example #3
def monte_carlo_group_distances(
        mapping_file, dmatrix_file, prefs, dir_prefix='',
        subdir_prefix='monte_carlo_group_distances',
        default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.
    
    Specifically:
    - find the groups for each specified column (or combination of columns)
    - do a t-test between each pair of groups
    - randomize the matrix n times and find the empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file,'U'))

    orig_distance_matrix = distance_matrix.copy()

    path_prefix = path.join(dir_prefix, subdir_prefix)

    # create the output directory if it doesn't already exist
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        outfile = open(
            path.join(path_prefix, 'group_distances_' + field + '.txt'), 'w')
        outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                 'Category_2a', 'Category_2b', 'Avg', 't', 'p',
                                 'p_greater', 'p_less', 'Iterations\n']))
        real_dists = distances_by_groups(distance_header, distance_matrix,
                                         groups)

        # iterate over the groups
        for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):

            real_dist_1 = average(distances_g1)

            # then compare against each later group (never the same group twice)
            for j in range(i + 1, len(real_dists)):
                first_g2, second_g2, distances_g2 = real_dists[j]

                real_dist_2 = average(distances_g2)

                # permute distances just within these two groups
                rand_dists_1, rand_dists_2 = \
                    permute_between_groups(distances_g1,
                                           distances_g2,
                                           num_iters)

                ttests = [t_two_sample(rand_dists_1[n].flatten(),
                                       rand_dists_2[n].flatten())[0]
                          for n in range(num_iters)]
                real_ttest = t_two_sample(distances_g1.flatten(),
                                          distances_g2.flatten())
                curr_line = [first_g1, second_g1, real_dist_1,
                             first_g2, second_g2, real_dist_2]
                curr_line.extend([real_ttest[0], real_ttest[1],
                                  (array(ttests) > real_ttest[0]).sum() / float(num_iters),
                                  (array(ttests) < real_ttest[0]).sum() / float(num_iters),
                                  num_iters])
                outfile.write('\t'.join(map(str, curr_line)))
                outfile.write('\n')
        outfile.close()
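A hedged usage sketch for this function (file and field names are placeholders); passing prefs=None lets the function build default Monte Carlo prefs from the requested fields:

monte_carlo_group_distances(
    mapping_file='map.txt',                       # QIIME mapping file
    dmatrix_file='unweighted_unifrac_dm.txt',     # symmetric, hollow distance matrix
    prefs=None,                                   # None -> defaults built from `fields`
    dir_prefix='group_distances_out',
    default_iters=100,                            # Monte Carlo iterations per field
    fields=['Treatment'])                         # mapping column(s) to group by
# results are written to
# group_distances_out/monte_carlo_group_distances/group_distances_<field>.txt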
def compare_alpha_diversities(rarefaction_lines,
                              mapping_lines,
                              category,
                              depth=None,
                              test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    
    Notes: 
     Returns a defaultdict which as keys has the pairs of treatments being 
     compared, and as values, lists of (pval,tval) tuples for each comparison at
     for a given iteration.     
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines. 
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use 
     the deepest available in the file. 
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # If there is only one sample for each treatment in a comparison, the
        # MC method would error (e.g. mc_t_two_sample([1], [1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # Add alpha diversity averages and standard deviations. Since there
            # is only a single sample per treatment in this branch, record that
            # sample's average diversity as the mean and 0 as the std.
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # conduct tests
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                else:  # a p-value of None would error in format_p_value_for_num_iters
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
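Unlike the earlier variant in Example #2, this version delegates per-sample averaging to get_per_sample_average_diversities and also returns the per-treatment (mean, std) of average alpha diversity. A hedged sketch of reporting that second return value (file names and the depth are placeholders):

rarefaction_lines = open('alpha_rarefaction/PD_whole_tree.txt', 'U').readlines()
mapping_lines = open('map.txt', 'U').readlines()

ttest_results, ad_avgs = compare_alpha_diversities(
    rarefaction_lines, mapping_lines, category='Treatment',
    depth=100)   # must be a depth present in the rarefaction file

for treatment, (ad_mean, ad_std) in ad_avgs.items():
    print('%s: mean alpha diversity %s (std %s)' % (treatment, ad_mean, ad_std))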