Beispiel #1
0
def rank_test(data, value="Yes", tails="high"):
    """Run a two-sample t-test on scores split by a label.

    Inputs:
     data - iterable of records where item 0 is the score and item 1 is the
     label. It is not modified.
     value - label that selects the first group; every record whose label
     differs goes to the second group.
     tails - passed through unchanged to mc_t_two_sample.

    Returns the tuple
     (t, parametric_p, non_parametric_p, v_dist, non_v_dist)
    where v_dist holds the scores labelled `value`, non_v_dist the remaining
    scores, and the first three items come from mc_t_two_sample.
    """
    v_dist = []
    non_v_dist = []
    # Walk a sorted copy so the groups come out in sorted order without
    # reordering the caller's sequence as a hidden side effect.
    for record in sorted(data):
        if record[1] == value:
            v_dist.append(record[0])
        else:
            non_v_dist.append(record[0])
    # The third item returned by mc_t_two_sample is not part of this
    # function's result.
    t, parametric_p, _, non_parametric_p = mc_t_two_sample(
        v_dist, non_v_dist, tails=tails)
    return t, parametric_p, non_parametric_p, v_dist, non_v_dist
Beispiel #2
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category, depth,
    test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes:
     Returns a dict whose keys are strings of the form 'treatmentA,treatmentB'
     naming the pair of treatments being compared, and whose values are
     (obs_t, p_val) tuples. The value is (None, None) when both treatments
     have exactly one sample, when either group contains a nan, or when the
     nonparametric test yields no p-value.
     The test is computed on per-sample averages over all rarefaction rows at
     the requested depth.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    Raises:
     ValueError - if num_permutations < 1 for a nonparametric test, if no
     rarefaction rows exist at the requested depth, or if test_type is not
     recognised when a comparison is actually computed.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
        rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])
    if rare_mat.size == 0:
        # Without this guard the averaging below divides by zero and then
        # fails with an unrelated-looking IndexError on a scalar.
        raise ValueError("No rarefaction data found at depth %s." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comparisons, which kills significance.
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            results[t_key] = (None, None)
            continue

        pair0_indices = [sids.index(sid) for sid in sid_pair[0]]
        pair1_indices = [sids.index(sid) for sid in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)
        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                permutations=num_permutations)
            if p_val is not None:
                p_val = float(format_p_value_for_num_iters(p_val,
                    num_iters=num_permutations))
            else:
                # None will error in format_p_val; report no result at all.
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)
    return results
def compare_alpha_diversities(rarefaction_lines, mapping_lines, 
                              category, depth, test_type='nonparametric',
                              num_permutations=999):
    """compares alpha diversities
    
    inputs:
        rarefaction_lines - lines of a rarefaction file, which gives scores
        for various rarefactions and depths
        
        mapping_lines - lines of a mapping file, which has ID's and the
        categories that the ID's fall in
        
        category - the category to be compared, is a string
        
        depth - the depth of the rarefaction file to use, is an integer

        test_type - the type of t-test to perform, is a string. Must be either
        'parametric' or 'nonparametric'

        num_permutations - the number of Monte Carlo permutations to use if
        test_type is 'nonparametric', is an integer
    
    outputs:
        results - a nested dictionary which has the category as its only
        top level key, and as its value a dictionary keyed by
        (str(value_a), str(value_b)) tuples, one per pair of category
        values, holding the (obs_t, p_val) tuple of that comparison. For
        the nonparametric test p_val is the output of
        format_p_value_for_num_iters, or (None, None) is stored when no
        p-value was produced.

    raises:
        ValueError - if num_permutations < 1 for a nonparametric test, or if
        test_type is not recognised when a comparison is computed.
    
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)
     
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)
    
    category_values_Ids = make_category_values_Id_dict(mapping_data, 
                                                       category)
    
    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                    category_values_Ids)
    
    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
                                                       rarefaction_data)
    
    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
                                                       rarefaction_data)
    
    comparisons = {}
    
    for pair_idx, id_pair in enumerate(SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(id_pair[0],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()
        
        j = convert_SampleIds_to_rarefaction_mtx(id_pair[1],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                 permutations=num_permutations)
            if p_val is None:
                # NOTE(review): assumes mc_t_two_sample can return a None
                # p-value and that the formatter cannot handle it; report
                # the comparison as having no result instead - confirm.
                obs_t, p_val = None, None
            else:
                p_val = format_p_value_for_num_iters(p_val, num_permutations)
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        # value_pairs is indexed in step with SampleId_pairs.
        value_pair = value_pairs[pair_idx]
        comparisons[(str(value_pair[0]), str(value_pair[1]))] = obs_t, p_val
    return {category: comparisons}