Example #1
0
def group_distances(mapping_file,
                    dmatrix_file,
                    fields,
                    dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Distances are grouped once per field in ``fields`` and once per pair of
    fields (each field is also paired with itself). Both result sets are
    passed to write_distance_files.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        fields: non-empty sequence of mapping field names.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: base name forwarded to write_distance_files with
            '_single' or '_pairs' appended.

    Returns:
        A tuple (single_field, paired_field, distance_matrix).
        single_field maps each field name (with '#' removed) to the result
        of distances_by_groups for that field; paired_field maps
        '<fieldi>_to_<fieldj>' to the result for that pair of fields.

    Raises:
        ValueError: if ``fields`` is empty or None.
    """
    # Context managers guarantee both input handles are closed after parsing.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if not fields:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.'
        )

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = distances_by_groups(
            distance_header, distance_matrix, groups)

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            paired_field[fieldi + '_to_' + fieldj] = distances_by_groups(
                distance_header, distance_matrix, groups)

    # Every pair is written under its own '<fieldi>_to_<fieldj>' key.  The key
    # must be built from the pair itself: using the leftover loop variable of
    # the single-field loop would label every entry with the last field and
    # make entries for the same fieldi overwrite one another.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Example #2
0
 def test_group_by_fields(self):
     """group_by_fields should map each (age, loc) value pair to its samples"""
     table = [
         ['#sample', 'loc', 'age', 'mal'],
         ['a', 'US', '5', 'n'],
         ['b', 'US', '10', 'n'],
         ['c', 'Mal', '5', 'y'],
         ['d', 'Mal', '10', 'n'],
         ['e', 'Mal', '5', 'y'],
     ]
     # Keys follow the order of the requested fields: (age, loc).
     expected = {
         ('5', 'US'): ['a'],
         ('10', 'US'): ['b'],
         ('5', 'Mal'): ['c', 'e'],
         ('10', 'Mal'): ['d'],
     }
     observed = group_by_fields(table, ['age', 'loc'])
     self.assertEqual(observed, expected)
Example #3
0
 def test_group_by_fields(self):
     """group_by_fields should map each (age, loc) value pair to its samples"""
     rows = [
         ['#sample', 'loc', 'age', 'mal'],
         ['a', 'US', '5', 'n'],
         ['b', 'US', '10', 'n'],
         ['c', 'Mal', '5', 'y'],
         ['d', 'Mal', '10', 'n'],
         ['e', 'Mal', '5', 'y'],
     ]
     # Keys follow the order of the requested fields: (age, loc).
     wanted = {
         ('5', 'US'): ['a'],
         ('10', 'US'): ['b'],
         ('5', 'Mal'): ['c', 'e'],
         ('10', 'Mal'): ['d'],
     }
     grouped = group_by_fields(rows, ['age', 'loc'])
     self.assertEqual(grouped, wanted)
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Distances are grouped once per field in ``fields`` and once per pair of
    fields (each field is also paired with itself). Both result sets are
    passed to write_distance_files.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        fields: non-empty sequence of mapping field names.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: base name forwarded to write_distance_files with
            '_single' or '_pairs' appended.

    Returns:
        A tuple (single_field, paired_field, distance_matrix).
        single_field maps each field name (with '#' removed) to the result
        of distances_by_groups for that field; paired_field maps
        '<fieldi>_to_<fieldj>' to the result for that pair of fields.

    Raises:
        ValueError: if ``fields`` is empty or None.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_handle:
        mapping, header, comments = parse_mapping_file(mapping_handle)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_handle:
        distance_header, distance_matrix = parse_distmat(dmatrix_handle)

    if not fields:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    # Each pair is written under its own '<fieldi>_to_<fieldj>' key; keying
    # on the leftover variable of the single-field loop would mislabel the
    # entries and let pairs sharing fieldi overwrite each other.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    Distances are grouped once per field in ``fields`` and once per pair of
    fields (each field is also paired with itself). Both result sets are
    passed to write_distance_files.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        fields: non-empty sequence of mapping field names.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: base name forwarded to write_distance_files with
            '_single' or '_pairs' appended.

    Returns:
        A tuple (single_field, paired_field, distance_matrix).
        single_field maps each field name (with '#' removed) to the result
        of distances_by_groups for that field; paired_field maps
        '<fieldi>_to_<fieldj>' to the result for that pair of fields.

    Raises:
        ValueError: if ``fields`` is empty or None.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if not fields:
        # Call form of ``raise`` is valid on both Python 2 and Python 3.
        raise ValueError('Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    # Each pair is written under its own '<fieldi>_to_<fieldj>' key; keying
    # on the leftover variable of the single-field loop would mislabel the
    # entries and let pairs sharing fieldi overwrite each other.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Example #6
0
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    Distances are grouped once per field in ``fields`` and once per pair of
    fields (each field is also paired with itself). Both result sets are
    passed to write_distance_files.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        fields: sequence of mapping field names, or None to use the first
            entry of the mapping header row.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: base name forwarded to write_distance_files with
            '_single' or '_pairs' appended.

    Returns:
        A tuple (single_field, paired_field, distance_matrix).
        single_field maps each field name (with '#' removed) to the result
        of distances_by_groups for that field; paired_field maps
        '<fieldi>_to_<fieldj>' to the result for that pair of fields.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields is None:
        # Default to the first column named in the header row.
        fields = [mapping[0][0]]

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
Example #7
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        prefs: dict (or None). When it lacks the key
            'MONTE_CARLO_GROUP_DISTANCES' it is replaced by
            build_monte_carlo_prefs(fields, default_iters). The value under
            that key maps a field name (or several joined with '&&') to an
            iteration count.
        dir_prefix, subdir_prefix: joined to form the output directory,
            which is created when it does not exist yet.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: field names handed to build_monte_carlo_prefs; None selects
            the first entry of the mapping header row.

    Returns:
        None. One tab-separated file 'group_distances_<field>.txt' is
        written per field into the output directory.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)

    # Create the output directory on first use.
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    column_names = ['Category_1a', 'Category_1b', 'Avg',
                    'Category_2a', 'Category_2b', 'Avg', 't', 'p',
                    'p_greater', 'p_less', 'Iterations\n']

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several field names that are grouped on together.
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        outfile_path = path.join(path_prefix,
                                 'group_distances_' + field + '.txt')
        # ``with`` flushes and closes the report even if a test below fails.
        with open(outfile_path, 'w') as outfile:
            outfile.write('\t'.join(column_names))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)

                # then for each other pair (not including same group)
                for first_g2, second_g2, distances_g2 in real_dists[i + 1:]:
                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = permute_between_groups(
                        distances_g1, distances_g2, num_iters)

                    # t statistic of every permuted pair, built once and
                    # reused for both tail counts.
                    ttests = array([
                        t_two_sample(rand_dists_1[n].flatten(),
                                     rand_dists_2[n].flatten())[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())

                    # Fraction of permuted t values above / below the real t.
                    p_greater = (ttests > real_ttest[0]).sum() / \
                        float(num_iters)
                    p_less = (ttests < real_ttest[0]).sum() / \
                        float(num_iters)

                    curr_line = [first_g1, second_g1, real_dist_1,
                                 first_g2, second_g2, real_dist_2,
                                 real_ttest[0], real_ttest[1],
                                 p_greater, p_less, num_iters]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
Example #8
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        prefs: dict (or None). When it lacks the key
            'MONTE_CARLO_GROUP_DISTANCES' it is replaced by
            build_monte_carlo_prefs(fields, default_iters). The value under
            that key maps a field name (or several joined with '&&') to an
            iteration count.
        dir_prefix, subdir_prefix: joined to form the output directory,
            which is created when it does not exist yet.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: field names handed to build_monte_carlo_prefs; None selects
            the first entry of the mapping header row.

    Returns:
        None. One tab-separated file 'group_distances_<field>.txt' is
        written per field into the output directory.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_handle:
        mapping, header, comments = parse_mapping_file(mapping_handle)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_handle:
        distance_header, distance_matrix = parse_distmat(dmatrix_handle)

    path_prefix = path.join(dir_prefix, subdir_prefix)

    # Create the output directory on first use.
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    report_header = '\t'.join(['Category_1a', 'Category_1b', 'Avg',
                               'Category_2a', 'Category_2b', 'Avg', 't', 'p',
                               'p_greater', 'p_less', 'Iterations\n'])

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several field names that are grouped on together.
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        report_path = path.join(path_prefix,
                                'group_distances_' + field + '.txt')
        # ``with`` flushes and closes the report even if a test below fails.
        with open(report_path, 'w') as outfile:
            outfile.write(report_header)
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)

                # then for each other pair (not including same group)
                for first_g2, second_g2, distances_g2 in real_dists[i + 1:]:
                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = permute_between_groups(
                        distances_g1, distances_g2, num_iters)

                    # t statistic of every permuted pair, built once and
                    # reused for both tail counts.
                    ttests = array([
                        t_two_sample(rand_dists_1[n].flatten(),
                                     rand_dists_2[n].flatten())[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())

                    # Fraction of permuted t values above / below the real t.
                    p_greater = (ttests > real_ttest[0]).sum() / \
                        float(num_iters)
                    p_less = (ttests < real_ttest[0]).sum() / \
                        float(num_iters)

                    curr_line = [first_g1, second_g1, real_dist_1,
                                 first_g2, second_g2, real_dist_2,
                                 real_ttest[0], real_ttest[1],
                                 p_greater, p_less, num_iters]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
Example #9
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    Parameters:
        mapping_file: path of the mapping file, parsed by parse_mapping_file.
        dmatrix_file: path of the distance matrix file, parsed by
            parse_distmat.
        prefs: dict (or None). When it lacks the key
            'MONTE_CARLO_GROUP_DISTANCES' it is replaced by
            build_monte_carlo_prefs(fields, default_iters). The value under
            that key maps a field name (or several joined with '&&') to an
            iteration count.
        dir_prefix, subdir_prefix: combined by _make_path into the output
            location, which is created when it is not an existing directory.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: field names handed to build_monte_carlo_prefs; None selects
            the first entry of the mapping header row.

    Returns:
        None. One tab-separated file named
        path_prefix + 'group_distances_<field>.xls' is written per field.
    """
    # Parse inside ``with`` blocks so the input handles are always closed.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive the table with the header as row 0.
        header = [header]
        header.extend(mapping)
        mapping = header

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    # NOTE(review): path_prefix is concatenated directly with the file name
    # below, so _make_path presumably returns a path ending in a separator
    # -- confirm against its definition.
    path_prefix = _make_path([dir_prefix, subdir_prefix])

    # Create the output directory on first use.
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    column_names = ['Category_1a', 'Category_1b', 'Avg',
                    'Category_2a', 'Category_2b', 'Avg', 't', 'p',
                    'p_greater', 'p_less', 'Iterations\n']

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several field names that are grouped on together.
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        outfile_path = path_prefix + 'group_distances_' + field + '.xls'
        # ``with`` flushes and closes the report even if a test below fails.
        with open(outfile_path, 'w') as outfile:
            outfile.write('\t'.join(column_names))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)
            # One regrouping of a permuted matrix per iteration.
            rand_distances = [
                distances_by_groups(distance_header,
                                    permute_for_monte_carlo(distance_matrix),
                                    groups)
                for _ in range(num_iters)]

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)
                # Last element of each entry holds that group's distances.
                rand_dists_1 = [rand_distances[n][i][-1]
                                for n in range(num_iters)]

                # then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]

                    real_dist_2 = average(distances_g2)
                    rand_dists_2 = [rand_distances[n][j][-1]
                                    for n in range(num_iters)]

                    # t statistic of every permuted pair, built once and
                    # reused for both tail counts.
                    ttests = array([
                        t_two_sample(rand_dists_1[n], rand_dists_2[n])[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1, distances_g2)

                    # Fraction of permuted t values above / below the real t.
                    p_greater = (ttests > real_ttest[0]).sum() / \
                        float(num_iters)
                    p_less = (ttests < real_ttest[0]).sum() / \
                        float(num_iters)

                    curr_line = [first_g1, second_g1, real_dist_1,
                                 first_g2, second_g2, real_dist_2,
                                 real_ttest[0], real_ttest[1],
                                 p_greater, p_less, num_iters]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')