def setUp(self):
    """Build the rarefaction and mapping fixtures used by the tests."""
    # setUp runs before every test, so these fixtures are rebuilt each time;
    # with only a handful of tests the repeated work is not a concern.
    rarefaction_lines = [
        "\tsequences per sample\titeration\tSam1\tSam2\tSam3\tSam4\tSam5\tSam6",
        "rare480.txt\t480\t0\t2.52800404052\t2.3614611247\t2.59867416108\t3.56970811181\t3.44800265895\t1.9433560517",
        "rare480.txt\t480\t1\t2.06375457238\t3.32293450758\t3.4189896645\t3.35312890712\t3.10763472113\t2.78155253726",
        "rare480.txt\t480\t2\t2.44788730109\t3.42464996459\t2.24541787295\t2.491419231\t2.60106690099\t5.40828403581",
        "rare480.txt\t480\t3\t5.1846120153\t3.67022675065\t1.54879964908\t2.8055801405\t4.3086171269\t3.87761898868",
        "rare910.txt\t910\t0\t2.67580703282\t1.72405794627\t2.15312863498\t2.4300954476\t3.7753658185\t3.36198860355",
        "rare910.txt\t910\t1\t4.10226466956\t2.24587945345\t3.02932964779\t2.98218513619\t3.73316846484\t1.85879566537",
        "rare910.txt\t910\t2\t1.65800670063\t2.42281993323\t3.02400997565\t3.271608097\t2.99265263795\t3.68802382515",
        "rare910.txt\t910\t3\t2.50976021964\t2.43976761056\t3.32119905587\t2.47487750248\t1.901408525\t3.42883742207",
        "rare500.txt\t500\t0\t3.42225118215\tn/a\t4.03758268426\t2.35344629448\t2.26690085385\t1.80164570104",
        "rare850.txt\t850\t0\t4.2389858006\t4.97464230229\t1.53451087057\t3.35785261181\t1.91658777533\t2.32583475424",
        "rare850.txt\t850\t1\t2.81445883827\tn/a\t2.54767461948\t1.38835207925\t3.70018890199\t1.57359105209",
        "rare850.txt\t850\t2\t2.9340493412\t3.95897035158\tn/a\t2.07761860166\t3.42393336685\t2.6927305603",
    ]
    self.rarefaction_file = rarefaction_lines
    self.rarefaction_data = parse_rarefaction(self.rarefaction_file)

    mapping_lines = [
        "#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription",
        "#Comment Line",
        "Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc",
        "Sam2\t1xDose\tACCG\tLow\t67\ts2_desc",
        "Sam3\t2xDose\tACGT\tMed\t21\ts3_desc",
        "Sam4\t2xDose\tAACG\tLow\t55\ts4_desc",
        "Sam5\tControl\tCGTC\tLow\t67\ts5_desc",
        "Sam6\t1xDose\tACCT\tLow\t55\ts6_desc",
    ]
    self.mapping_file = mapping_lines
    # Only the first element of the parser's return value is kept.
    self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]
def generate_alpha_diversity_boxplots(rarefaction_lines,
                                      mapping_lines,
                                      category,
                                      depth=None):
    """Generate box plots of average alpha diversity per category value.

    Inputs:
     rarefaction_lines - lines of a collated alpha diversity file, handed to
      parse_rarefaction.
     mapping_lines - mapping file lines, handed to
      get_category_value_to_sample_ids.
     category - the mapping category whose values define the distributions.
     depth - rarefaction depth forwarded to
      get_per_sample_average_diversities (default None).

    Output:
     Whatever generate_box_plots returns for the distributions, one per
     category value, ordered by category value and labelled
     '<category value> (n=<number of entries>)'.
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)

    category_value_to_sample_ids = get_category_value_to_sample_ids(
        mapping_lines, category)

    # NOTE(review): category is forwarded as the second positional argument;
    # confirm this matches the helper's signature.
    per_sample_average_diversities = get_per_sample_average_diversities(
        rarefaction_data, category, depth)

    per_category_value_average_diversities = \
        collapse_sample_diversities_by_category_value(
            category_value_to_sample_ids,
            per_sample_average_diversities)

    x_tick_labels = []
    distributions = []
    # sorted() instead of items() followed by .sort(): the result is the same
    # on Python 2, and on Python 3 items() returns a view without .sort().
    # Keys are unique, so the ordering is decided by the category value alone.
    for cat, avg_diversities in sorted(
            per_category_value_average_diversities.items()):
        x_tick_labels.append("%s (n=%d)" % (cat, len(avg_diversities)))
        distributions.append(avg_diversities)

    return generate_box_plots(distributions,
                              x_tick_labels=x_tick_labels)
def compare_alpha_diversities(rarefaction_lines, mapping_lines, 
                              category, depth):
    """Run t_two_sample for every pair of values of a mapping category.

    inputs:
        rarefaction_lines - lines of the rarefaction file, which gives
        scores for various rarefactions and depths

        mapping_lines - lines of the mapping file, which has the sample
        IDs and the categories they fall in

        category - the category to be compared, is a string

        depth - the depth of the rarefaction data to use, is an integer

    outputs:
        results - a nested dictionary with the category as its single
        top-level key; the value is a dictionary keyed by
        (str(value_a), str(value_b)) holding the t_two_sample result for
        each pair of values of that category
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]

    # Pairs of category values and, in the same order, the sample IDs that
    # belong to each side of a pair.
    value_pairs = make_value_pairs_from_category(mapping_data, category)
    category_values_Ids = make_category_values_Id_dict(mapping_data, category)
    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                     category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
        rarefaction_data)
    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(
        depth, rarefaction_data)

    results = {category: {}}
    per_pair = results[category]

    for idx, id_pair in enumerate(SampleId_pairs):
        first_scores = convert_SampleIds_to_rarefaction_mtx(
            id_pair[0], reduced_rarefaction_mtx, map_from_Id_to_col)
        second_scores = convert_SampleIds_to_rarefaction_mtx(
            id_pair[1], reduced_rarefaction_mtx, map_from_Id_to_col)

        # Run the test before building the key, as the original assignment
        # evaluated its right-hand side first.
        outcome = t_two_sample(first_scores, second_scores)
        labels = value_pairs[idx]
        per_pair[(str(labels[0]), str(labels[1]))] = outcome

    return results
def mean_alpha(alpha_dict, depth):
    """mean collated alpha diversity data at a given depth

    Input:
    alpha_dict: dictionary where the values are the lines of a collated alpha
    diversity data files and the keys are the names of each of these files with
    no extension, this name is usually the metric used to compute the alpha
    diversity.
    depth: selected depth to mean the computed alpha diversity values for the
    alpha_dict data; must be a non-negative int.

    Output:
    metrics: list of metric names, formatted as '<key>_even_<depth>'
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean of alpha diversity data at a given
    depth for the different metrics, each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dictionary or depth is not a
    non-negative int.
    ValueError: if the files do not list the same sample ids in the same
    order.

    NOTE(review): nothing checks that a file actually has rows at the
    requested depth.
    """

    assert isinstance(alpha_dict, dict), "Input data must be a dictionary"
    # Check the type before comparing, so a non-numeric depth fails the
    # assertion instead of raising TypeError from the comparison on Python 3.
    assert type(depth) is int and depth >= 0, (
        "The specified depth must be a positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() behaves the same on Python 2 and exists on Python 3, where
    # iteritems() does not.
    for key, value in alpha_dict.items():
        metrics.append('{0}_even_{1}'.format(key, depth))
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # check all the files have the same sample ids in the same order;
        # the first three header fields are not sample ids
        if sample_ids:
            if sample_ids != identifiers[3:]:
                # Call the exception class: raising a (class, message) tuple
                # drops the message on Python 2 and is a TypeError on
                # Python 3.
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # find all the data at the desired depth and get the mean values, remove
        # the first two elements ([depth, iteration]) as those are not needed
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == depth]
        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
Beispiel #5
0
def _collect_alpha_diversity_boxplot_data(rarefaction_f, metadata_map,
                                          rarefaction_depth, split_category,
                                          comparison_category):
    """Group alpha diversity values at one depth by two metadata categories.

    Returns a pair (x_tick_labels, dists): labels formatted as
    '<split category value> (<comparison category value>)', sorted, and the
    list of non-NaN alpha diversity values that belongs to each label.

    Raises ValueError when no row of the rarefaction data has the requested
    depth.
    """
    parsed = parse_rarefaction(rarefaction_f)

    # The header's first three fields are not sample IDs.
    sample_ids = parsed[0][3:]

    # Keep the rows at the requested depth, dropping each row's leading
    # depth and iteration fields.
    rows_at_depth = []
    for row in parsed[3]:
        if row[0] == rarefaction_depth:
            rows_at_depth.append(row[2:])

    if not rows_at_depth:
        raise ValueError("Rarefaction depth of %d could not be found in "
                         "collated alpha diversity file." % rarefaction_depth)

    # (split category value, comparison category value) -> distribution
    grouped = defaultdict(list)
    for values in rows_at_depth:
        assert len(sample_ids) == len(values)
        for sample_id, adiv_val in zip(sample_ids, values):
            if isnan(adiv_val):
                continue
            group_key = (
                metadata_map.getCategoryValue(sample_id, split_category),
                metadata_map.getCategoryValue(sample_id, comparison_category),
            )
            grouped[group_key].append(adiv_val)

    # Turn each key into its tick label, then order the (label, dist) pairs.
    labelled = sorted([('%s (%s)' % (group_key[0], group_key[1]), dist)
                       for group_key, dist in grouped.items()])

    x_tick_labels = [label for label, _ in labelled]
    dists = [dist for _, dist in labelled]

    return x_tick_labels, dists
Beispiel #6
0
    def setUp(self):
        """Create the plotting options and rarefaction fixtures for the tests."""

        # Plot inputs for a single sample.
        self.data = {
            "xaxis": [10.0],
            "yvals": {"Sample1": [1.3276140000000001]},
            "err": {"Sample1": [0.1]},
            "map": [["SampleID", "Day"], ["Sample1", "Day1"]],
        }
        self.sample_dict = {"Sample1": {10.00: [1.3276140000000001]}}

        # Plot options.
        self.xmax = 140
        self.ymax = 20
        self.std_type = "stddev"
        self.ops = ["Sample1"]
        self.mapping_category = "SampleID"
        self.imagetype = "png"
        self.resolution = 70
        self.mapping_lookup = {"SampleID-Sample1": "col_0_row_0"}
        self.color_prefs = {
            "SampleID": {
                "column": "SampleID",
                "color": {"Sample1": "#ff0000"},
            }
        }
        self.groups = {"Sample1": ["Sample1"]}
        self.background_color = "black"
        self.label_color = "white"
        self.labelname = "SampleID"
        self.rare_data = {
            "color": {"Sample1": "#ff0000"},
            "series": {"Sample1": [2.0515300000000001]},
            "headers": ["test.txt", "SampleID"],
            "xaxis": [10.0],
            "error": {"Sample1": [0.0]},
            "options": ["Sample1"],
        }

        # Output locations and clean-up bookkeeping.
        self.fpath = "/tmp/"
        self.output_dir = "/tmp/"
        self.metric_name = "test"
        self._paths_to_clean_up = []
        self._folders_to_cleanup = []

        self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

        # Colors.
        palette = {"redtowhite3_0": "#7fff00", "redtowhite3_1": "#7fff00"}
        self.data_colors = color_dict_to_objects(palette)
        self.colors = {"Sample1": "redtowhite3_0", "Sample2": "redtowhite3_1"}
        self.colors2 = {"Sample1": "redtowhite3_0"}

        # Mapping and collated rarefaction input lines.
        self.mappingfile = [
            "#SampleID\tSex\tAge",
            "123\tF\t32",
            "234\tM\t30",
            "345\tM\t32",
        ]
        self.rarefactionfile = [
            "\tsequences per sample\titeration\t123\t234\t345",
            "rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996",
            "rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055",
            "rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725",
            "rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474",
            "rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928",
            "rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642",
        ]

        self.rares = {
            "test.txt": (
                ["", "sequences per sample", "iteration", "Sample1"],
                [],
                ["rare1.txt", "rare2.txt"],
                [[10.0, 2.0, 7.0, 7.0, 9.0], [10.0, 2.0, 7.0, 7.0, 9.0]],
            )
        }

        # Parse the rarefaction lines, then derive the matrix, depths and
        # sample IDs from the parsed data.
        parsed = parse_rarefaction(self.rarefactionfile)
        (self.col_headers, self.comments, self.rarefaction_fns,
         self.rarefaction_data) = parsed
        extracted = get_rarefaction_data(self.rarefaction_data,
                                         self.col_headers)
        self.matrix, self.seqs_per_samp, self.sampleIDs = extracted

        # Per-sample and per-group series values.
        self.ave_seqs_per_sample1 = {
            "Sample1": [2.03172, 9.4417849999999994, 12.508435],
        }
        self.ave_seqs_per_sample = {
            "123": [2.03172, 9.4417849999999994, 12.508435],
            "234": [0.42876999999999998, 0.42876999999999998,
                    0.42876999999999998],
            "345": [2.255255, 9.625995, 11.58785],
        }
        self.collapsed_ser_sex = {
            "M": [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
            "F": [2.03172, 9.4417849999999994, 12.508435],
        }
        self.err_ser_sex = {
            "M": [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
            "F": [0.0, 0.0, 0.0],
        }
        self.rarefaction_legend_mat_init = {"test": {"SampleID": {}}}
        self.col_headers2 = [
            "", "sequences per sample", "iteration", "Sample1", "Sample2",
        ]

        self.rarefaction_data_mat = {
            "SampleID": {
                "Sample1": {
                    "test": {"ave": ["     7.000"], "err": ["       nan"]},
                },
            },
        }

        self.rarefaction_legend_mat = {
            "test": {
                "samples": {
                    "Sample1": {
                        "color": "#ff0000",
                        "link": "html_plots/testcol_0_row_0.png",
                    },
                },
                "groups": {
                    "SampleID": {
                        "Sample1": {
                            "groupcolor": "#ff0000",
                            "groupsamples": ["Sample1"],
                        },
                    },
                },
            },
        }
        self.exp_err_series_ave = {
            "M": [1.571915, 6.49885, 8.1750183333333339],
        }
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use; forwarded to
      get_per_sample_average_diversities (default None).
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.
    Returns:
     (ttest_results, ad_avgs) where
      ttest_results - dict mapping each treatment pair to (obs_t, p_val);
       the value is (None, None) when both treatments have a single sample,
       when either group of averages contains NaN, or when the Monte Carlo
       test yields no p-value.
      ad_avgs - dict mapping each treatment to (mean, std) of the per-sample
       average diversities of its samples.
    Raises:
     ValueError - if test_type is not one of the two supported values, or if
      test_type is 'nonparametric' and num_permutations < 1.
    """
    # Validate up front so an unsupported test type is reported even when
    # every comparison takes the single-sample shortcut below.
    if test_type not in ('parametric', 'nonparametric'):
        raise ValueError("Invalid test type '%s'." % test_type)
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # With a single sample per treatment the Monte Carlo test would error
        # (e.g. mc_t_two_sample([1],[1])), so no test is run for such pairs.
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # There is a single sample per treatment, so its diversity value
            # is the average and the std is 0. Look the value up in
            # ps_avg_div: sid_pair holds sample IDs, not diversity values.
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
            continue

        i = array([ps_avg_div[x] for x in sid_pair[0]])
        j = array([ps_avg_div[x] for x in sid_pair[1]])
        # add alpha diversity averages and standard deviations.
        ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
        ad_avgs[treatment_pair[1]] = (j.mean(), j.std())

        # No test is run when either group contains NaN.
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        else:  # 'nonparametric', guaranteed by the validation above
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                 permutations=num_permutations)
            if p_val is None:
                # None would error in format_p_value_for_num_iters; report
                # the whole comparison as (None, None) instead.
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
        ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
    def setUp(self):
        """Set up the plot options, colors and rarefaction data for the tests."""

        # Single-sample plot data.
        sample_values = [1.3276140000000001]
        self.data = {
            'xaxis': [10.0],
            'yvals': {'Sample1': [1.3276140000000001]},
            'err': {'Sample1': [0.1]},
            'map': [['SampleID', 'Day'], ['Sample1', 'Day1']],
        }
        self.sample_dict = {'Sample1': {10.00: sample_values}}

        # Rendering options.
        self.xmax = 140
        self.ymax = 20
        self.std_type = 'stddev'
        self.ops = ['Sample1']
        self.mapping_category = 'SampleID'
        self.imagetype = 'png'
        self.resolution = 70
        self.mapping_lookup = {'SampleID-Sample1': 'col_0_row_0'}
        self.color_prefs = {
            'SampleID': {
                'column': 'SampleID',
                'color': {'Sample1': '#ff0000'},
            },
        }
        self.groups = {'Sample1': ['Sample1']}
        self.background_color = 'black'
        self.label_color = 'white'
        self.labelname = 'SampleID'
        self.rare_data = {
            'color': {'Sample1': '#ff0000'},
            'series': {'Sample1': [2.0515300000000001]},
            'headers': ['test.txt', 'SampleID'],
            'xaxis': [10.0],
            'error': {'Sample1': [0.0]},
            'options': ['Sample1'],
        }

        # Output paths and clean-up lists.
        self.fpath = '/tmp/'
        self.output_dir = '/tmp/'
        self.metric_name = 'test'
        self._paths_to_clean_up = []
        self._folders_to_cleanup = []

        self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

        # Color names and their objects.
        hex_by_name = {'redtowhite3_0': '#7fff00', 'redtowhite3_1': '#7fff00'}
        self.data_colors = color_dict_to_objects(hex_by_name)
        self.colors = {'Sample1': 'redtowhite3_0', 'Sample2': 'redtowhite3_1'}
        self.colors2 = {'Sample1': 'redtowhite3_0'}

        # Raw mapping and rarefaction lines.
        self.mappingfile = [
            '#SampleID\tSex\tAge',
            '123\tF\t32',
            '234\tM\t30',
            '345\tM\t32',
        ]
        self.rarefactionfile = [
            '\tsequences per sample\titeration\t123\t234\t345',
            'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
            'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
            'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
            'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
            'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
            'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
        ]

        iteration_row = [10.0, 2.0, 7.0, 7.0, 9.0]
        self.rares = {
            'test.txt': (
                ['', 'sequences per sample', 'iteration', 'Sample1'],
                [],
                ['rare1.txt', 'rare2.txt'],
                [list(iteration_row), list(iteration_row)],
            ),
        }

        # Parsed rarefaction data and the values derived from it.
        parsed = parse_rarefaction(self.rarefactionfile)
        self.col_headers = parsed[0]
        self.comments = parsed[1]
        self.rarefaction_fns = parsed[2]
        self.rarefaction_data = parsed[3]
        self.matrix, self.seqs_per_samp, self.sampleIDs = \
            get_rarefaction_data(self.rarefaction_data, self.col_headers)

        # Per-sample and per-group series values.
        self.ave_seqs_per_sample1 = {
            'Sample1': [2.03172, 9.4417849999999994, 12.508435],
        }
        self.ave_seqs_per_sample = {
            '123': [2.03172, 9.4417849999999994, 12.508435],
            '234': [
                0.42876999999999998,
                0.42876999999999998,
                0.42876999999999998,
            ],
            '345': [2.255255, 9.625995, 11.58785],
        }
        self.collapsed_ser_sex = {
            'M': [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
            'F': [2.03172, 9.4417849999999994, 12.508435],
        }
        self.err_ser_sex = {
            'M': [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
            'F': [0.0, 0.0, 0.0],
        }
        self.rarefaction_legend_mat_init = {'test': {'SampleID': {}}}
        self.col_headers2 = [
            '', 'sequences per sample', 'iteration', 'Sample1', 'Sample2',
        ]

        self.rarefaction_data_mat = {
            'SampleID': {
                'Sample1': {
                    'test': {
                        'ave': ['     7.000'],
                        'err': ['       nan'],
                    },
                },
            },
        }

        self.rarefaction_legend_mat = {
            'test': {
                'samples': {
                    'Sample1': {
                        'color': '#ff0000',
                        'link': 'html_plots/testcol_0_row_0.png',
                    },
                },
                'groups': {
                    'SampleID': {
                        'Sample1': {
                            'groupcolor': '#ff0000',
                            'groupsamples': ['Sample1'],
                        },
                    },
                },
            },
        }
        self.exp_err_series_ave = {
            'M': [1.571915, 6.49885, 8.1750183333333339],
        }
    def setUp(self):
        """Build the input lines and expected values for the comparison tests."""
        # Collated rarefaction lines for four samples.
        self.rarefaction_file = [
            "\tsequences per sample\titeration\t123\t234\t345\t456",
            "rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996\t0.002322",
            "rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055\t0.01219",
            "rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725\t0.18233",
            "rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474\t0.99229",
            "rare810.txt\t810\t0\t12.03067\tn/a\t11.58928\t0.8993",
            "rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642\t1.22563",
        ]
        self.rarefaction_data = parse_rarefaction(self.rarefaction_file)

        # Mapping file lines for the same four samples.
        self.mapping_file = [
            "#SampleID\tTreatment\tLinkerPrimerSequence\tDose\tTTD\tDescription",
            "#Comment Line",
            "123\tAAAA\tBBBB\tHigh\t31\tM_ID_123",
            "234\tCCCC\tDDDD\tLow\t67\tM_ID_234",
            "345\tAAAA\tFFFF\tMed\t21\tM_ID_345",
            "456\tAAAA\tGGGG\tLow\t67\tM_ID_456",
        ]
        # Only the first element of the parser's return value is kept.
        self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]

        # Pairs of category values, one list per category.
        self.value_pairs_Dose = [
            ("Low", "Med"), ("Low", "High"), ("Med", "High"),
        ]
        self.value_pairs_TTD = [("67", "21"), ("67", "31"), ("21", "31")]
        self.value_pairs_Treatment = [("CCCC", "AAAA")]

        # Category value -> sample IDs.
        self.cat_val_Dose = {
            "High": ["123"],
            "Low": ["234", "456"],
            "Med": ["345"],
        }
        self.cat_val_TTD = {
            "21": ["345"],
            "31": ["123"],
            "67": ["234", "456"],
        }
        self.cat_val_Treatment = {
            "AAAA": ["345", "123", "456"],
            "CCCC": ["234"],
        }

        # Sample ID pairs, one list per category.
        self.Id_pairs_Dose = [
            (["234", "456"], ["345"]),
            (["234", "456"], ["123"]),
            (["345"], ["123"]),
        ]
        self.Id_pairs_TTD = [
            (["234", "456"], ["345"]),
            (["234", "456"], ["123"]),
            (["345"], ["123"]),
        ]
        self.Id_pairs_Treatment = [(["234"], ["345", "123", "456"])]

        # Sample ID -> column index.
        self.rarefaction_cols_dict = {"123": 0, "234": 1, "345": 2, "456": 3}

        # Score rows for depths 10, 310 and 910.
        self.extracted_mtx_10 = array([
            [1.99181, 5.42877, 2.13996, 0.002322],
            [2.07163, 1.42877, 2.37055, 0.01219],
        ])
        self.extracted_mtx_310 = array([
            [8.83115, 6.42877, 11.00725, 0.18233],
            [10.05242, 9.42877, 8.24474, 0.99229],
        ])
        self.extracted_mtx_910 = array([
            [12.9862, 2.42877, 11.58642, 1.22563],
        ])

        # One sample-ID pair and the score columns for each of its sides.
        self.sample_pair1 = (["234"], ["345", "123"])
        self.rarefaction_mtx_for_sample_pair1_0 = array([
            [5.42877],
            [1.42877],
        ])
        self.rarefaction_mtx_for_sample_pair1_1 = array([
            [2.13996, 1.99181],
            [2.37055, 2.07163],
        ])

        # Comparison results for the TTD category.
        ttd_results = {
            ("67", "21"): (-0.27929839680103463, 0.79386220041241184),
            ("21", "31"): (1.8321466933860993, 0.20839398129924847),
            ("67", "31"): (-0.16318504125427058, 0.87828549279958279),
        }
        self.compared_alpha_diversities_TTD = {"TTD": ttd_results}
def mean_alpha(alpha_dict, depth):
    """mean collated alpha diversity data at a given depth

    Input:
    alpha_dict: dictionary where the values are the lines of a collated alpha
    diversity data files and the keys are the names of each of these files with
    no extension, this name is usually the metric used to compute the alpha
    diversity.
    depth: selected depth to mean the computed alpha diversity values for the
    alpha_dict data. If None is passed, the highest depth will be used
    (determined separately for each file).

    Output:
    metrics: list of metric names, formatted as '<key>_even_<depth>'
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean of alpha diversity data at a given
    depth for the different metrics, each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dictionary or depth is neither
    None nor a non-negative int.
    ValueError: if a file has no rows at the selected depth, or if the files
    do not list the same sample ids in the same order.
    """

    assert isinstance(alpha_dict, dict), "Input data must be a dictionary"
    # Check the type before comparing, so a non-numeric depth fails the
    # assertion instead of raising TypeError from the comparison on Python 3.
    assert depth is None or (type(depth) is int and depth >= 0), (
        "The specified depth must be a positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() behaves the same on Python 2 and exists on Python 3, where
    # iteritems() does not.
    for key, value in alpha_dict.items():
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # if depth is specified as None use the highest available, retrieve it
        # on a per file basis so you make sure the value exists for all files
        if depth is None:
            _depth = int(max(row[0] for row in rarefaction_data))
        else:
            _depth = depth
        metrics.append('{0}_even_{1}'.format(key, _depth))

        # all the data at the desired depth, without the first two elements
        # ([depth, iteration]) as those are not needed
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == _depth]

        # check there are elements with the desired rarefaction depth
        if not rows_at_depth:
            # sorted, comma-separated list of the available rarefaction depths
            available_rarefaction_depths = ', '.join(
                str(d) for d in sorted(set(row[0] for row in rarefaction_data)))
            # Exception-call form: valid on both Python 2 and Python 3,
            # unlike the comma form of the raise statement.
            raise ValueError(
                "The depth %d does not exist in the collated "
                "alpha diversity file for the metric: %s. The available depths "
                "are: %s." % (_depth, key, available_rarefaction_depths))

        # check all the files have the same sample ids in the same order;
        # the first three header fields are not sample ids
        if sample_ids:
            if sample_ids != identifiers[3:]:
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # mean of the values at the desired depth, one per sample
        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
Beispiel #11
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
    depth=None, test_type='nonparametric', num_permutations=999):
    """Compare alpha diversity between each pair of treatments in a category.

    Notes:
     The rarefaction rows at the selected depth are averaged over their
     iterations, giving one value per sample. Each pair of treatments is then
     compared with a two-sample t-test on those per-sample averages.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
     the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    Returns:
     dict whose keys are 'treatment1,treatment2' strings and whose values are
     (t statistic, p value) tuples. The value is (None, None) when both
     treatments have a single sample, when np_min of either group is nan, or
     when the nonparametric test returns no p value.
    Raises:
     ValueError if num_permutations is below 1 for the nonparametric test, if
     no rarefaction row exists at the requested depth, or if an unknown
     test_type reaches the point where a test has to be run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
        rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:,0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])
    if rare_mat.size == 0:
        # without this guard the averaging below fails with an opaque
        # IndexError on a scalar
        raise ValueError("The depth %s does not exist in the rarefaction "
                         "data." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
        # if there is only 1 sample for each treatment in a comparison the mc
        # method will error (e.g. mc_t_two_sample([1],[1])), so skip the test
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            results[t_key] = (None, None)
            continue

        i = rare_mat.take([sids.index(sid) for sid in sid_pair[0]])
        j = rare_mat.take([sids.index(sid) for sid in sid_pair[1]])
        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                permutations=num_permutations)
            if p_val is None:
                # None would error in format_p_value_for_num_iters; report the
                # whole comparison as unavailable
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(p_val,
                    num_iters=num_permutations))
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)
    return results
 def setUp(self):
     """Create the four-sample rarefaction and mapping fixtures."""
     # collated alpha diversity lines: a header line followed by one line
     # per rarefaction file
     self.rarefaction_file = [
         '\tsequences per sample\titeration\t123\t234\t345\t456',
         'rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996\t0.002322',
         'rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055\t0.01219',
         'rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725\t0.18233',
         'rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474\t0.99229',
         'rare810.txt\t810\t0\t12.03067\tn/a\t11.58928\t0.8993',
         'rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642\t1.22563',
     ]
     self.rarefaction_data = parse_rarefaction(self.rarefaction_file)

     # mapping file lines for the same four samples
     self.mapping_file = [
         '#SampleID\tTreatment\tLinker'
         'PrimerSequence\tDose\tTTD\tDescription',
         '#Comment Line',
         '123\tAAAA\tBBBB\tHigh\t31\tM_ID_123',
         '234\tCCCC\tDDDD\tLow\t67\tM_ID_234',
         '345\tAAAA\tFFFF\tMed\t21\tM_ID_345',
         '456\tAAAA\tGGGG\tLow\t67\tM_ID_456',
     ]
     self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]

     # expected pairs of category values, one attribute per mapping column
     self.value_pairs_Dose = [
         ('Low', 'Med'), ('Low', 'High'), ('Med', 'High')]
     self.value_pairs_TTD = [('67', '21'), ('67', '31'), ('21', '31')]
     self.value_pairs_Treatment = [('CCCC', 'AAAA')]

     # expected sample ids for each category value
     self.cat_val_Dose = {
         'High': ['123'], 'Low': ['234', '456'], 'Med': ['345']}
     self.cat_val_TTD = {
         '21': ['345'], '31': ['123'], '67': ['234', '456']}
     self.cat_val_Treatment = {
         'AAAA': ['345', '123', '456'], 'CCCC': ['234']}

     # expected sample id pairs for each category
     self.Id_pairs_Dose = [
         (['234', '456'], ['345']),
         (['234', '456'], ['123']),
         (['345'], ['123']),
     ]
     self.Id_pairs_TTD = [
         (['234', '456'], ['345']),
         (['234', '456'], ['123']),
         (['345'], ['123']),
     ]
     self.Id_pairs_Treatment = [(['234'], ['345', '123', '456'])]

     # sample id -> column position
     self.rarefaction_cols_dict = {'123': 0, '234': 1, '345': 2, '456': 3}

     # rarefaction values at depths 10, 310 and 910
     self.extracted_mtx_10 = array([
         [1.99181, 5.42877, 2.13996, 0.002322],
         [2.07163, 1.42877, 2.37055, 0.01219]])
     self.extracted_mtx_310 = array([
         [8.83115, 6.42877, 11.00725, 0.18233],
         [10.05242, 9.42877, 8.24474, 0.99229]])
     self.extracted_mtx_910 = array([
         [12.9862, 2.42877, 11.58642, 1.22563]])

     # a single sample pair and the matching value columns
     self.sample_pair1 = (['234'], ['345', '123'])
     self.rarefaction_mtx_for_sample_pair1_0 = array([
         [5.42877],
         [1.42877]])
     self.rarefaction_mtx_for_sample_pair1_1 = array([
         [2.13996, 1.99181],
         [2.37055, 2.07163]])

     self.compared_alpha_diversities_TTD = {
         'TTD': {
             ('67', '21'): (-0.27929839680103463, 0.79386220041241184),
             ('21', '31'): (1.8321466933860993, 0.20839398129924847),
             ('67', '31'): (-0.16318504125427058, 0.87828549279958279),
         }
     }
 def setUp(self):
     """Create the three-sample rarefaction and mapping fixtures."""
     # collated alpha diversity lines: a header line followed by one line
     # per rarefaction file
     self.rarefaction_file = [
         '\tsequences per sample\titeration\t123\t234\t345',
         'rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996',
         'rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055',
         'rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725',
         'rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474',
         'rare810.txt\t810\t0\t12.03067\tn/a\t11.58928',
         'rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642',
     ]
     self.rarefaction_data = parse_rarefaction(self.rarefaction_file)

     # mapping file lines for the same three samples
     self.mapping_file = [
         '#SampleID\tTreatment\tLinker'
         'PrimerSequence\tDose\tTTD\tDescription',
         '#Comment Line',
         '123\tAAAA\tBBBB\tHigh\t31\tM_ID_123',
         '234\tCCCC\tDDDD\tLow\t67\tM_ID_234',
         '345\tAAAA\tFFFF\tMed\t21\tM_ID_345',
     ]
     self.mapping_data = parse_mapping_file_to_dict(self.mapping_file)[0]

     # expected pairs of category values, one attribute per mapping column
     self.value_pairs_Dose = [
         ('Low', 'Med'), ('Low', 'High'), ('Med', 'High')]
     self.value_pairs_TTD = [('67', '21'), ('67', '31'), ('21', '31')]
     self.value_pairs_Treatment = [('CCCC', 'AAAA')]

     # expected sample ids for each category value
     self.cat_val_Dose = {'High': ['123'], 'Low': ['234'], 'Med': ['345']}
     self.cat_val_TTD = {'21': ['345'], '31': ['123'], '67': ['234']}
     self.cat_val_Treatment = {'AAAA': ['345', '123'], 'CCCC': ['234']}

     # expected sample id pairs for each category
     self.Id_pairs_Dose = [
         (['234'], ['345']), (['234'], ['123']), (['345'], ['123'])]
     self.Id_pairs_TTD = [
         (['234'], ['345']), (['234'], ['123']), (['345'], ['123'])]
     self.Id_pairs_Treatment = [(['234'], ['345', '123'])]

     # sample id -> column position
     self.rarefaction_cols_dict = {'123': 0, '234': 1, '345': 2}

     # rarefaction values at depths 10, 310 and 910
     self.extracted_mtx_10 = array([
         [1.99181, 5.42877, 2.13996],
         [2.07163, 1.42877, 2.37055]])
     self.extracted_mtx_310 = array([
         [8.83115, 6.42877, 11.00725],
         [10.05242, 9.42877, 8.24474]])
     self.extracted_mtx_910 = array([
         [12.9862, 2.42877, 11.58642]])

     # a single sample pair and the matching value columns
     self.sample_pair1 = (['234'], ['345', '123'])
     self.rarefaction_mtx_for_sample_pair1_0 = array([
         [5.42877],
         [1.42877]])
     self.rarefaction_mtx_for_sample_pair1_1 = array([
         [2.13996, 1.99181],
         [2.37055, 2.07163]])

     self.compared_alpha_diversities_TTD = {
         'TTD': {
             ('21', '31'): (1.8321466933860993, 0.20839398129924847),
             ('67', '21'): (0.58578495700890432, 0.61731739324369639),
             ('67', '31'): (0.69838596448703294, 0.55721515283248324),
         }
     }
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
    depth=None, test_type='nonparametric', num_permutations=999):
    """Compare alpha diversity between each pair of treatments in a category.

    Notes:
     The rarefaction rows at the selected depth are averaged over their
     iterations, giving one value per sample. Each pair of treatments is then
     compared with a two-sample t-test on those per-sample averages, and the
     mean and standard deviation of each treatment's values are reported.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
     the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    Returns:
     ttest_results - dict keyed by the treatment pair (as produced by
     sampleId_pairs) with (t statistic, p value) tuples as values. The value
     is (None, None) when both treatments have a single sample, when np_min
     of either group is nan, or when the nonparametric test returns no
     p value.
     alphadiv_avgs - dict keyed by treatment with (mean, std) tuples of the
     per-sample averages belonging to that treatment.
    Raises:
     ValueError if num_permutations is below 1 for the nonparametric test, if
     no rarefaction row exists at the requested depth, or if an unknown
     test_type reaches the point where a test has to be run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
        rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:,0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])
    if rare_mat.size == 0:
        # without this guard the averaging below fails with an opaque
        # IndexError on a scalar
        raise ValueError("The depth %s does not exist in the rarefaction "
                         "data." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison the mc
        # method will error (e.g. mc_t_two_sample([1],[1])), so skip the test
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            ttest_results[treatment_pair] = (None, None)
            continue

        i = rare_mat.take([sids.index(sid) for sid in sid_pair[0]])
        j = rare_mat.take([sids.index(sid) for sid in sid_pair[1]])
        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                permutations=num_permutations)
            if p_val is None:
                # None would error in format_p_value_for_num_iters; report the
                # whole comparison as unavailable
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(p_val,
                    num_iters=num_permutations))
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        ttest_results[treatment_pair] = (obs_t, p_val)

    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # looking only at the first member of each pair does not guarantee
        # every treatment is covered, so both members are visited
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # each treatment appears in several pairs; compute it only once
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(sid) for sid in sid_list])
                alphadiv_avgs[treatment_str] = (alphadiv_vals.mean(),
                                                alphadiv_vals.std())
    return ttest_results, alphadiv_avgs
Beispiel #15
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category, depth,
    test_type='nonparametric', num_permutations=999):
    """Compare alpha diversity between each pair of treatments in a category.

    Notes:
     The rarefaction rows at the given depth are averaged over their
     iterations, giving one value per sample. Each pair of treatments is then
     compared with a two-sample t-test on those per-sample averages.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    Returns:
     dict whose keys are 'treatment1,treatment2' strings and whose values are
     (t statistic, p value) tuples. The value is (None, None) when both
     treatments have a single sample, when np_min of either group is nan, or
     when the nonparametric test returns no p value.
    Raises:
     ValueError if num_permutations is below 1 for the nonparametric test, if
     no rarefaction row exists at the requested depth, or if an unknown
     test_type reaches the point where a test has to be run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
        rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])
    if rare_mat.size == 0:
        # without this guard the averaging below fails with an opaque
        # IndexError on a scalar
        raise ValueError("The depth %s does not exist in the rarefaction "
                         "data." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages over
    # all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
        # if there is only 1 sample for each treatment in a comparison the mc
        # method will error (e.g. mc_t_two_sample([1],[1])), so skip the test
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            results[t_key] = (None, None)
            continue

        i = rare_mat.take([sids.index(sid) for sid in sid_pair[0]])
        j = rare_mat.take([sids.index(sid) for sid in sid_pair[1]])
        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                permutations=num_permutations)
            if p_val is None:
                # None would error in format_p_value_for_num_iters; report the
                # whole comparison as unavailable
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(p_val,
                    num_iters=num_permutations))
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)
    return results
Beispiel #16
0
    def test_parse_rarefaction(self):
        # collated rarefaction lines: header row plus two iterations at each
        # of three depths
        self.rarefactionfile = [
            '\tsequences per sample\titeration\t123\t234\t345',
            'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
            'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
            'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
            'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
            'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
            'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
        ]

        # expected pieces of the parsed result
        self.col_headers = [
            '', 'sequences per sample', 'iteration', '123', '234', '345']
        self.comments = []
        self.rarefaction_fns = [
            'rare10.txt', 'rare10.txt',
            'rare310.txt', 'rare310.txt',
            'rare610.txt', 'rare610.txt',
        ]
        self.rarefaction_data = [
            [10.0, 0.0, 1.9918100000000001, 0.42876999999999998,
             2.1399599999999999],
            [10.0, 1.0, 2.0716299999999999, 0.42876999999999998,
             2.3705500000000002],
            [310.0, 0.0, 8.8311499999999992, 0.42876999999999998,
             11.007250000000001],
            [310.0, 1.0, 10.05242, 0.42876999999999998, 8.2447400000000002],
            [610.0, 0.0, 12.030670000000001, 0.42876999999999998, 11.58928],
            [610.0, 1.0, 12.9862, 0.42876999999999998, 11.58642],
        ]

        parsed = parse_rarefaction(self.rarefactionfile)
        (test_col_headers, test_comments, test_rarefaction_fns,
         test_rarefaction_data) = parsed

        # compare each parsed component with its expectation, in order
        checks = [
            (test_col_headers, self.col_headers),
            (test_comments, self.comments),
            (test_rarefaction_fns, self.rarefaction_fns),
            (test_rarefaction_data, self.rarefaction_data),
        ]
        for observed, expected in checks:
            self.assertEqual(observed, expected)
Beispiel #17
0
def main():
    """Load collated rarefaction files and generate the rarefaction plots.

    Reads the options returned by parse_command_line_parameters, parses every
    rarefaction file found in the input directory (or every file of a
    comma-separated list of paths), validates the image type and resolution,
    resolves the output directory and hands everything to make_averages. When
    make_averages returns HTML text it is written to rarefaction_plots.html
    inside the output directory.

    Problems (unreadable rarefaction file, unsupported image type, invalid
    resolution, output directory that cannot be created) are reported through
    option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    imagetype = opts.imagetype
    resolution = opts.resolution
    output_dir = opts.output_dir
    ymax = opts.ymax
    std_type = opts.std_type
    suppress_webpage = opts.suppress_html_output
    output_type = opts.output_type
    generate_per_sample_plots = opts.generate_per_sample_plots
    generate_average_tables = opts.generate_average_tables

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = \
        sample_color_prefs_and_map_data_from_options(opts)

    # parsed rarefaction data keyed by file name
    rares = {}
    if isdir(input_dir):
        # every non-hidden file in the directory is a rarefaction file
        rarenames = [r for r in listdir(input_dir) if not r.startswith('.')]
        for r in rarenames:
            try:
                # the with block closes the handle even if reading fails
                with open(path.join(input_dir, r), 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[r] = parse_rarefaction(rarefl)
            except (IOError):
                option_parser.error('Problem with rarefaction file. %s' %
                                    exc_info()[1])
                exit(0)
    else:
        # not a directory: treat the value as a comma-separated list of files
        try:
            for input_fp in input_dir.split(','):
                input_path = split(input_fp)[-1]
                with open(input_fp, 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[input_path] = parse_rarefaction(rarefl)
        except (IOError):
            option_parser.error('Problem with rarefaction file. %s' %
                                exc_info()[1])
            exit(0)
    if imagetype not in ['png', 'svg', 'pdf']:
        option_parser.error('Supplied extension not supported.')
        exit(0)

    try:
        resolution = int(resolution)
    except (ValueError):
        option_parser.error('Invalid resolution.')
        exit(0)

    # output directory check
    if isinstance(output_dir, str) and output_dir != '.':
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except (ValueError):
                option_parser.error('Could not create output directory.')
                exit(0)
    else:
        # NOTE(review): if mkdtemp is tempfile.mkdtemp, its first positional
        # argument is the suffix, not the parent directory - confirm whether
        # dir='./' was intended here.
        output_dir = mkdtemp('./')

    # Generate the plots and html text
    html_output = make_averages(
        prefs,
        data,
        background_color,
        label_color,
        rares,
        output_dir,
        resolution,
        imagetype,
        ymax,
        suppress_webpage,
        std_type,
        output_type,
        generate_per_sample_plots=generate_per_sample_plots,
        generate_average_tables=generate_average_tables)

    if html_output:
        # Write the html file.
        with open(path.join(output_dir, 'rarefaction_plots.html'),
                  'w') as outfile:
            outfile.write(html_output)
Beispiel #18
0
 def test_parse_rarefaction(self):
     # collated rarefaction lines: header row plus two iterations at each
     # of three depths
     self.rarefactionfile = [
         '\tsequences per sample\titeration\t123\t234\t345',
         'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
         'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
         'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
         'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
         'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
         'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
     ]

     # expected pieces of the parsed result
     self.col_headers = [
         '', 'sequences per sample', 'iteration', '123', '234', '345']
     self.comments = []
     self.rarefaction_fns = [
         'rare10.txt', 'rare10.txt',
         'rare310.txt', 'rare310.txt',
         'rare610.txt', 'rare610.txt',
     ]
     self.rarefaction_data = [
         [10.0, 0.0, 1.9918100000000001, 0.42876999999999998,
          2.1399599999999999],
         [10.0, 1.0, 2.0716299999999999, 0.42876999999999998,
          2.3705500000000002],
         [310.0, 0.0, 8.8311499999999992, 0.42876999999999998,
          11.007250000000001],
         [310.0, 1.0, 10.05242, 0.42876999999999998, 8.2447400000000002],
         [610.0, 0.0, 12.030670000000001, 0.42876999999999998, 11.58928],
         [610.0, 1.0, 12.9862, 0.42876999999999998, 11.58642],
     ]

     parsed = parse_rarefaction(self.rarefactionfile)
     (test_col_headers, test_comments, test_rarefaction_fns,
      test_rarefaction_data) = parsed

     # compare each parsed component with its expectation, in order
     checks = [
         (test_col_headers, self.col_headers),
         (test_comments, self.comments),
         (test_rarefaction_fns, self.rarefaction_fns),
         (test_rarefaction_data, self.rarefaction_data),
     ]
     for observed, expected in checks:
         self.assertEqual(observed, expected)
def main():
    """Load collated rarefaction files and generate the rarefaction plots.

    Reads the options returned by parse_command_line_parameters, parses every
    rarefaction file found in the input directory (or every file of a
    comma-separated list of paths), validates the image type and resolution,
    resolves the output directory and hands everything to make_averages. When
    make_averages returns HTML text it is written to rarefaction_plots.html
    inside the output directory.

    Problems (unreadable rarefaction file, unsupported image type, invalid
    resolution, output directory that cannot be created) are reported through
    option_parser.error.
    """
    option_parser, options, args = parse_command_line_parameters(**script_info)

    input_dir = options.input_dir

    # parsed rarefaction data keyed by file name
    rares = {}
    if isdir(input_dir):
        # every non-hidden file in the directory is a rarefaction file
        rarenames = [r for r in listdir(input_dir) if not r.startswith(".")]
        for r in rarenames:
            try:
                # the with block closes the handle even if reading fails
                with open(path.join(input_dir, r), "U") as rare_f:
                    rarefl = rare_f.readlines()
                rares[r] = parse_rarefaction(rarefl)
            except (IOError):
                option_parser.error("Problem with rarefaction file. %s" % exc_info()[1])
                exit(0)
    else:
        # not a directory: treat the value as a comma-separated list of files
        try:
            for input_fp in input_dir.split(","):
                input_path = split(input_fp)[-1]
                with open(input_fp, "U") as rare_f:
                    rarefl = rare_f.readlines()
                rares[input_path] = parse_rarefaction(rarefl)
        except (IOError):
            option_parser.error("Problem with rarefaction file. %s" % exc_info()[1])
            exit(0)

    imagetype = options.imagetype
    if imagetype not in ["png", "svg", "pdf"]:
        option_parser.error("Supplied extension not supported.")
        exit(0)

    try:
        resolution = int(options.resolution)
    except (ValueError):
        option_parser.error("Invalid resolution.")
        exit(0)

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = sample_color_prefs_and_map_data_from_options(
        options
    )

    # output directory check
    if isinstance(options.output_dir, str) and options.output_dir != ".":
        output_dir = options.output_dir
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except (ValueError):
                option_parser.error("Could not create output directory.")
                exit(0)
    else:
        # no usable directory given: fall back to a generated directory name
        output_dir = get_random_directory_name()

    # Generate the plots and html text
    ymax = options.ymax
    suppress_webpage = options.suppress_html_output
    html_output = make_averages(
        prefs, data, background_color, label_color, rares, output_dir, resolution, imagetype, ymax, suppress_webpage
    )

    if html_output:
        # Write the html file.
        with open(path.join(output_dir, "rarefaction_plots.html"), "w") as outfile:
            outfile.write(html_output)
    def setUp(self):
        """Create the plotting and rarefaction fixtures used by the tests."""

        # single-sample plotting data
        self.data = {
            'xaxis': [10.0],
            'yvals': {'Sample1': [1.3276140000000001]},
            'err': {'Sample1': [0.1]},
            'map': [['SampleID', 'Day'], ['Sample1', 'Day1']],
        }
        self.sample_dict = {'Sample1': {10.0: [1.3276140000000001]}}

        # plot settings
        self.xmax = 140
        self.ymax = 20
        self.std_type = 'stddev'
        self.ops = ['Sample1']
        self.mapping_category = 'SampleID'
        self.imagetype = 'png'
        self.resolution = 70
        self.mapping_lookup = {'SampleID-Sample1': 'col_0_row_0'}
        self.color_prefs = {
            'SampleID': {
                'column': 'SampleID',
                'color': {'Sample1': '#ff0000'},
            }
        }
        self.groups = {'Sample1': ['Sample1']}
        self.background_color = 'black'
        self.label_color = 'white'
        self.labelname = 'SampleID'
        self.rare_data = {
            'color': {'Sample1': '#ff0000'},
            'series': {'Sample1': [2.0515300000000001]},
            'headers': ['test.txt', 'SampleID'],
            'xaxis': [10.0],
            'error': {'Sample1': [0.0]},
            'options': ['Sample1'],
        }

        # output locations and cleanup bookkeeping
        self.fpath = '/tmp/'
        self.output_dir = '/tmp/'
        self.metric_name = 'test'
        self._paths_to_clean_up = []
        self._folders_to_cleanup = []

        self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

        # colour fixtures
        color_names = {'redtowhite3_0': '#7fff00', 'redtowhite3_1': '#7fff00'}
        self.data_colors = color_dict_to_objects(color_names)
        self.colors = {'Sample1': 'redtowhite3_0', 'Sample2': 'redtowhite3_1'}
        self.colors2 = {'Sample1': 'redtowhite3_0'}

        # mapping file and collated rarefaction lines for samples 123/234/345
        self.mappingfile = [
            '#SampleID\tSex\tAge',
            '123\tF\t32',
            '234\tM\t30',
            '345\tM\t32',
        ]
        self.rarefactionfile = [
            '\tsequences per sample\titeration\t123\t234\t345',
            'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
            'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
            'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
            'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
            'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
            'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
        ]

        # hand-built value in the same four-part layout that
        # parse_rarefaction is unpacked into below
        self.rares = {
            'test.txt': (
                ['', 'sequences per sample', 'iteration', 'Sample1'],
                [],
                ['rare1.txt', 'rare2.txt'],
                [[10.0, 2.0, 7.0, 7.0, 9.0], [10.0, 2.0, 7.0, 7.0, 9.0]],
            )
        }

        parsed = parse_rarefaction(self.rarefactionfile)
        (self.col_headers, self.comments, self.rarefaction_fns,
         self.rarefaction_data) = parsed
        rare_parts = get_rarefaction_data(self.rarefaction_data,
                                          self.col_headers)
        self.matrix, self.seqs_per_samp, self.sampleIDs = rare_parts

        # expected averages, collapsed series and error series
        self.ave_seqs_per_sample1 = {
            'Sample1': [2.03172, 9.4417849999999994, 12.508435]}
        self.ave_seqs_per_sample = {
            '123': [2.03172, 9.4417849999999994, 12.508435],
            '234': [0.42876999999999998, 0.42876999999999998,
                    0.42876999999999998],
            '345': [2.255255, 9.625995, 11.58785],
        }
        self.collapsed_ser_sex = {
            'M': [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
            'F': [2.03172, 9.4417849999999994, 12.508435],
        }
        self.err_ser_sex = {
            'M': [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
            'F': [0.0, 0.0, 0.0],
        }

        # expected legend/data structures
        self.rarefaction_legend_mat_init = {'test': {'SampleID': {}}}
        self.col_headers2 = [
            '', 'sequences per sample', 'iteration', 'Sample1', 'Sample2']
        self.rarefaction_data_mat = {
            'SampleID': {
                'Sample1': {
                    'test': {'ave': ['     7.000'], 'err': ['       nan']},
                },
            },
        }
        self.rarefaction_legend_mat = {
            'test': {
                'samples': {
                    'Sample1': {
                        'color': '#ff0000',
                        'link': 'html_plots/testcol_0_row_0.png',
                    },
                },
                'groups': {
                    'SampleID': {
                        'Sample1': {
                            'groupcolor': '#ff0000',
                            'groupsamples': ['Sample1'],
                        },
                    },
                },
            },
        }
        self.exp_err_series_ave = {
            'M': [1.571915, 6.49885, 8.1750183333333339]}
Beispiel #21
0
def mean_alpha(alpha_dict, depth):
    """Average collated alpha diversity data at a given rarefaction depth.

    Input:
    alpha_dict: dictionary where the values are the lines of collated alpha
    diversity files and the keys are the names of each of these files with
    no extension; this name is usually the metric used to compute the alpha
    diversity.
    depth: depth at which the alpha diversity values are averaged. If None is
    passed, the highest depth found in each file is used for that file.

    Output:
    metrics: list of '<key>_even_<depth>' labels, one per collated file
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean alpha diversity at the selected
    depth; each row is a sample and each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dict, or depth is neither None nor
    a non-negative int.
    ValueError: if the selected depth is not present in one of the files, or
    if the files do not list the same sample ids in the same order.
    """

    assert type(alpha_dict) is dict, "Input data must be a dictionary"
    # check the type before comparing, so a non-numeric depth fails this
    # assertion instead of raising a TypeError from the comparison itself
    assert depth is None or (type(depth) is int and depth >= 0), (
        "The specified depth must be a non-negative integer.")

    metrics = []
    sample_ids = []
    data = []

    for key, value in alpha_dict.items():
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # if depth is specified as None use the highest available, retrieve it
        # on a per file basis so you make sure the value exists for all files
        if depth is None:
            _depth = int(max([row[0] for row in rarefaction_data]))
        else:
            _depth = depth
        metrics.append('{0}_even_{1}'.format(key, _depth))

        # collect the rows at the desired depth once, dropping the first two
        # elements ([depth, iteration]) as those are not needed for the mean
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == _depth]

        # check there are elements with the desired rarefaction depth
        if not rows_at_depth:
            # sorted list of strings with the available rarefaction depths
            available_rarefaction_depths = [
                str(d) for d in sorted(set(row[0] for row in rarefaction_data))]
            raise ValueError(
                "The depth %d does not exist in the collated "
                "alpha diversity file for the metric: %s. The available depths "
                "are: %s." %
                (_depth, key, ', '.join(available_rarefaction_depths)))

        # check all the files have the same sample ids in the same order
        if sample_ids:
            if sample_ids != identifiers[3:]:
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # mean over the iterations available at this depth, per sample
        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
def main():
    """Parse the command line options and generate the rarefaction plots.

    Reads the collated rarefaction files given by the input option (either a
    directory, whose non-hidden entries are all parsed, or a comma-separated
    list of file paths), validates the image type, resolution and output
    directory options, and hands everything to make_averages. When
    make_averages returns HTML text, it is written to
    'rarefaction_plots.html' inside the output directory.

    NOTE(review): option_parser.error() presumably terminates the script (the
    standard optparse implementation exits with status 2), which would make
    the exit(0) calls that follow it unreachable fallbacks -- confirm against
    the parser returned by parse_command_line_parameters before removing them.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    imagetype = opts.imagetype
    resolution = opts.resolution
    output_dir = opts.output_dir
    ymax = opts.ymax
    std_type = opts.std_type
    suppress_webpage = opts.suppress_html_output
    output_type = opts.output_type
    generate_per_sample_plots = opts.generate_per_sample_plots
    generate_average_tables = opts.generate_average_tables

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = \
        sample_color_prefs_and_map_data_from_options(opts)

    # parsed rarefaction data keyed by file name
    rares = {}
    if isdir(input_dir):
        # skip hidden entries (names starting with a dot)
        rarenames = [r for r in listdir(input_dir) if not r.startswith('.')]
        for r in rarenames:
            try:
                # the context manager closes the handle even if parsing fails
                with open(path.join(input_dir, r), 'U') as rare_f:
                    rares[r] = parse_rarefaction(rare_f.readlines())
            except IOError:
                option_parser.error('Problem with rarefaction file. %s' %
                                    exc_info()[1])
                exit(0)
    else:
        # not a directory: treat the option as comma-separated file paths
        try:
            for input_fp in input_dir.split(','):
                # key the parsed data by the file name without its directory
                input_path = split(input_fp)[-1]
                with open(input_fp, 'U') as rare_f:
                    rares[input_path] = parse_rarefaction(rare_f.readlines())
        except IOError:
            option_parser.error('Problem with rarefaction file. %s' %
                                exc_info()[1])
            exit(0)

    if imagetype not in ['png', 'svg', 'pdf']:
        option_parser.error('Supplied extension not supported.')
        exit(0)

    try:
        resolution = int(resolution)
    except ValueError:
        option_parser.error('Invalid resolution.')
        exit(0)

    # output directory check: use the requested directory (creating it when
    # missing); fall back to a random directory name when none was given or
    # the current directory ('.') was requested
    if isinstance(output_dir, str) and output_dir != '.':
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except ValueError:
                option_parser.error('Could not create output directory.')
                exit(0)
    else:
        output_dir = get_random_directory_name()

    # Generate the plots and html text
    html_output = make_averages(prefs, data, background_color, label_color,
                                rares, output_dir, resolution, imagetype, ymax,
                                suppress_webpage, std_type, output_type,
                                generate_per_sample_plots=generate_per_sample_plots,
                                generate_average_tables=generate_average_tables)

    if html_output:
        # Write the html file.
        with open(path.join(output_dir, 'rarefaction_plots.html'),
                  'w') as outfile:
            outfile.write(html_output)
Beispiel #23
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, 
                              category, depth, test_type='nonparametric',
                              num_permutations=999):
    """compares alpha diversities between the values of a mapping category
    
    inputs:
        rarefaction_lines - lines of a rarefaction file which gives scores
        for various rarefactions and depths
        
        mapping_lines - lines of a mapping file that has ID's and the
        categories that the ID's fall in
        
        category - the category to be compared, is a string
        
        depth - the depth of the rarefaction file to use, is an integer

        test_type - the type of t-test to perform, is a string. Must be either
        'parametric' or 'nonparametric'

        num_permutations - the number of Monte Carlo permutations to use if
        test_type is 'nonparametric', is an integer
    
    outputs:
        results - a nested dictionary which specifies the category as
        the top level key, and as its value, a dictionary keyed by
        (str(value1), str(value2)) tuples, one per pair of category values,
        whose values are the (obs_t, p_val) tuple of the two sample t-test
        for that pair

    raises:
        ValueError - if test_type is not 'parametric' or 'nonparametric', or
        if test_type is 'nonparametric' and num_permutations is less than 1
    
    """
    # validate the options up front, so an invalid test type is rejected
    # before any parsing work and even when there are no pairs to compare
    if test_type not in ('parametric', 'nonparametric'):
        raise ValueError("Invalid test type '%s'." % test_type)
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)

    category_values_Ids = make_category_values_Id_dict(mapping_data,
                                                       category)

    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                     category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
        rarefaction_data)

    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(
        depth, rarefaction_data)

    results = {category: {}}

    # SampleId_pairs and value_pairs are indexed in parallel: the ids at
    # position idx belong to the category values at position idx
    for idx, id_pair in enumerate(SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(
            id_pair[0], reduced_rarefaction_mtx, map_from_Id_to_col).flatten()

        j = convert_SampleIds_to_rarefaction_mtx(
            id_pair[1], reduced_rarefaction_mtx, map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        else:
            # 'nonparametric' (test_type was validated on entry)
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            p_val = format_p_value_for_num_iters(p_val, num_permutations)

        results[category][(str(value_pairs[idx][0]),
                           str(value_pairs[idx][1]))] = obs_t, p_val
    return results