    def test_get_otu_table_info(self):
        """get_otu_table_info works"""
        otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3
0\t0\t2\t0
1\t1\t0\t0
2\t1\t1\t1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table)
        result, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        self.assertEqual(result['1'], {'sample1': '1', 'sample3': '0', 'sample2': '0'})
        self.assertEqual(result['0'], {'sample1': '0', 'sample3': '0', 'sample2': '2'})
        self.assertEqual(result['2'], {'sample1': '1', 'sample3': '1', 'sample2': '1'})
        self.assertEqual(num_samples, 3)
        self.assertEqual(taxonomy_info, {})

        #test that it parses otu tables with taxonomy fields appropriately
        otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3\tConsensus Lineage
0\t0\t2\t0\tBacteria; Bacteroidetes; Bacteroidales; Parabacteroidaceae; Unclassified; otu_475
1\t1\t0\t0\tBacteria; Bacteroidetes; Bacteroidales; adhufec77-25; Barnesiella; Barnesiella_viscericola; otu_369
2\t1\t1\t1\tBacteria; Firmicutes; Clostridia; Clostridiales; Faecalibacterium; Unclassified; otu_1121""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table)
        result, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        self.assertEqual(result['1'], {'sample1': '1', 'sample3': '0', 'sample2': '0'})
        self.assertEqual(result['0'], {'sample1': '0', 'sample3': '0', 'sample2': '2'})
        self.assertEqual(result['2'], {'sample1': '1', 'sample3': '1', 'sample2': '1'})
        self.assertEqual(num_samples, 3)
        self.assertEqual(taxonomy_info, {'1': 'Bacteria; Bacteroidetes; Bacteroidales; adhufec77-25; Barnesiella; Barnesiella_viscericola; otu_369', '0': 'Bacteria; Bacteroidetes; Bacteroidales; Parabacteroidaceae; Unclassified; otu_475', '2': 'Bacteria; Firmicutes; Clostridia; Clostridiales; Faecalibacterium; Unclassified; otu_1121'})
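The assertions above pin down the return shape: a dict keyed by OTU id whose values map sample ids to counts kept as strings, plus the number of samples and a (possibly empty) taxonomy dict. A minimal standalone sketch of that re-keying step (a hypothetical helper for illustration, not the QIIME implementation):

def rekey_by_otu(sample_ids, otu_ids, otu_data):
    """Build {otu_id: {sample_id: count-as-string}} from a parsed OTU table."""
    info = {}
    for otu_id, row in zip(otu_ids, otu_data):
        info[otu_id] = dict(zip(sample_ids, [str(c) for c in row]))
    return info, len(sample_ids)

# rekey_by_otu(['sample1', 'sample2', 'sample3'], ['0', '1', '2'],
#              [[0, 2, 0], [1, 0, 0], [1, 1, 1]])[0]['1']
# -> {'sample1': '1', 'sample2': '0', 'sample3': '0'}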
Example #2
    def test_sort_otu_table_error(self):
        """ sort_otu_table handles errors """

        self.assertRaises(ValueError,sort_otu_table,
            parse_otu_table(self.otu_table1),['NA','Key','Fing','Key'])
        self.assertRaises(KeyError,sort_otu_table,
            parse_otu_table(self.otu_table1),['NA','Key'])
    def test_get_single_paired_T_values(self):
        """get_single_paired_T_values works"""
        cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
        otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
        mapping_data, header, comments = parse_mapping_file(cat_mapping)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        OTU_list = ['0', '1', '2']
        
        before_vals, after_vals = get_single_paired_T_values('0', \
            mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
            sample_ids, otu_data, 999999999.0)
        self.assertFloatEqual(before_vals, [0.0, 0.0])
        self.assertFloatEqual(after_vals, [0.3, 0.2])
        #test of OTU1
        before_vals, after_vals = get_single_paired_T_values('1', \
            mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
            sample_ids, otu_data, 999999999.0)
        self.assertFloatEqual(before_vals, [0.0])
        self.assertFloatEqual(after_vals, [-0.2])
        #works when a sample is missing from the OTU table
        #e.g. if an after timepoint dropped out during rarefaction
        #will also drop the before
        otu_table2 = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table2, float)
        before_vals, after_vals = get_single_paired_T_values('0', \
            mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
            sample_ids, otu_data, 999999999.0)

        self.assertEqual(before_vals, [0.0])
        self.assertFloatEqual(after_vals, [0.3])
        #works when the before is missing
        otu_table3 = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table3, float)
        before_vals, after_vals = get_single_paired_T_values('0', \
            mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
            sample_ids, otu_data, 999999999.0)
        self.assertEqual(before_vals, [0.0])
        self.assertFloatEqual(after_vals, [0.3])
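The 999999999.0 sentinel marks missing observations; a before/after pair survives only when both samples are present in the table and neither value is the sentinel. A self-contained sketch of that pairing rule (semantics inferred from the assertions above; names are illustrative):

def pair_before_after(value_of, pairs, ignore_val=999999999.0):
    """value_of: {sample_id: value}; pairs: [(before_id, after_id), ...]."""
    before, after = [], []
    for b, a in pairs:
        if b in value_of and a in value_of:  # drop pair if either sample absent
            if value_of[b] != ignore_val and value_of[a] != ignore_val:
                before.append(value_of[b])
                after.append(value_of[a])
    return before, after

# For OTU '0' above the per-individual pairs are (s1,s2), (s3,s4), (s5,s6);
# s1/s2 carry the sentinel, leaving before=[0.0, 0.0] and after=[0.3, 0.2].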
Example #4
def test_wrapper(test, otu_table, category_mapping, category, threshold, \
                 _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations"""

    if test == 'ANOVA' or test == 'correlation':
        otu_table = convert_OTU_table_relative_abundance(otu_table)
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples= False, \
            category_mapping_info=category_info)
    elif test == 'g_test':
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        mapping_data, header, comments = parse_mapping_file(category_mapping)
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
        OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples= True, \
            category_mapping_info=category_info)
    else:
        raise ValueError(
            "An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test."
        )

    #filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError(
            "No OTUs remain after applying the filter. Try lowering the filter value (-f option)"
        )
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info, \
                        category_values)
        output = output_results_ANOVA(results, category_values, taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info,
                                       otu_sample_info)
        output = output_results_correlation(results, taxonomy_info)
    elif test == 'g_test':
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info, \
                         category_values)
        output = output_results_G_test(results, taxonomy_info)
    return output
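A hypothetical invocation of this wrapper, assuming QIIME 1.x-style inputs (lists of lines) and that the returned output is a list of tab-delimited result lines, as the output_results_* tests elsewhere on this page suggest; the file names, category, and filter value are placeholders:

otu_lines = open('otu_table.txt', 'U').readlines()
map_lines = open('category_mapping.txt', 'U').readlines()
result_lines = test_wrapper('ANOVA', otu_lines, map_lines,
                            category='Treatment', threshold=None, _filter=5)
out_f = open('ANOVA_results.txt', 'w')
out_f.write('\n'.join(result_lines))
out_f.close()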
Example #5
    def test_sort_otu_table(self):
        """ sort_otu_table fns as expected """

        actual = sort_otu_table(parse_otu_table(self.otu_table1),
                                ['NA','Key','Fing'])
        expected = parse_otu_table(self.age_sorted_otu_table1)
        # sample ids match expected
        self.assertEqual(actual[0],expected[0])
        # otu ids match expected
        self.assertEqual(actual[1],expected[1])
        # otu data match expected
        self.assertEqual(actual[2],expected[2])
        # taxa match expected
        self.assertEqual(actual[3],expected[3])
Example #6
    def test_sort_otu_table_by_mapping_field_some_values_differ(self):
        """ sort_otu_table fns when some values differ"""

        actual = sort_otu_table_by_mapping_field(parse_otu_table(self.otu_table1),
                              parse_mapping_file(self.mapping_f2),
                              sort_field = "Nothing")
        expected = parse_otu_table(self.nothing_sorted_otu_table1)
        # sample ids match expected
        self.assertEqual(actual[0],expected[0])
        # otu ids match expected
        self.assertEqual(actual[1],expected[1])
        # otu data match expected
        self.assertEqual(actual[2],expected[2])
        # taxa match expected
        self.assertEqual(actual[3],expected[3])
Example #7
    def test_sort_otu_table_by_mapping_field_some_values_same(self):
        """ sort_otu_table_by_mapping_field fns when all values are the same"""

        actual = sort_otu_table_by_mapping_field(parse_otu_table(self.otu_table1),
                              parse_mapping_file(self.mapping_f2),
                              sort_field = "Name")
        expected = parse_otu_table(self.name_sorted_otu_table1)
        # sample ids match expected
        self.assertEqual(actual[0],expected[0])
        # otu ids match expected
        self.assertEqual(actual[1],expected[1])
        # otu data match expected
        self.assertEqual(actual[2],expected[2])
        # taxa match expected
        self.assertEqual(actual[3],expected[3])
def pool_otu_table(otu_infile, otu_outfile, 
    pooled_sample_name, sample_ids_to_pool):
    """pools otu table file according to specified criteria."""

    ## otu table
    otu_table = parse_otu_table(otu_infile)
    pool_sample_idxs = []
    nonpool_sample_idxs = []
    for i in range(len(otu_table[0])): #sample ids
        if otu_table[0][i] in sample_ids_to_pool:
            pool_sample_idxs.append(i)
        else:
            nonpool_sample_idxs.append(i)
    
    new_sample_ids = []
    for i in range(len(otu_table[0])): #sample ids
        if otu_table[0][i] not in sample_ids_to_pool: 
            # from valid_states string on mapfile
            new_sample_ids.append(otu_table[0][i])
    new_sample_ids.append(pooled_sample_name)
    
    # otu mtx
    new_sample_abund = otu_table[2][:,pool_sample_idxs].sum(1)
    newdims = (len(otu_table[2]),len(new_sample_ids))

    new_otu_mtx = numpy.zeros(newdims,dtype=otu_table[2].dtype)
    new_otu_mtx[:,:-1] = otu_table[2][:,nonpool_sample_idxs]
    new_otu_mtx[:,-1] = new_sample_abund
    
    otu_outfile.write(format_otu_table(new_sample_ids, otu_table[1], 
        new_otu_mtx, taxonomy=otu_table[3]))
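The pooling itself is a single numpy expression: select the columns being merged and sum across them. In miniature:

import numpy

counts = numpy.array([[1, 2, 3],
                      [4, 0, 6]])          # 2 OTUs x 3 samples
pool_idxs = [0, 2]                         # columns to merge
pooled_col = counts[:, pool_idxs].sum(1)   # -> array([ 4, 10])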
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    data={}
    mapping,headers,comments = get_map(opts, data)
    
    mapping_headers_to_use=opts.mapping_headers_to_use
    background_color=opts.background_color
    monte_carlo_dist=opts.monte_carlo_dist
    ball_scale=opts.ball_scale
    arrow_line_color=opts.arrow_line_color
    arrow_head_color=opts.arrow_head_color
    
    taxonomy_count_file = opts.input_taxa_file
    
    if taxonomy_count_file:
        try:
            counts_f = open(taxonomy_count_file, 'U').readlines()
            sample_ids, otu_ids, otu_table, lineages = \
                       parse_otu_table(counts_f,count_map_f=float)
        except (TypeError, IOError):
            raise ValueError, 'Summarized taxa file could not be parsed.'
    else:
        otu_ids=None
        
    out = build_prefs_string(mapping_headers_to_use, background_color, \
                                monte_carlo_dist, headers, otu_ids, \
                                ball_scale, arrow_line_color, arrow_head_color)
                                
    f = open(opts.output_fp,'w')
    f.write(out)
    f.close()
Example #10
def filter_samples_from_otu_table(otu_table_lines,
                                  samples_to_discard,
                                  negate=False):
    """ Remove specified samples from OTU table """
    otu_table_data = parse_otu_table(otu_table_lines)
    
    sample_lookup = {}.fromkeys([e.split()[0] for e in samples_to_discard])
    new_otu_table_data = []
    new_sample_ids = []
    
    if negate:
        def keep_sample(s):
            return s in sample_lookup
    else:
        def keep_sample(s):
            return s not in sample_lookup
    
    sample_ids, otu_ids, otu_table_data, taxa = otu_table_data
    otu_table_data = otu_table_data.transpose()
    
    for row,sample_id in zip(otu_table_data,sample_ids):
        if keep_sample(sample_id):
            new_otu_table_data.append(row)
            new_sample_ids.append(sample_id)
    
    new_otu_table_data = array(new_otu_table_data).transpose()
    
    result = format_otu_table(new_sample_ids,
                              otu_ids,
                              new_otu_table_data,
                              taxa,
                              skip_empty=True).split('\n')
    return result
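Transposing makes each sample a row, so keeping or dropping samples becomes ordinary row filtering, and a second transpose restores the OTUs-by-samples orientation. A standalone illustration:

from numpy import array

data = array([[1, 2, 3],
              [4, 5, 6]])                 # OTUs x samples
sample_ids = ['a', 'b', 'c']
kept = [row for row, sid in zip(data.transpose(), sample_ids) if sid != 'b']
filtered = array(kept).transpose()        # -> [[1, 3], [4, 6]]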
    def test_run_single_paired_T_test(self):
        """run_single_paired_T_test works
        """
        cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
        otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
        mapping_data, header, comments = parse_mapping_file(cat_mapping)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        OTU_list = ['0', '1', '2']
        #should return the results since there should be 4 values to evaluate
        result = run_single_paired_T_test('0', mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 4)
        self.assertEqual(len(result), 4)
        self.assertFloatEqual(result[1], 0.12566591637800242)
        self.assertFloatEqual(result[2], [0.29999999999999999, 0.20000000000000001])
        self.assertEqual(result[3], 2)
        #check that the filter works
        result = run_single_paired_T_test('0', mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 5)
        self.assertEqual(result, None)
    def test_output_results_paired_T_test(self):
        """output_results_paired_T_test works
        """
        cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
        otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
        mapping_data, header, comments = parse_mapping_file(cat_mapping)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        OTU_list = ['0', '1', '2']
        all_results = run_paired_T_test_OTUs(OTU_list, mapping_data, header, \
            'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
            999999999.0, 4)
        output = output_results_paired_T_test(all_results)
        #of = open('/Users/lozupone/temp_output.xls', 'w')
        #of.write('\n'.join(output))
        #of.close()
        self.assertEqual(output, ['OTU\tprob\tT stat\taverage_diff\tnum_pairs\tBonferroni_corrected\tFDR_corrected', '0\t0.125665916378\t-5.0\t0.25\t2\t0.251331832756\t0.251331832756', '2\t0.685730319473\t0.468164588785\t-0.133333333333\t3\t1.37146063895\t0.685730319473'])
def calc_shared_phylotypes(infile, reference_sample=None):
    """Calculates number of shared phylotypes for each pair of sample.

    infile: otu table filehandle

    reference_sample: if set, will use this sample name to calculate shared OTUs
                      between reference sample, and pair of samples. Useful, 
                      e.g. when the reference sample is the Donor in a transplant study
    """

    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)
 
    if reference_sample:
        ref_idx = sample_ids.index(reference_sample)
    (n,m) = otu_table.shape
    result_array = zeros((m,m), dtype=int)
    for i in range(m):
        for j in range (i+1):
            if reference_sample:
                result_array[i,j] = result_array[j,i] = \
                    _calc_shared_phylotypes_multiple(otu_table, [i, j, ref_idx])
            else:  
                result_array[i,j] = result_array[j,i] = \
                    _calc_shared_phylotypes_pairwise(otu_table, i, j)
                
    return format_distance_matrix(sample_ids, result_array)+"\n"
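A usage sketch (the file name is a placeholder, and the _calc_shared_phylotypes_* helpers are assumed to live in the same module):

dm_str = calc_shared_phylotypes(open('otu_table.txt', 'U'))
open('shared_phylotypes_dm.txt', 'w').write(dm_str)
# with a reference sample, each entry counts OTUs shared by samples i, j AND
# the reference, e.g.:
# dm_str = calc_shared_phylotypes(open('otu_table.txt', 'U'), 'Donor')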
    def test_make_new_otu_counts(self):
        """make_new_otu_counts works
        """
        mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
        mapping_data, header, comments = parse_mapping_file(mapping_lines)
        samples_from_subject, sample_to_subtract = \
            get_sample_individual_info(mapping_data, header, 'individual', \
            'timepoint_zero')
        otu_lines = """# QIIME v1.2.0-dev OTU table
#OTU ID\tAT0\tAT1\tS1\tAT2\tBT0\tBT1\tBT2
0\t0.5\t0.3\t99\t0.2\t0.0\t0.0\t0.0
1\t0.0\t0.0\t99\t0.0\t0.4\t0.5\t0.6
2\t0.1\t0.4\t99\t0.7\t0.5\t0.6\t0.8
3\t0.0\t0.1\t99\t0.0\t0.4\t0.0\t0.0
""".split('\n')
        otu_table = parse_otu_table(otu_lines, float)
        sample_ids, otu_ids, otu_counts, consensus = otu_table
        converted_otu_table = make_new_otu_counts(otu_ids, sample_ids, otu_counts, consensus, sample_to_subtract, samples_from_subject)
        converted_otu_table = converted_otu_table.split('\n')
        self.assertEqual(converted_otu_table[1], "#OTU ID\tAT0\tAT1\tAT2\tBT0\tBT1\tBT2")
        self.assertEqual(converted_otu_table[2], "0\t0.0\t-0.2\t-0.3\t999999999.0\t999999999.0\t999999999.0")
        self.assertEqual(converted_otu_table[3], "1\t999999999.0\t999999999.0\t999999999.0\t0.0\t0.1\t0.2")
        self.assertEqual(converted_otu_table[4], "2\t0.0\t0.3\t0.6\t0.0\t0.1\t0.3")
        self.assertEqual(converted_otu_table[5], "3\t0.0\t0.1\t0.0\t0.0\t-0.4\t-0.4")
    def test_sum_counts_by_consensus(self):
        """should sum otu counts by consensus"""
        otu_table = parse_otu_table(self.otu_table)
        obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 3)
        exp_result = {('Root','Bacteria','Actinobacteria'):array([1,0,2,4]),
                      ('Root','Bacteria','Firmicutes'):array([1,3,1,1]),
                      ('Root','Bacteria','Other'):array([1,2,1,0])}
        exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
        self.assertEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 2)
        exp_result = {('Root','Bacteria'):array([3,5,4,5])}
        exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
        self.assertEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 4)
        exp_result = {('Root','Bacteria','Actinobacteria','Actinobacteria'):\
                array([1,0,2,4]),
                      ('Root','Bacteria','Firmicutes','"Clostridia"'):\
                              array([1,3,1,1]),
                      ('Root','Bacteria','Other','Other'):array([1,2,1,0])}
        exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
        self.assertEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)
Example #17
    def test_sort_otu_table_by_mapping_field_error(self):
        """ sort_otu_table_by_mapping_field fails on samples in otu table but not mapping"""

        self.assertRaises(KeyError,sort_otu_table_by_mapping_field,
                                   parse_otu_table(self.otu_table1_bad_sampleID),
                                   parse_mapping_file(self.mapping_f2),
                                   sort_field = "Age")
Example #18
def R_format_otu_table(otu_filepath, output_dir=None, write_to_tmp_file=True):
    """Formats OTU table for R (remove comments & column 1 header)
       If write_to_tmp_file, writes formatted file to tmp file and returns path
       else, returns lines to go in file
    """
    sample_ids, otu_ids, otu_matrix, lineages = \
        parse_otu_table(open(otu_filepath,'U').readlines())
    # first line is sample ids, no header for first column (how R likes it)
    lines = ['\t'.join(sample_ids)]
    for i in xrange(len(otu_ids)):
        # note: casting array as a string and calling "split" is much faster
        # than mapping "str" onto the array
        array_as_strings = str(otu_matrix[i, :])[1:-1].split()
        lines.append(otu_ids[i] + '\t' + '\t'.join(array_as_strings))
    if write_to_tmp_file:
        if output_dir is None:
            tmp_fp = get_tmp_filename(prefix='otus_R_format', suffix='.txt')
        else:
            tmp_fp = join(output_dir, 'otus_R_format.txt')
        fout = open(tmp_fp, 'w')
        fout.write('\n'.join(lines))
        fout.close()
        return tmp_fp
    else:
        return lines
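The speed trick in the comment leans on numpy's string form of a 1-D array: strip the brackets and split on whitespace to get the per-element strings without a Python-level map(str, ...). For example:

import numpy

row = numpy.array([19111, 44536, 42])
fields = str(row)[1:-1].split()   # -> ['19111', '44536', '42']
# caveat: very wide rows may need numpy.set_printoptions(threshold=...) so the
# string form is not elided with '...'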
Example #19
def filter_otus_from_otu_table(otu_table_lines,otus_to_discard,negate=False):
    """ Remove specified OTUs from otu_table """
    otu_table_data = parse_otu_table(otu_table_lines)
    
    otu_lookup = {}.fromkeys([e.split()[0] for e in otus_to_discard])
    new_otu_table_data = []
    new_otu_ids = []
    new_taxa = []
    
    if negate:
        def keep_otu(s):
            return s in otu_lookup
    else:
        def keep_otu(s):
            return s not in otu_lookup
    
    sample_ids, otu_ids, otu_table_data, taxa = otu_table_data
    
    for row,otu_id,taxonomy in zip(otu_table_data,otu_ids,taxa):
        if keep_otu(otu_id):
            new_otu_table_data.append(row)
            new_otu_ids.append(otu_id)
            new_taxa.append(taxonomy)
    
    new_otu_table_data = array(new_otu_table_data)
            
    result = format_otu_table(sample_ids,
                              new_otu_ids,
                              new_otu_table_data,
                              new_taxa).split('\n')
    return result
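A hypothetical call (file names are placeholders): otus_to_discard is any iterable of lines whose first whitespace-delimited field is an OTU id, e.g. the output of a chimera checker:

table_lines = open('otu_table.txt', 'U')
discard_lines = open('otus_to_discard.txt', 'U').readlines()
filtered_lines = filter_otus_from_otu_table(table_lines, discard_lines)
# pass negate=True to keep ONLY the listed OTUs instead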
    def test_filter_OTUs(self):
        """filter_OTUs works"""
        otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3
0\t0\t2\t0
1\t1\t0\t0
2\t1\t1\t1""".split('\n')
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        OTU_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        result = filter_OTUs(OTU_sample_info, 2)
        self.assertEqual(result, [])
        result = filter_OTUs(OTU_sample_info, 1)
        self.assertEqual(result, ['1', '0'])
        
        result = filter_OTUs(OTU_sample_info, 2, False)
        self.assertEqual(result, ['2'])
        result = filter_OTUs(OTU_sample_info, 1, False)
        self.assertEqual(result, ['1', '0', '2'])
        #test that is works if a category mapping file is supplied
        cat_mapping = {'sample2': '0', 'sample3': '1'}
        result = filter_OTUs(OTU_sample_info, 1,\
                        category_mapping_info=cat_mapping)
        self.assertEqual(result, ['0'])
def _filter_table_neg_control(otu_table_lines, samples):
    """removes OTUs from OTU_table that are found in one of the samples in the sample list
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)
    new_otu_table = []
    new_otu_ids = []
    new_lineages = []
    #get the sample indices to remove
    sample_indices = []
    for i in samples:
        if i in sample_ids:
            index = sample_ids.index(i)
            sample_indices.append(index)

    for i, row in enumerate(otu_table):
        #figure out if the OTU is in any of the negative controls
        count = 0
        for j in sample_indices:
            count += row[j]
        #only write it to the new OTU table if it is not
        if count == 0:
            if lineages:
                new_lineages.append(lineages[i])
            new_otu_table.append(list(row))
            new_otu_ids.append(otu_ids[i])
    new_otu_table = array(new_otu_table)
    result = format_otu_table(sample_ids, new_otu_ids, new_otu_table, new_lineages)
    result = result.split('\n')
    #remove the samples
    return _filter_table_samples(result, 1)
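The screen keeps an OTU only when its total count across the negative-control columns is zero. In miniature:

row = [0, 5, 0, 2]           # one OTU's counts across all samples
neg_control_idxs = [0, 2]    # column indices of the negative controls
keep = sum(row[j] for j in neg_control_idxs) == 0   # True -> OTU survives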
def main():
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)

    otu_table_data = parse_otu_table(open(opts.input_otu_table,'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    
    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp,'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp,'U'))
        result = sort_otu_table(otu_table_data,
                                sorted_sample_ids)
    else:
        option_parser.error("must provide either --sort_field and --mapping_fp OR --sorted_sample_ids_fp")

    # format and write the otu table
    result_str = format_otu_table(result[0],result[1],result[2],result[3])
    of = open(opts.output_fp,'w')
    of.write(result_str)
    of.close()
def test_wrapper(test, otu_table, category_mapping, category, threshold, \
        _filter, otu_include=None, ignore_val=None, \
        otu_table_relative_abundance=False, individual_column='individual',\
        timepoint_zero_column='timepoint_zero'):
    """runs statistical test to look for category/OTU associations"""
    if ignore_val == 'None':
        ignore_val = None
    
    if test == 'ANOVA' or test == 'correlation' or test == 'paired_T': 
        if not otu_table_relative_abundance:
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        all_samples = False
    elif test == 'g_test':
        all_samples = True
    else:
        raise ValueError("An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, g_test, paired_T.")
    
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table, float)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    if not test == 'paired_T':
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
    #do not apply the filter_OTUs to the longitudinal studies as they are
    #filtered later
    if test == 'ANOVA' or test == 'correlation' or test == 'g_test':
        OTU_list = filter_OTUs(otu_sample_info, _filter, \
            all_samples= all_samples, category_mapping_info=category_info)
    elif test == 'longitudinal_correlation' or test == 'paired_T':
        OTU_list = otu_sample_info.keys()

    #filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError("No OTUs remain after applying the filter. Try lowering the filter value (-f option)")
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info, \
                        category_values)
        output = output_results_ANOVA(results, category_values, taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info, \
            otu_sample_info, ignore_val=ignore_val, filter=_filter)
        output = output_results_correlation(results, taxonomy_info)
    elif test == 'g_test':
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info, \
                         category_values)
        output = output_results_G_test(results, taxonomy_info)
    elif test == 'paired_T':
        #category info in this case should be the timepoint_zero column.
        #The timepoint_zero column should be used as the category in the wrapper
        results = run_paired_T_test_OTUs(OTU_list, mapping_data, header, \
            individual_column, timepoint_zero_column, otu_ids, sample_ids, \
            otu_data,ignore_val=ignore_val, filter=_filter) 
        output = output_results_paired_T_test(results, taxonomy_info)
    return output
Example #25
def compute_seqs_per_library_stats(otu_f):
    counts = []
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_f)
    for i in range(otu_table.shape[1]):
        counts.append(sum(otu_table[:, i]))

    return min(counts), max(counts), median(counts), mean(counts),\
     dict(zip(sample_ids,counts))
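A sketch of calling it, with a placeholder path; the last return value maps each sample id to its sequence total:

otu_f = open('otu_table.txt', 'U')
min_c, max_c, median_c, mean_c, counts_per_sample = \
    compute_seqs_per_library_stats(otu_f)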
def test_wrapper_multiple(test, otu_table_paths, category_mapping, category, threshold, \
                _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations on multiple files.
       Unlike the test_wrapper() method, this method includes all OTUs, even when 
       some have zero counts.
    """
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    category_info, category_values = \
        get_category_info(mapping_data, header, category, threshold)
    
    # if this is the g_test, disallow otus that are present in all samples 
    filter_all_samples = test == "g_test"

    OTU_list, taxonomy_all_OTUs = get_common_OTUs(otu_table_paths, _filter, \
                                  category_info=category_info, \
                                  filter_all_samples=filter_all_samples, \
                                  otu_include=otu_include)

    all_results = {}
    count = 0
    for otu_table_fp in otu_table_paths:
        count += 1
        sys.stdout.flush()
        otu_table = open(otu_table_fp,'U')

        if test == 'ANOVA' or test == 'correlation': 
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        elif not test=='g_test':
            raise ValueError("An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test.")
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)

        if test == 'ANOVA':
            results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info, \
                                         category_values)
        elif test == 'correlation':
            results = run_correlation_OTUs(OTU_list, category_info, otu_sample_info)
        elif test == 'g_test':
            results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info, \
                        category_values, suppress_warnings=True)
        for OTU in results.keys():
            if not all_results.has_key(OTU):
                all_results[OTU] = []
            all_results[OTU].append(results[OTU])
    
    # aggregate multiple results and create output string
    if test == 'ANOVA':
        all_results = aggregate_multiple_results_ANOVA(all_results)
        output = output_results_ANOVA(all_results, category_values, taxonomy_all_OTUs)
    elif test == 'correlation':
        all_results = aggregate_multiple_results_correlation(all_results)
        output = output_results_correlation(all_results, taxonomy_all_OTUs)
    elif test == 'g_test':
        all_results = aggregate_multiple_results_G_test(all_results)
        output = output_results_G_test(all_results, taxonomy_all_OTUs)
    return output
def _filter_table_samples(otu_table_lines, min_seqs_per_sample):
    """removes samples from OTU_table that have less than min_seqs_per_sample
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)
    counts = sum(otu_table)
    big_enough_samples = (counts>=int(min_seqs_per_sample)).nonzero()
    res_otu_table = otu_table.copy()
    res_otu_table = res_otu_table[:,big_enough_samples[0]]
    res_sample_ids = map(sample_ids.__getitem__, big_enough_samples[0])
    return format_otu_table(res_sample_ids, otu_ids, res_otu_table, lineages)
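Note the subtlety: the builtin sum over a 2-D numpy array adds its rows elementwise, so sum(otu_table) yields per-sample column totals, and nonzero() on the boolean comparison gives the indices of the surviving columns. In miniature:

import numpy

otu_table = numpy.array([[5, 0, 2],
                         [1, 0, 3]])
counts = sum(otu_table)                  # -> array([6, 0, 5]), per-sample totals
big_enough = (counts >= 1).nonzero()[0]  # -> array([0, 2])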
    def setUp(self):
        self.otu_table_as_string = ["#Test otu table",
                       "\t".join(["#OTU ID","S1","S2","S3"]),
                       "\t".join(["0",      "1" ,"0" ,"2" ]),
                       "\t".join(["1",      "1" ,"2" ,"0" ]),
                       "\t".join(["2",      "1" ,"0" ,"0" ]),
                       "\t".join(["3",      "1" ,"0" ,"2" ]),
                       "\t".join(["4",      "1" ,"1" ,"2" ])]

        _, _, self.otu_table, _ = parse_otu_table(self.otu_table_as_string)
Example #30
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    output_dir = opts.out_fp
    output_basename = splitext(split(otu_table_fp)[1])[0]
    
    if not output_dir:
        output_dir = 'make_tep_output/'
    
    create_dir(output_dir)
    
    tep_fp = '%s/%s.tep' % (output_dir,output_basename)      # opts.out_fp+'.tep'
    jnlp_fp = '%s/%s.jnlp' % (output_dir,output_basename)
    tepfile = open(tep_fp, 'w')
    otu_lines = open(otu_table_fp, 'U').readlines()
    sample_ids, otu_ids, otu_table, metadata = parse_otu_table(otu_lines)
    mapping_lines = open(mapping_fp, 'U')    
    tree_lines = open(tree_fp, 'U')
    
    lines = ['>>tre\n']
    lines += tree_lines.readlines() 
    lines += '\n'
    if(metadata):
        lines += '>>otm\n#OTU ID\tOTU Metadata\n'
        for i in range(len(otu_ids)):
            lines += otu_ids[i] + '\t'
            for m in metadata[i]:
                lines += m + ';'
            # lines = lines[:len(lines)-1]
            lines += '\n'
    lines += '>>osm\n'
    lines += otu_lines
    lines += '\n>>sam\n'
    lines += mapping_lines.readlines()
    
    tepfile.writelines(lines)
    
    jnlpfile = open(jnlp_fp, 'w')
    lines = [jnlp_top_block]
    if(opts.web_flag):
        lines += 'http://topiaryexplorer.sourceforge.net/app/'
    else:
        lines += 'file:'+load_qiime_config()['topiaryexplorer_project_dir']
    lines += jnlp_middle_block
    if(opts.url):
        lines += opts.url
    else:
        lines += os.path.abspath(tep_fp)
    # lines += os.path.abspath(tep_fp)
    lines += jnlp_bottom_block
    jnlpfile.writelines(lines)
    def setUp(self):
        """Define a few simple tables"""
        self.otu_str = """#Full OTU Counts
#OTU ID\ta\tb\tc\td\te
1\t1\t2\t3\t4\t5
2\t5\t4\t3\t2\t1"""
        self.otu_table = parse_otu_table(StringIO(self.otu_str))
        self.otu_tax_str = """#Full OTU Counts
#OTU ID\ta\tb\tc\td\te\tConsensus Lineage
1\t1\t2\t3\t4\t5\tBacteria:Firmicutes
2\t5\t4\t3\t2\t1\tBacteria:Proteobacteria"""
        self.otu_tax_table = parse_otu_table(StringIO(self.otu_tax_str))
        self.map_str = """#SampleID\tStudy\tBodySite\tDescription
a\tDog\tStool\tx
b\tDog\tStool\ty
c\tHand\tPalm\tz
d\tWholeBody\tPalm\ta
e\tWholeBody\tStool\tb"""
        self.map_data, self.map_headers, self.map_comments =\
         parse_mapping_file(StringIO(self.map_str))
def longitudinal_otu_table_conversion_wrapper(otu_table, category_mapping,\
    individual_column, timepoint_zero_column):
    """returns the modified otu_table"""
    otu_table = parse_otu_table(otu_table)
    otu_table = convert_otu_table_relative(otu_table)
    sample_ids, otu_ids, otu_counts, consensus = otu_table
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    samples_from_subject, sample_to_subtract = \
        get_sample_individual_info(mapping_data, header, individual_column, \
        timepoint_zero_column)
    return make_new_otu_counts(otu_ids, sample_ids, otu_counts, consensus, \
        sample_to_subtract, samples_from_subject)
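A hypothetical call (paths are placeholders) producing the sentinel-masked, baseline-subtracted table that the paired-T machinery above consumes, returned as a single string:

converted_table_str = longitudinal_otu_table_conversion_wrapper(
    open('otu_table.txt', 'U'),
    open('mapping.txt', 'U').readlines(),
    'individual', 'timepoint_zero')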
Example #33
def get_otu_counts(fpath, data):
    """Reads the OTU table file into memory"""
    try:
        sample_ids, otu_ids, otu_table, lineages = parse_otu_table(
            open(fpath, 'U'))
    except (TypeError, IOError):
        raise MissingFileError, 'OTU Count file required for this analysis'

    if lineages == []:
        raise ValueError, '\n\nThe lineages are missing from the OTU table.  If you used single_rarefaction.py to create your otu_table, make sure you pass the "--lineages_included" option.\n'

    return sample_ids, otu_ids, otu_table, lineages
    def test_get_counts_by_cat(self):
        """get_counts_by_cat should return hand calculated values"""
        cat_otu_table, otus, taxonomy = get_counts_by_cat(\
            self.otu_sample_file.split('\n'), self.num_cats,\
            self.meta_dict, self.labels_lists_dict["Day"], "Day",\
            self.num_samples_by_cat, False)
        cat_otu_table_test = []
        for l in cat_otu_table:
            cat_otu_table_test.append('\t'.join(map(str, l)))
        self.assertEqual('\n'.join(cat_otu_table_test), self.cat_otu_table)
        self.assertEqual(otus, self.otus)
        self.assertEqual(taxonomy, self.taxonomy)
        cat_otu_table, otus, taxonomy = get_counts_by_cat(\
            self.otu_sample_file.split('\n'), self.num_cats,\
            self.meta_dict, self.labels_lists_dict["Day"], "Day",\
            self.num_samples_by_cat, True)
        cat_otu_table_test = []
        for l in cat_otu_table:
            cat_otu_table_test.append('\t'.join(map(str, l)))
        sams, otunames, obs, lineages = parse_otu_table(cat_otu_table_test, float)
        sams, otunames, exp, lineages = parse_otu_table(self.cat_otu_table_norm.split('\n'), float)
        self.assertEqual(otus, self.otus)
        self.assertEqual(taxonomy, self.taxonomy)
def get_otu_counts(fpath, data):
    """Reads the OTU table file into memory"""
    try:
        sample_ids,otu_ids,otu_table,lineages = \
            parse_otu_table(open(fpath,'U'), count_map_f=float)
    except (TypeError, IOError):
        raise MissingFileError, 'OTU Count file required for this analysis'
    
    if lineages==[]:
        raise ValueError, '\n\nThe lineages are missing from the OTU table.  If you used single_rarefaction.py to create your otu_table, make sure you pass the "--lineages_included" option.\n'
        
    return sample_ids,otu_ids,otu_table,lineages
Example #36
    def test_parse_otu_table(self):
        """parse_otu_table functions as expected with new-style OTU table
        """
        data = self.otu_table1
        data_f = (data.split('\n'))
        obs = parse_otu_table(data_f)
        exp = (['Fing','Key','NA'],
               ['0','1','2','3','4'],
               array([[19111,44536,42],[1216,3500,6],[1803,1184,2],
                      [1722,4903,17],[589,2074,34]]),
               self.expected_lineages1)
        self.assertEqual(obs, exp)
Example #37
def get_job_commands_single_otu_table(python_exe_fp,
                                      beta_diversity_fp,
                                      tree_fp,
                                      job_prefix,
                                      metrics,
                                      input_fp,
                                      output_dir,
                                      working_dir,
                                      jobs_to_start,
                                      command_prefix=None,
                                      command_suffix=None):
    """Generate beta diversity to split single OTU table to multiple jobs
    
    always passes -f to beta_diversity.py
    """

    command_prefix = command_prefix or '/bin/bash; '
    command_suffix = command_suffix or '; exit'

    commands = []
    result_filepaths = []

    sids = parse_otu_table(open(input_fp, 'U'))[0]

    sample_id_groups = merge_to_n_commands(sids, jobs_to_start, ',', '', '')
    for i, sample_id_group in enumerate(sample_id_groups):
        working_dir_i = os.path.join(working_dir, str(i))
        output_dir_i = os.path.join(output_dir, str(i))
        input_dir, input_fn = split(input_fp)
        sample_id_desc = sample_id_group.replace(',', '_')
        output_fns = ['%s_%s' % (metric, input_fn) \
         for metric in metrics.split(',')]
        rename_command, current_result_filepaths = get_rename_command(\
         output_fns,working_dir_i,output_dir_i)

        result_filepaths += current_result_filepaths

        command = '%s %s %s -i %s -o %s -t %s -m %s -f -r %s %s %s' %\
         (command_prefix,\
          python_exe_fp,\
          beta_diversity_fp,\
          input_fp,
          working_dir_i + '/',
          tree_fp,
          metrics,
          sample_id_group,
          rename_command,
          command_suffix)

        commands.append(command)

    return commands, result_filepaths
    def test_add_summary_category_mapping(self):
        """add_summary_mapping works
        """
        otu_table = parse_otu_table(self.otu_table, int)
        mapping, header, comments = parse_mapping_file(self.mapping)
        summary, taxon_order = add_summary_mapping(otu_table, mapping, 3)
        self.assertEqual(taxon_order, [('Root','Bacteria','Actinobacteria'),
                                       ('Root','Bacteria','Firmicutes'),
                                       ('Root','Bacteria','Other')])
        self.assertEqual(summary, {'s1':[1,1,1],
                                   's2':[0,3,2],
                                   's3':[2,1,1],
                                   's4':[4,1,0]})
Example #39
    def getOtuTable(self, otu_source):
        """Returns parsed OTU table from putative OTU source."""

        #if we have a string starting with #, assume it's an OTU file,
        #otherwise assume it's a path
        # if 4-tuple, just return it
        if type(otu_source) == type((1, 3, 4, 44)):
            return otu_source
        if hasattr(otu_source, 'startswith') and otu_source.startswith('#'):
            try:
                return parse_otu_table(StringIO(otu_source))
            except (TypeError, ValueError), e:
                raise OtuMissingError, \
                    "Tried to read OTUs from string starting with # but got "+e
Example #40
def get_taxa(taxa_fname, sample_ids_kept=None):
    """Opens and returns taxon summaries
       Parameters
        sample_ids, optional list of ids; all other ids are removed

       Returns lineages, counts
    """
    # future: pass in open file object instead
    taxa_f = open(taxa_fname, 'U')

    sample_ids, otu_ids, otu_table, lineages =\
        parse_otu_table(taxa_f,count_map_f=float,remove_empty_rows=True)
    if sample_ids_kept:
        sam_idxs = [sample_ids.index(sam) for sam in sample_ids_kept]
        otu_table = otu_table[:, sam_idxs]
    return otu_ids, otu_table
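A sketch of reading a taxa summary and restricting it to a subset of samples (the file name and sample ids are placeholders); the returned columns follow the order of sample_ids_kept:

lineages, counts = get_taxa('taxa_summary.txt', sample_ids_kept=['s1', 's3'])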
Example #41
def get_common_OTUs(otu_table_paths, _filter, category_info, \
                        filter_all_samples, otu_include):
    """Searches all OTU tables in dir, returns common OTUs and their taxonomies
       Applies filter within each OTU table."""
    OTU_list = set()
    taxonomy_all_OTUs = {}
    count = 0

    # get list of all observed OTUs and their taxonomies
    for otu_table_fp in otu_table_paths:
        count += 1
        sys.stdout.flush()
        otu_table = open(otu_table_fp, 'U')
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        otu_table.close()
        OTU_list_i = filter_OTUs(otu_sample_info, _filter, all_samples=filter_all_samples, \
            category_mapping_info=category_info)

        if count == 1:
            OTU_list = set(OTU_list_i)
        else:
            OTU_list &= set(OTU_list_i)

        for OTU in taxonomy_info.keys():
            taxonomy_all_OTUs[OTU] = taxonomy_info[OTU]

    #filter OTU_list with the otu_include list
    if not otu_include is None:
        otu_include = [line.strip() for line in otu_include]
        OTU_list &= set(otu_include)
        if len(OTU_list) == 0:
            raise ValueError(
                "No OTUs remain after applying the filter. Try lowering the filter value (-f option)"
            )

    # remove taxonomies for OTUs not in OTU_list
    for k in taxonomy_all_OTUs.keys():
        if not k in OTU_list:
            del (taxonomy_all_OTUs[k])
    return OTU_list, taxonomy_all_OTUs
Example #42
    def test_parse_otu_table_float_counts(self):
        """parse_otu_table should return correct result from small table"""
        data = """#Full OTU Counts
#OTU ID	Fing	Key	NA	Consensus Lineage
0	19111	44536	42	Bacteria; Actinobacteria; Actinobacteridae; Propionibacterineae; Propionibacterium
1	1216	3500	6	Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Lactobacillales; Lactobacillales; Streptococcaceae; Streptococcus
2	1803	1184	2	Bacteria; Actinobacteria; Actinobacteridae; Gordoniaceae; Corynebacteriaceae
3	1722	4903	17	Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Staphylococcaceae
4	589	2074	34	Bacteria; Cyanobacteria; Chloroplasts; vectors"""
        data_f = (data.split('\n'))
        obs = parse_otu_table(data_f, count_map_f=float)
        exp = (['Fing','Key','NA'],
               ['0','1','2','3','4'],
               array([[19111.,44536.,42.],[1216.,3500.,6.],[1803.,1184.,2.],\
                    [1722.,4903.,17.], [589,2074.,34.]]),
               [['Bacteria','Actinobacteria','Actinobacteridae','Propionibacterineae','Propionibacterium'],
                ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Lactobacillales','Lactobacillales','Streptococcaceae','Streptococcus'],
                ['Bacteria','Actinobacteria','Actinobacteridae','Gordoniaceae','Corynebacteriaceae'],
                ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Staphylococcaceae'],
                ['Bacteria','Cyanobacteria','Chloroplasts','vectors']])
        self.assertEqual(obs, exp)
Example #43
    def test_parse_otu_table_file(self):
        """parse_otu_table should return correct result on fileio format object"""
        data = """#Full OTU Counts
#OTU ID	Fing	Key	NA	Consensus Lineage
0	19111	44536	42	Bacteria; Actinobacteria; Actinobacteridae; Propionibacterineae; Propionibacterium
1	1216	3500	6	Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Lactobacillales; Lactobacillales; Streptococcaceae; Streptococcus
2	1803	1184	2	Bacteria; Actinobacteria; Actinobacteridae; Gordoniaceae; Corynebacteriaceae
3	1722	4903	17	Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Staphylococcaceae
4	589	2074	34	Bacteria; Cyanobacteria; Chloroplasts; vectors"""
        data_f = StringIO(data)
        obs = parse_otu_table(data_f)
        exp = (['Fing','Key','NA'],
               ['0','1','2','3','4'],
               array([[19111,44536,42],[1216,3500,6],[1803,1184,2],\
                    [1722,4903,17], [589,2074,34]]),
               [['Bacteria','Actinobacteria','Actinobacteridae','Propionibacterineae','Propionibacterium'],
                ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Lactobacillales','Lactobacillales','Streptococcaceae','Streptococcus'],
                ['Bacteria','Actinobacteria','Actinobacteridae','Gordoniaceae','Corynebacteriaceae'],
                ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Staphylococcaceae'],
                ['Bacteria','Cyanobacteria','Chloroplasts','vectors']])
        self.assertEqual(obs, exp)
Example #44
def merge_otu_tables(otu_table_f1, otu_table_f2):
    """ Merge two otu tables with the same sample IDs
    
        WARNING: The OTU ids must refer to the same OTUs, which
         typically only happens when OTUs were picked against a 
         reference database, as with the BLAST OTU picker.
    
    """
    sample_ids1, otu_ids1, otu_table1, lineages1 =\
        parse_otu_table(otu_table_f1)
    sample_ids2, otu_ids2, otu_table2, lineages2 =\
        parse_otu_table(otu_table_f2)

    assert set(sample_ids1) & set(sample_ids2) == set(),\
     'Overlapping sample ids detected.'
    sample_ids_result = sample_ids1 + sample_ids2
    sample_ids_result_lookup = dict([
        (sid, i) for i, sid in enumerate(sample_ids_result)
    ])

    # Will need to add support for OTU tables wo tax info at some
    # point -- in a rush now so don't have time to add it without an
    # immediate use case.
    if lineages1 and lineages2:
        # map OTU ids to lineages -- in case of conflicts (i.e, OTU assigned)
        # different lineage in different otu tables, the lineage from
        # OTU table 1 will be taken
        lineages = True
        otu_id_to_lineage = dict(zip(otu_ids1, lineages1))
        otu_id_to_lineage.update(dict([(otu_id,lineage)\
         for otu_id,lineage in zip(otu_ids2,lineages2)\
         if otu_id not in otu_id_to_lineage]))
    elif not (lineages1 or lineages2):
        lineages = False
    else:
        raise ValueError, ('Taxonomic information must be provided either'
                           ' for all or none of the OTU tables')

    # Get the union of the otu IDs
    otu_ids_result = list(otu_ids1)
    otu_ids_lookup = {}.fromkeys(otu_ids1)
    otu_ids_result.extend([otu_id for otu_id in otu_ids2 \
                                  if otu_id not in otu_ids_lookup])
    otu_ids_result_lookup = dict([(oid, i)
                                  for i, oid in enumerate(otu_ids_result)])

    otu_table = zeros(shape=(len(otu_ids_result), len(sample_ids_result)),
                      dtype=int)
    for i, sample_id in enumerate(sample_ids1):
        #col_index = sample_ids_result.index(sample_id)
        col_index = sample_ids_result_lookup[sample_id]
        for j, otu_id in enumerate(otu_ids1):
            #row_index = otu_ids_result.index(otu_id)
            row_index = otu_ids_result_lookup[otu_id]
            otu_table[row_index, col_index] = otu_table1[j, i]

    for i, sample_id in enumerate(sample_ids2):
        #col_index = sample_ids_result.index(sample_id)
        col_index = sample_ids_result_lookup[sample_id]
        for j, otu_id in enumerate(otu_ids2):
            #row_index = otu_ids_result.index(otu_id)
            row_index = otu_ids_result_lookup[otu_id]
            otu_table[row_index, col_index] = otu_table2[j, i]

    if lineages:
        lineages_result = [
            otu_id_to_lineage[otu_id] for otu_id in otu_ids_result
        ]
    else:
        lineages_result = None

    return sample_ids_result, otu_ids_result, otu_table, lineages_result
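A usage sketch (paths are placeholders; format_otu_table is assumed available for writing the result, matching its use elsewhere on this page):

sample_ids, otu_ids, table, lineages = merge_otu_tables(
    open('otu_table_run1.txt', 'U'), open('otu_table_run2.txt', 'U'))
# lineages is None when neither input carried taxonomy
out_str = format_otu_table(sample_ids, otu_ids, table, taxonomy=lineages)
open('otu_table_merged.txt', 'w').write(out_str)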
Example #45
def plot_rank_abundance_graphs(sample_names, otu_table_fh,
                               output_dir, file_type='pdf',
                               absolute_counts=False,
                               x_linear_scale=False,
                               y_linear_scale=False,
                               no_legend=False,
                               log_fh=None):

    """plot rank-abundance curves for sample specified in sample_name.
    
    sample_names: comma separated string of sample names
    otu_table_fh: open file handle to otu table
    output_dir: existing directory to which files are written
    file_type: valid matplotlib file type
    x_linear_scale: if True draw x axis in linear scale, otherwise use log
    y_linear_scale: if True draw y axis in linear scale, otherwise use log
    no_legend: if True don't draw legend
    log_fh: open file handle to log file; if not None, warnings are logged there
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_fh)

    #figure out which samples to draw
    if sample_names=='*':
        user_sample_names = sample_ids
    else:
        user_sample_names = sample_names.split(',')
        if len(user_sample_names)<1:
            raise ValueError, "sample IDs must be comma separated list of "\
            +"sample names - found %s" % sample_names 

    # do the actual drawing
    ax=None
    for sample_name,color in zip(user_sample_names, cycle(data_color_order)):
        try:
            index = sample_ids.index(sample_name)
        except ValueError:
            if log_fh:
                log_fh.write("Warning: Sample name %s not in OTU table - skipping." % sample_name)
            continue     
        ax = plot_rank_abundance_graph(otu_table[:,index], color=color,
                                       absolute=absolute_counts,
                                       label=sample_name)
        ax.set_label(sample_name)
    
    if ax==None:
        #ax should be defined if at least one series has been drawn
        raise ValueError("No data series drawn. Check your OTU table and sample names")

    #settings for all series
    ax.grid()      
    ax.set_xlabel('Species rank')
    ax.set_ylabel('Relative abundance')

    if not x_linear_scale:
        ax.set_xscale('log')
    if not y_linear_scale:
        ax.set_yscale('log')
  
    if not no_legend:
        legend()

    # build output fp; if fewer than MAX_SAMPLES_TO_SHOW_IN_FILENAME samples
    # were requested, append the sample names
    output_fp = output_dir + "/rank_abundance"
    MAX_SAMPLES_TO_SHOW_IN_FILENAME = 6
    if len(user_sample_names) < MAX_SAMPLES_TO_SHOW_IN_FILENAME:
        output_fp += '_' + '_'.join(user_sample_names)
    output_fp += ".%s" % file_type

    savefig(output_fp, format=file_type)
Example #46
        # if already a parsed 4-tuple, just return it
        if isinstance(otu_source, tuple):
            return otu_source
        if hasattr(otu_source, 'startswith') and otu_source.startswith('#'):
            try:
                return parse_otu_table(StringIO(otu_source))
            except (TypeError, ValueError), e:
                raise OtuMissingError, \
                    "Tried to read OTUs from string starting with # but " \
                    "got: %s" % e
        else:
            try:
                otu_file = open(otu_source, 'U')
            except (TypeError, IOError):
                raise OtuMissingError, \
                    "Couldn't read OTU file at path: %s" % otu_source
            result = parse_otu_table(otu_file)
            otu_file.close()
            return result

    def getTree(self, tree_source):
        """Returns parsed tree from putative tree source"""
        if isinstance(tree_source, PhyloNode):
            tree = tree_source  #accept tree object directly for tests
        elif tree_source:
            try:
                f = open(tree_source, 'U')
            except (TypeError, IOError):
                raise TreeMissingError, \
                    "Couldn't read tree file at path: %s" % tree_source
            tree = parse_newick(f, PhyloNode)
            f.close()
        else:
            # assumed completion of the truncated snippet: anything else
            # means no usable tree source was given
            raise TreeMissingError, \
                "No tree was supplied for a phylogenetic metric."
        return tree
Example #47
def single_file_beta(input_path,
                     metrics,
                     tree_path,
                     output_dir,
                     rowids=None,
                     full_tree=False):
    """ does beta diversity calc on a single otu table

    uses name in metrics to name output beta diversity files
    if full_tree is True, assumes the input tree is already trimmed to
    contain only otus present in the otu table and skips getSubTree();
    otherwise the tree is trimmed here
    inputs:
     input_path (str)
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str)
     output_dir (str)
     rowids (comma separated str)
    """
    f = open(input_path, 'U')
    samids, otuids, otumtx, lineages = parse_otu_table(f)
    # otu mtx is otus by samples
    f.close()
    tree = None
    if tree_path:
        f = open(tree_path, 'U')
        tree = parse_newick(f, PhyloNode)
        f.close()
        if not full_tree:
            tree = tree.getSubTree(otuids, ignore_missing=True)

    metrics_list = metrics.split(',')
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + os.path.split(input_path)[1])
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"\
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"\
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx.T, otuids, tree, samids)
            else:
                dissims = metric_f(otumtx.T)

            f = open(outfilepath, 'w')
            f.write(format_distance_matrix(samids, dissims))
            f.close()
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = samids.index(rowid)

                # first test whether the dissim is a function of only the
                # pair; if not, just calc the whole matrix
                if metric_f.__name__ in ('dist_chisq', 'dist_gower',
                                         'dist_hellinger',
                                         'binary_dist_chisq'):
                    row_dissims.append(metric_f(otumtx.T)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element
                        dissims = []
                        for i in range(len(samids)):
                            if is_phylogenetic:
                                dissim = metric_f(
                                    otumtx.T[[rowidx, i], :], otuids, tree,
                                    [samids[rowidx], samids[i]])[0, 1]
                            else:
                                dissim = metric_f(otumtx.T[[rowidx, i], :])[0,
                                                                            1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx.T, otuids, tree, samids,
                                             rowid)
                        row_dissims.append(dissims)

            # rows_outfilepath = os.path.join(output_dir, metric + '_' +\
            #     '_'.join(rowids_list) + '_' + os.path.split(input_path)[1])
            f = open(outfilepath, 'w')
            f.write(format_matrix(row_dissims, rowids_list, samids))
            f.close()
Example #48
def test_wrapper_multiple(test, otu_table_paths, category_mapping, category, threshold, \
                _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations on multiple files.
       Unlike the test_wrapper() method, this method includes all OTUs, even when 
       some have zero counts.
    """
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    category_info, category_values = \
        get_category_info(mapping_data, header, category, threshold)

    # if this is the g_test, disallow otus that are present in all samples
    filter_all_samples = test == "g_test"

    OTU_list, taxonomy_all_OTUs = get_common_OTUs(otu_table_paths, _filter, \
                                  category_info=category_info, \
                                  filter_all_samples=filter_all_samples, \
                                  otu_include=otu_include)

    all_results = {}
    count = 0
    for otu_table_fp in otu_table_paths:
        count += 1
        sys.stdout.flush()
        otu_table = open(otu_table_fp, 'U')

        if test == 'ANOVA' or test == 'correlation':
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        elif not test == 'g_test':
            raise ValueError(
                "An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test."
            )
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)

        if test == 'ANOVA':
            results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info, \
                                         category_values)
        elif test == 'correlation':
            results = run_correlation_OTUs(OTU_list, category_info,
                                           otu_sample_info)
        elif test == 'g_test':
            results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info, \
                        category_values, suppress_warnings=True)
        for OTU in results.keys():
            if OTU not in all_results:
                all_results[OTU] = []
            all_results[OTU].append(results[OTU])

    # aggregate multiple results and create output string
    if test == 'ANOVA':
        all_results = aggregate_multiple_results_ANOVA(all_results)
        output = output_results_ANOVA(all_results, category_values,
                                      taxonomy_all_OTUs)
    elif test == 'correlation':
        all_results = aggregate_multiple_results_correlation(all_results)
        output = output_results_correlation(all_results, taxonomy_all_OTUs)
    elif test == 'g_test':
        all_results = aggregate_multiple_results_G_test(all_results)
        output = output_results_G_test(all_results, taxonomy_all_OTUs)
    return output
Example #49
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verify that at least 3 axes were requested
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verify that the number of segments is within the allowed range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error('number_of_segments should be between 4 and 14.')
        
    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes!=None and len(custom_axes.split(','))>1 and\
        isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.' %
            custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp,'U'))

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct=[],[],[],[]

        # iterate only over non-hidden files (not folders), and in any case
        # ignore the procrustes results file generated by
        # transform_coordinate_matrices.py (suffixed procrustes_results.txt)
        coord_fps = [join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not isdir(join(abspath(input_coords),f))
            and not f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps: # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len([f for f in coord_fps if f.endswith(
            '_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps if f.endswith(
                '_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues,_coords_pct=\
                    parse_coords(open(fp,'U'))

                # pack all the data correspondingly
                coords_headers.append(_coords_headers)
                coords_data.append(_coords_data)
                coords_eigenvalues.append(_coords_eigenvalues)
                coords_pct.append(_coords_pct)
            except ValueError:
                offending_coords_fp.append(fp)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening
        # the list of available sample ids and returning the sample ids that
        # are in one of the sets of sample ids but not in the globally shared
        # ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, []))^set(e))
            for e in coords_headers],[]))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\' '
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot '
                '(procrustes plot) must all share the same sample identifiers '
                'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        sids_intersection=list(set(zip(*mapping_data)[0])&set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference=list(set(_coords_headers)-set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords
        # file; other exceptions should be caught here and the code updated
        except ValueError:
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(set(zip(*mapping_data)[0])&set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers)-set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(open(
                taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'%
                e.message)

        # make sure there are matching sample ids with the otu table
        if not len(list(set(sids_intersection)&set(otu_sample_ids))):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
Example #50
def get_counts_by_cat(lines, num_meta, meta_dict, cat_list, category,
                      num_samples_by_cat, normalize):
    con_by_sample = defaultdict(set)
    node_file_str = []
    edge_file_str = []
    red_nodes = defaultdict(int)
    red_node_file_str = []
    red_edge_file_str = []
    edge_from = []
    to = []
    otu_dc = defaultdict(int)
    degree_counts = defaultdict(int)
    sample_dc = defaultdict(int)
    sample_num_seq = defaultdict(int)
    samples_from_mapping = meta_dict.keys()
    con_list = []
    label_list = []
    norm_otu_table = []
    sample_counts = defaultdict(int)
    cat_otu_table = []
    otus = []
    taxonomy = []
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(lines)

    label_list = sample_ids
    is_con = bool(lineages)
    for idx, line in enumerate(otu_table):
        new_line = []
        label_dict = defaultdict(int)
        data = line
        to_otu = otu_ids[idx]
        otus.append(to_otu)
        con = ''
        if is_con:
            con = '; '.join(lineages[idx])
        counts = data
        taxonomy.append(con)
        if not normalize:
            for i, c in zip(label_list, counts):
                if i in samples_from_mapping:
                    label_dict[meta_dict[i][0][0]] += c
            for i in cat_list:
                new_line.append(str(label_dict[i]))
            cat_otu_table.append(new_line)

        else:
            new_line.extend(counts)
            norm_otu_table.append(new_line)
            for i, c in zip(label_list, counts):
                sample_counts[i] += c
    total = 0
    if normalize:
        for l in norm_otu_table:
            counts = l
            new_line = []
            label_dict = defaultdict(float)
            getcontext().prec = 28
            for i, c in zip(label_list, counts):
                if i in samples_from_mapping:
                    label_dict[meta_dict[i][0][0]] += \
                        float(c) / sample_counts[i]
            for i in cat_list:
                new_line.append(
                    round((label_dict[i] / num_samples_by_cat[(category, i)]) *
                          100, 5))
            cat_otu_table.append(new_line)
    return cat_otu_table, otus, taxonomy