def test_get_otu_table_info(self):
    """get_otu_table_info works"""
    # Table without a "Consensus Lineage" column: taxonomy_info must be {}.
    otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3
0\t0\t2\t0
1\t1\t0\t0
2\t1\t1\t1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table)
    result, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    # result maps OTU id -> {sample id: count-as-string}
    self.assertEqual(result['1'], {'sample1': '1', 'sample3': '0', 'sample2': '0'})
    self.assertEqual(result['0'], {'sample1': '0', 'sample3': '0', 'sample2': '2'})
    self.assertEqual(result['2'], {'sample1': '1', 'sample3': '1', 'sample2': '1'})
    self.assertEqual(num_samples, 3)
    self.assertEqual(taxonomy_info, {})
    #test that it parses otu tables with taxonomy fields appropriately
    otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3\tConsensus Lineage
0\t0\t2\t0\tBacteria; Bacteroidetes; Bacteroidales; Parabacteroidaceae; Unclassified; otu_475
1\t1\t0\t0\tBacteria; Bacteroidetes; Bacteroidales; adhufec77-25; Barnesiella; Barnesiella_viscericola; otu_369
2\t1\t1\t1\tBacteria; Firmicutes; Clostridia; Clostridiales; Faecalibacterium; Unclassified; otu_1121""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table)
    result, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    # counts are unchanged by the presence of the taxonomy column
    self.assertEqual(result['1'], {'sample1': '1', 'sample3': '0', 'sample2': '0'})
    self.assertEqual(result['0'], {'sample1': '0', 'sample3': '0', 'sample2': '2'})
    self.assertEqual(result['2'], {'sample1': '1', 'sample3': '1', 'sample2': '1'})
    self.assertEqual(num_samples, 3)
    # taxonomy_info maps OTU id -> full lineage string
    self.assertEqual(taxonomy_info, {'1': 'Bacteria; Bacteroidetes; Bacteroidales; adhufec77-25; Barnesiella; Barnesiella_viscericola; otu_369', '0': 'Bacteria; Bacteroidetes; Bacteroidales; Parabacteroidaceae; Unclassified; otu_475', '2': 'Bacteria; Firmicutes; Clostridia; Clostridiales; Faecalibacterium; Unclassified; otu_1121'})
def test_sort_otu_table_error(self):
    """ sort_otu_table handles errors """
    # duplicate sample id in the sort order -> ValueError
    self.assertRaises(ValueError, sort_otu_table,
                      parse_otu_table(self.otu_table1),
                      ['NA', 'Key', 'Fing', 'Key'])
    # sort order missing a sample present in the table -> KeyError
    self.assertRaises(KeyError, sort_otu_table,
                      parse_otu_table(self.otu_table1),
                      ['NA', 'Key'])
def test_get_single_paired_T_values(self):
    """get_single_paired_T_values works"""
    # mapping: three individuals (A, B, C), each with a timepoint-zero
    # sample (timepoint_zero == 1) and a later sample (== 0)
    cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
    # 999999999.0 is the "ignore" sentinel value for missing observations
    otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
    mapping_data, header, comments = parse_mapping_file(cat_mapping)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    OTU_list = ['0', '1', '2']
    # OTU 0: individual A is entirely ignored (sentinel in both timepoints)
    before_vals, after_vals = get_single_paired_T_values('0', \
        mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
        sample_ids, otu_data, 999999999.0)
    self.assertFloatEqual(before_vals, [0.0, 0.0])
    self.assertFloatEqual(after_vals, [0.3, 0.2])
    #test of OTU1
    before_vals, after_vals = get_single_paired_T_values('1', \
        mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
        sample_ids, otu_data, 999999999.0)
    self.assertFloatEqual(before_vals, [0.0])
    self.assertFloatEqual(after_vals, [-0.2])
    #works when a sample is missing from the OTU table
    #e.g. if an after timepoint dropped out during rarefaction
    #will also drop the before
    otu_table2 = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table2, float)
    before_vals, after_vals = get_single_paired_T_values('0', \
        mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
        sample_ids, otu_data, 999999999.0)
    self.assertEqual(before_vals, [0.0])
    self.assertFloatEqual(after_vals, [0.3])
    #works when the before is missing
    otu_table3 = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table3, float)
    before_vals, after_vals = get_single_paired_T_values('0', \
        mapping_data, header, 'individual', 'timepoint_zero', otu_ids,\
        sample_ids, otu_data, 999999999.0)
    self.assertEqual(before_vals, [0.0])
    self.assertFloatEqual(after_vals, [0.3])
def test_wrapper(test, otu_table, category_mapping, category, threshold, \
                 _filter, otu_include=None):
    """Run a statistical test to look for category/OTU associations.

    test: one of 'ANOVA', 'correlation', or 'g_test'
    otu_table: OTU table lines
    category_mapping: category mapping file lines
    category: mapping-file column to test against
    threshold: threshold for converting a continuous category to binary
    _filter: minimum number of samples an OTU must appear in
    otu_include: optional lines restricting which OTU ids are tested

    Returns the formatted result lines produced by the chosen test's
    output_results_* function.
    Raises ValueError for an unknown test name or when no OTUs survive
    filtering.
    """
    # Validate the test name up front and record the filter_OTUs mode;
    # ANOVA/correlation operate on relative abundances, g_test on counts.
    if test in ('ANOVA', 'correlation'):
        otu_table = convert_OTU_table_relative_abundance(otu_table)
        all_samples = False
    elif test == 'g_test':
        all_samples = True
    else:
        raise ValueError(
            "An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test.")
    # Parsing and category extraction are identical for every test, so
    # they are done once rather than duplicated per branch.
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table, float)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    category_info, category_values = \
        get_category_info(mapping_data, header, category, threshold)
    OTU_list = filter_OTUs(otu_sample_info, _filter, all_samples=all_samples,
                           category_mapping_info=category_info)
    #filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError(
            "No OTUs remain after applying the filter. Try lowering the filter value (-f option)")
    # Dispatch to the requested test and format its results.
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info,
                                 category_values)
        output = output_results_ANOVA(results, category_values, taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info, otu_sample_info)
        output = output_results_correlation(results, taxonomy_info)
    else:  # g_test (already validated above)
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info,
                                  category_values)
        output = output_results_G_test(results, taxonomy_info)
    return output
def test_sort_otu_table(self):
    """ sort_otu_table fns as expected """
    observed = sort_otu_table(parse_otu_table(self.otu_table1),
                              ['NA', 'Key', 'Fing'])
    expected = parse_otu_table(self.age_sorted_otu_table1)
    # compare each component of the parsed-table tuple in turn:
    # sample ids, otu ids, otu data matrix, taxa
    for obs_part, exp_part in zip(observed, expected):
        self.assertEqual(obs_part, exp_part)
def test_sort_otu_table_by_mapping_field_some_values_differ(self):
    """ sort_otu_table fns when some values differ"""
    observed = sort_otu_table_by_mapping_field(
        parse_otu_table(self.otu_table1),
        parse_mapping_file(self.mapping_f2),
        sort_field="Nothing")
    expected = parse_otu_table(self.nothing_sorted_otu_table1)
    # compare sample ids, otu ids, otu data, and taxa element-wise
    for obs_part, exp_part in zip(observed, expected):
        self.assertEqual(obs_part, exp_part)
def test_sort_otu_table_by_mapping_field_some_values_same(self):
    """ sort_otu_table_by_mapping_field fns when all values are the same"""
    observed = sort_otu_table_by_mapping_field(
        parse_otu_table(self.otu_table1),
        parse_mapping_file(self.mapping_f2),
        sort_field="Name")
    expected = parse_otu_table(self.name_sorted_otu_table1)
    # compare sample ids, otu ids, otu data, and taxa element-wise
    for obs_part, exp_part in zip(observed, expected):
        self.assertEqual(obs_part, exp_part)
def pool_otu_table(otu_infile, otu_outfile, pooled_sample_name,
                   sample_ids_to_pool):
    """pools otu table file according to specified criteria.

    Columns named in sample_ids_to_pool are summed into a single new
    column (pooled_sample_name, appended last); all other columns are
    retained in their original order. The result is written to
    otu_outfile.
    """
    sample_ids, otu_ids, otu_mtx, taxonomy = parse_otu_table(otu_infile)
    # partition column indices into pooled vs. retained samples
    pool_idxs = []
    keep_idxs = []
    for idx, sample_id in enumerate(sample_ids):
        if sample_id in sample_ids_to_pool:
            pool_idxs.append(idx)
        else:
            keep_idxs.append(idx)
    # retained ids (e.g. from valid_states string on mapfile) keep their
    # order; the pooled sample id goes last
    new_sample_ids = [sample_ids[idx] for idx in keep_idxs]
    new_sample_ids.append(pooled_sample_name)
    # otu mtx: sum the pooled columns into one, copy the rest
    pooled_abundance = otu_mtx[:, pool_idxs].sum(1)
    new_otu_mtx = numpy.zeros((len(otu_mtx), len(new_sample_ids)),
                              dtype=otu_mtx.dtype)
    new_otu_mtx[:, :-1] = otu_mtx[:, keep_idxs]
    new_otu_mtx[:, -1] = pooled_abundance
    otu_outfile.write(format_otu_table(new_sample_ids, otu_ids, new_otu_mtx,
                                       taxonomy=taxonomy))
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    data = {}
    # mapping file supplies the metadata headers used in the prefs string
    mapping, headers, comments = get_map(opts, data)
    mapping_headers_to_use = opts.mapping_headers_to_use
    background_color = opts.background_color
    monte_carlo_dist = opts.monte_carlo_dist
    ball_scale = opts.ball_scale
    arrow_line_color = opts.arrow_line_color
    arrow_head_color = opts.arrow_head_color
    taxonomy_count_file = opts.input_taxa_file
    if taxonomy_count_file:
        # optional summarized-taxa file contributes the otu/taxon ids
        try:
            counts_f = open(taxonomy_count_file, 'U').readlines()
            sample_ids, otu_ids, otu_table, lineages = \
                parse_otu_table(counts_f, count_map_f=float)
        except (TypeError, IOError):
            raise ValueError, 'Summarized taxa file could not be parsed.'
    else:
        otu_ids = None
    out = build_prefs_string(mapping_headers_to_use, background_color, \
                             monte_carlo_dist, headers, otu_ids, \
                             ball_scale, arrow_line_color, arrow_head_color)
    f = open(opts.output_fp, 'w')
    f.write(out)
    f.close()
def filter_samples_from_otu_table(otu_table_lines, samples_to_discard,
                                  negate=False):
    """ Remove specified samples from OTU table """
    sample_ids, otu_ids, otu_data, taxa = parse_otu_table(otu_table_lines)
    # only the first whitespace-delimited field of each line is a sample id
    discard_lookup = {}.fromkeys(
        [line.split()[0] for line in samples_to_discard])
    # negate=True inverts the filter: keep only the listed samples
    if negate:
        keep_sample = lambda sid: sid in discard_lookup
    else:
        keep_sample = lambda sid: sid not in discard_lookup
    kept_columns = []
    kept_sample_ids = []
    # walk the matrix column-by-column (samples are columns)
    for column, sample_id in zip(otu_data.transpose(), sample_ids):
        if keep_sample(sample_id):
            kept_columns.append(column)
            kept_sample_ids.append(sample_id)
    filtered_data = array(kept_columns).transpose()
    return format_otu_table(kept_sample_ids, otu_ids, filtered_data, taxa,
                            skip_empty=True).split('\n')
def test_run_single_paired_T_test(self):
    """run_single_paired_T_test works """
    cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
    # 999999999.0 is the sentinel "ignore" value for missing observations
    otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
    mapping_data, header, comments = parse_mapping_file(cat_mapping)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    OTU_list = ['0', '1', '2']
    #should return the results since there should be 4 values to evaluate
    result = run_single_paired_T_test('0', mapping_data, header, \
        'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
        999999999.0, 4)
    self.assertEqual(len(result), 4)
    self.assertFloatEqual(result[1], 0.12566591637800242)
    self.assertFloatEqual(result[2], [0.29999999999999999, 0.20000000000000001])
    self.assertEqual(result[3], 2)
    #check that the filter works: requiring 5 values yields no result
    result = run_single_paired_T_test('0', mapping_data, header, \
        'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
        999999999.0, 5)
    self.assertEqual(result, None)
def test_output_results_paired_T_test(self):
    """output_results_paired_T_test works """
    cat_mapping = """#SampleID\ttimepoint_zero\tindividual
s1\t1\tA
s2\t0\tA
s3\t1\tB
s4\t0\tB
s5\t1\tC
s6\t0\tC""".split('\n')
    otu_table = """#Full OTU Counts
#OTU ID\ts1\ts2\ts3\ts4\ts5\ts6
0\t999999999.0\t999999999.0\t0.0\t0.3\t0.0\t0.2
1\t0.0\t-0.2\t999999999.0\t999999999.0\t999999999.0\t999999999.0
2\t0.0\t0.2\t0.0\t-0.7\t0.0\t0.1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = parse_otu_table(otu_table, float)
    mapping_data, header, comments = parse_mapping_file(cat_mapping)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    OTU_list = ['0', '1', '2']
    all_results = run_paired_T_test_OTUs(OTU_list, mapping_data, header, \
        'individual', 'timepoint_zero', otu_ids, sample_ids, otu_data, \
        999999999.0, 4)
    output = output_results_paired_T_test(all_results)
    # debugging aid kept for reference: dump the output to a spreadsheet
    #of = open('/Users/lozupone/temp_output.xls', 'w')
    #of.write('\n'.join(output))
    #of.close()
    # OTU 1 is absent: it had too few valid pairs to be tested
    self.assertEqual(output, ['OTU\tprob\tT stat\taverage_diff\tnum_pairs\tBonferroni_corrected\tFDR_corrected', '0\t0.125665916378\t-5.0\t0.25\t2\t0.251331832756\t0.251331832756', '2\t0.685730319473\t0.468164588785\t-0.133333333333\t3\t1.37146063895\t0.685730319473'])
def calc_shared_phylotypes(infile, reference_sample=None):
    """Calculates number of shared phylotypes for each pair of samples.

    infile: otu table filehandle

    reference_sample: if set, will use this sample name to calculate shared
        OTUs between reference sample, and pair of samples. Useful, e.g. when
        the reference sample is the Donor in a transplant study
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)
    if reference_sample:
        ref_idx = sample_ids.index(reference_sample)
    num_samples = otu_table.shape[1]
    # symmetric sample-by-sample matrix; fill the lower triangle and
    # mirror each value across the diagonal
    result_array = zeros((num_samples, num_samples), dtype=int)
    for i in range(num_samples):
        for j in range(i + 1):
            if reference_sample:
                shared = _calc_shared_phylotypes_multiple(
                    otu_table, [i, j, ref_idx])
            else:
                shared = _calc_shared_phylotypes_pairwise(otu_table, i, j)
            result_array[i, j] = result_array[j, i] = shared
    return format_distance_matrix(sample_ids, result_array) + "\n"
def test_make_new_otu_counts(self):
    """make_new_otu_counts works """
    # two individuals (A, B), three timepoints each; timepoint_zero == 1
    # marks the baseline sample to subtract from the others
    mapping_lines = """#SampleID\tindividual\ttimepoint_zero\ttimepoint
AT0\tA\t1\t0
AT1\tA\t0\t1
AT2\tA\t0\t2
BT0\tB\t1\t0
BT1\tB\t0\t1
BT2\tB\t0\t2
""".split('\n')
    mapping_data, header, comments = parse_mapping_file(mapping_lines)
    samples_from_subject, sample_to_subtract = \
        get_sample_individual_info(mapping_data, header, 'individual', \
        'timepoint_zero')
    # sample S1 is not in the mapping file and should be dropped entirely
    otu_lines = """# QIIME v1.2.0-dev OTU table
#OTU ID\tAT0\tAT1\tS1\tAT2\tBT0\tBT1\tBT2
0\t0.5\t0.3\t99\t0.2\t0.0\t0.0\t0.0
1\t0.0\t0.0\t99\t0.0\t0.4\t0.5\t0.6
2\t0.1\t0.4\t99\t0.7\t0.5\t0.6\t0.8
3\t0.0\t0.1\t99\t0.0\t0.4\t0.0\t0.0
""".split('\n')
    otu_table = parse_otu_table(otu_lines, float)
    sample_ids, otu_ids, otu_counts, consensus = otu_table
    converted_otu_table = make_new_otu_counts(otu_ids, sample_ids, otu_counts,
                                              consensus, sample_to_subtract,
                                              samples_from_subject)
    converted_otu_table = converted_otu_table.split('\n')
    self.assertEqual(converted_otu_table[1],
                     "#OTU ID\tAT0\tAT1\tAT2\tBT0\tBT1\tBT2")
    # 999999999.0 marks subjects where the OTU was absent at baseline
    self.assertEqual(converted_otu_table[2],
                     "0\t0.0\t-0.2\t-0.3\t999999999.0\t999999999.0\t999999999.0")
    self.assertEqual(converted_otu_table[3],
                     "1\t999999999.0\t999999999.0\t999999999.0\t0.0\t0.1\t0.2")
    self.assertEqual(converted_otu_table[4],
                     "2\t0.0\t0.3\t0.6\t0.0\t0.1\t0.3")
    self.assertEqual(converted_otu_table[5],
                     "3\t0.0\t0.1\t0.0\t0.0\t-0.4\t-0.4")
def test_sum_counts_by_consensus(self):
    """should sum otu counts by consensus"""
    otu_table = parse_otu_table(self.otu_table)
    # collapse lineages at depth 3
    obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 3)
    exp_result = {('Root','Bacteria','Actinobacteria'):array([1,0,2,4]),
                  ('Root','Bacteria','Firmicutes'):array([1,3,1,1]),
                  ('Root','Bacteria','Other'):array([1,2,1,0])}
    exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
    self.assertEqual(obs_result, exp_result)
    self.assertEqual(obs_mapping, exp_mapping)
    # collapsing at depth 2 merges everything under ('Root','Bacteria')
    obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 2)
    exp_result = {('Root','Bacteria'):array([3,5,4,5])}
    exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
    self.assertEqual(obs_result, exp_result)
    self.assertEqual(obs_mapping, exp_mapping)
    # depth 4: shorter lineages are padded with 'Other'
    obs_result, obs_mapping = sum_counts_by_consensus(otu_table, 4)
    exp_result = {('Root','Bacteria','Actinobacteria','Actinobacteria'):\
                  array([1,0,2,4]),
                  ('Root','Bacteria','Firmicutes','"Clostridia"'):\
                  array([1,3,1,1]),
                  ('Root','Bacteria','Other','Other'):array([1,2,1,0])}
    exp_mapping = {'s1':0, 's2':1, 's3':2, 's4':3}
    self.assertEqual(obs_result, exp_result)
    self.assertEqual(obs_mapping, exp_mapping)
def test_sort_otu_table_by_mapping_field_error(self):
    """ sort_otu_table_by_mapping_field fails on samples in otu table but not mapping"""
    # the bad-sampleID table contains a sample absent from mapping_f2,
    # so the lookup of its sort-field value must raise KeyError
    self.assertRaises(KeyError, sort_otu_table_by_mapping_field,
                      parse_otu_table(self.otu_table1_bad_sampleID),
                      parse_mapping_file(self.mapping_f2),
                      sort_field="Age")
def R_format_otu_table(otu_filepath, output_dir=None, write_to_tmp_file=True):
    """Formats OTU table for R (remove comments & column 1 header)

    If write_to_tmp_file, writes formatted file to tmp file and returns path
    else, returns lines to go in file
    """
    sample_ids, otu_ids, otu_matrix, lineages = \
        parse_otu_table(open(otu_filepath, 'U').readlines())
    # header row: tab-joined sample ids, no label for the first column
    # (how R likes it)
    lines = ['\t'.join(sample_ids)]
    for idx, otu_id in enumerate(otu_ids):
        # note: casting array as a string and calling "split" is much faster
        # than mapping "str" onto the array
        row_strings = str(otu_matrix[idx, :])[1:-1].split()
        lines.append(otu_id + '\t' + '\t'.join(row_strings))
    if not write_to_tmp_file:
        return lines
    if output_dir is None:
        tmp_fp = get_tmp_filename(prefix='otus_R_format', suffix='.txt')
    else:
        tmp_fp = join(output_dir, 'otus_R_format.txt')
    out_f = open(tmp_fp, 'w')
    out_f.write('\n'.join(lines))
    out_f.close()
    return tmp_fp
def filter_otus_from_otu_table(otu_table_lines, otus_to_discard, negate=False):
    """ Remove specified OTUs from otu_table """
    sample_ids, otu_ids, otu_data, taxa = parse_otu_table(otu_table_lines)
    # only the first whitespace-delimited field of each line is an OTU id
    discard_lookup = {}.fromkeys(
        [line.split()[0] for line in otus_to_discard])
    # negate=True inverts the filter: keep only the listed OTUs
    if negate:
        keep_otu = lambda oid: oid in discard_lookup
    else:
        keep_otu = lambda oid: oid not in discard_lookup
    kept_rows = []
    kept_otu_ids = []
    kept_taxa = []
    # OTUs are rows; filter rows, ids, and taxa strings in lockstep
    for row, otu_id, taxonomy in zip(otu_data, otu_ids, taxa):
        if keep_otu(otu_id):
            kept_rows.append(row)
            kept_otu_ids.append(otu_id)
            kept_taxa.append(taxonomy)
    return format_otu_table(sample_ids, kept_otu_ids, array(kept_rows),
                            kept_taxa).split('\n')
def test_filter_OTUs(self):
    """filter_OTUs works"""
    otu_table = """#Full OTU Counts
#OTU ID\tsample1\tsample2\tsample3
0\t0\t2\t0
1\t1\t0\t0
2\t1\t1\t1""".split('\n')
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table, float)
    OTU_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    # default (all_samples=True) excludes OTUs present in every sample
    result = filter_OTUs(OTU_sample_info, 2)
    self.assertEqual(result, [])
    result = filter_OTUs(OTU_sample_info, 1)
    self.assertEqual(result, ['1', '0'])
    # all_samples=False keeps OTUs found in all samples (e.g. OTU '2')
    result = filter_OTUs(OTU_sample_info, 2, False)
    self.assertEqual(result, ['2'])
    result = filter_OTUs(OTU_sample_info, 1, False)
    self.assertEqual(result, ['1', '0', '2'])
    #test that it works if a category mapping file is supplied
    cat_mapping = {'sample2': '0', 'sample3': '1'}
    result = filter_OTUs(OTU_sample_info, 1,\
                         category_mapping_info=cat_mapping)
    self.assertEqual(result, ['0'])
def _filter_table_neg_control(otu_table_lines, samples):
    """removes OTUs from OTU_table that are found in one of the samples in the
    sample list

    otu_table_lines: OTU table lines
    samples: sample ids treated as negative controls; any OTU observed in
        any of them is discarded entirely
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)
    new_otu_table = []
    new_otu_ids = []
    new_lineages = []
    #get the sample indices to remove
    sample_indices = []
    for i in samples:
        # silently skip requested samples absent from the table
        if i in sample_ids:
            index = sample_ids.index(i)
            sample_indices.append(index)
    for i, row in enumerate(otu_table):
        #figure out if the OTU is in any of the negative controls
        count = 0
        for j in sample_indices:
            count += row[j]
        #only write it to the new OTU table if it is not
        if count == 0:
            if lineages:
                new_lineages.append(lineages[i])
            new_otu_table.append(list(row))
            new_otu_ids.append(otu_ids[i])
    new_otu_table = array(new_otu_table)
    result = format_otu_table(sample_ids, new_otu_ids, new_otu_table,
                              new_lineages)
    result = result.split('\n')
    #remove the samples: negative-control columns are now all zeros, so
    #requiring >= 1 sequence per sample drops them
    return _filter_table_samples(result, 1)
def main():
    """Sort an OTU table either by a mapping-file field or an explicit
    sample-id order, then write the formatted result.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)
    otu_table_data = parse_otu_table(open(opts.input_otu_table, 'U'))
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp
    if sort_field and mapping_fp:
        # sort samples by the value of a mapping-file column
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data,
                                                 mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        # sort samples into an explicitly provided order
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        # bug fix: previously called `parser.error(...)` on an undefined
        # name, which raised NameError instead of printing usage
        option_parser.error("must provide either --sort_field and --mapping_fp OR --sorted_sample_ids_fp")
    # format and write the otu table
    result_str = format_otu_table(result[0], result[1], result[2], result[3])
    of = open(opts.output_fp, 'w')
    of.write(result_str)
    of.close()
def test_wrapper(test, otu_table, category_mapping, category, threshold, \
                 _filter, otu_include=None, ignore_val=None, \
                 otu_table_relative_abundance=False, individual_column='individual',\
                 timepoint_zero_column='timepoint_zero'):
    """Run a statistical test to look for category/OTU associations.

    test: 'ANOVA', 'correlation', 'g_test', or 'paired_T'
    otu_table: OTU table lines
    category_mapping: category mapping file lines
    category: mapping-file column to test against
    threshold: threshold for converting a continuous category to binary
    _filter: minimum number of samples an OTU must appear in
    otu_include: optional lines restricting which OTU ids are tested
    ignore_val: sentinel value to skip (longitudinal tables)
    otu_table_relative_abundance: True if the table is already relative
    individual_column/timepoint_zero_column: mapping columns for paired_T

    Returns the formatted result lines; raises ValueError for an unknown
    test name or when no OTUs remain after filtering.
    """
    # option parsing can hand through the literal string 'None'
    if ignore_val == 'None':
        ignore_val = None
    # validate the test name up front; ANOVA/correlation/paired_T operate
    # on relative abundances, g_test on counts
    if test in ('ANOVA', 'correlation', 'paired_T'):
        if not otu_table_relative_abundance:
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        all_samples = False
    elif test == 'g_test':
        all_samples = True
    else:
        raise ValueError("An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, g_test, paired_T.")
    sample_ids, otu_ids, otu_data, lineages = \
        parse_otu_table(otu_table, float)
    otu_sample_info, num_samples, taxonomy_info = \
        get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    if test != 'paired_T':
        category_info, category_values = \
            get_category_info(mapping_data, header, category, threshold)
    #do not apply the filter_OTUs to the longitudinal studies as they are
    #filtered later
    if test in ('ANOVA', 'correlation', 'g_test'):
        OTU_list = filter_OTUs(otu_sample_info, _filter,
                               all_samples=all_samples,
                               category_mapping_info=category_info)
    else:  # longitudinal tests (paired_T): start from every OTU
        OTU_list = otu_sample_info.keys()
    #filter OTU_list with the otu_include list
    if otu_include:
        otu_include = [line.strip() for line in otu_include]
        OTU_list = [OTU for OTU in OTU_list if OTU in otu_include]
    if len(OTU_list) == 0:
        raise ValueError("No OTUs remain after applying the filter. Try lowering the filter value (-f option)")
    if test == 'ANOVA':
        results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info,
                                 category_values)
        output = output_results_ANOVA(results, category_values, taxonomy_info)
    elif test == 'correlation':
        results = run_correlation_OTUs(OTU_list, category_info,
                                       otu_sample_info, ignore_val=ignore_val,
                                       filter=_filter)
        output = output_results_correlation(results, taxonomy_info)
    elif test == 'g_test':
        results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info,
                                  category_values)
        output = output_results_G_test(results, taxonomy_info)
    elif test == 'paired_T':
        #category info in this case should be the timepoint_zero column.
        #The timepoint_zero column should be used as the category in the wrapper
        results = run_paired_T_test_OTUs(OTU_list, mapping_data, header,
                                         individual_column,
                                         timepoint_zero_column, otu_ids,
                                         sample_ids, otu_data,
                                         ignore_val=ignore_val, filter=_filter)
        output = output_results_paired_T_test(results, taxonomy_info)
    return output
def compute_seqs_per_library_stats(otu_f):
    """Summarize per-sample sequence totals for an OTU table.

    Returns (min, max, median, mean, {sample_id: count}).
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_f)
    # each sample is a column; its total is the column sum
    counts = [sum(otu_table[:, col]) for col in range(otu_table.shape[1])]
    return (min(counts), max(counts), median(counts), mean(counts),
            dict(zip(sample_ids, counts)))
def test_wrapper_multiple(test, otu_table_paths, category_mapping, category,
                          threshold, _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations on multiple files.

    Unlike the test_wrapper() method, this method includes all OTUs, even when
    some have zero counts.

    test: 'ANOVA', 'correlation', or 'g_test'
    otu_table_paths: filepaths of the OTU tables to aggregate over
    Remaining parameters are as in test_wrapper().
    Returns the formatted, aggregated result lines; raises ValueError for
    an unknown test name.
    """
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    category_info, category_values = \
        get_category_info(mapping_data, header, category, threshold)
    # if this is the g_test, disallow otus that are present in all samples
    filter_all_samples = test == "g_test"
    OTU_list, taxonomy_all_OTUs = get_common_OTUs(otu_table_paths, _filter, \
        category_info=category_info, \
        filter_all_samples=filter_all_samples, \
        otu_include=otu_include)
    # per-OTU lists of single-table results, aggregated after the loop
    all_results = {}
    for otu_table_fp in otu_table_paths:
        sys.stdout.flush()
        otu_table = open(otu_table_fp, 'U')
        if test in ('ANOVA', 'correlation'):
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        elif test != 'g_test':
            raise ValueError("An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test.")
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        if test == 'ANOVA':
            results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info,
                                     category_values)
        elif test == 'correlation':
            results = run_correlation_OTUs(OTU_list, category_info,
                                           otu_sample_info)
        else:  # g_test (already validated above)
            results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info,
                                      category_values, suppress_warnings=True)
        for OTU in results.keys():
            # setdefault replaces the deprecated has_key check
            all_results.setdefault(OTU, []).append(results[OTU])
    # aggregate multiple results and create output string
    if test == 'ANOVA':
        all_results = aggregate_multiple_results_ANOVA(all_results)
        output = output_results_ANOVA(all_results, category_values,
                                      taxonomy_all_OTUs)
    elif test == 'correlation':
        all_results = aggregate_multiple_results_correlation(all_results)
        output = output_results_correlation(all_results, taxonomy_all_OTUs)
    elif test == 'g_test':
        all_results = aggregate_multiple_results_G_test(all_results)
        output = output_results_G_test(all_results, taxonomy_all_OTUs)
    return output
def _filter_table_samples(otu_table_lines, min_seqs_per_sample):
    """removes samples from OTU_table that have less than min_seqs_per_sample
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)
    # builtin sum over a 2-D array adds the rows, yielding per-sample
    # (column) sequence totals
    per_sample_totals = sum(otu_table)
    keep_idxs = (per_sample_totals >= int(min_seqs_per_sample)).nonzero()[0]
    filtered_table = otu_table.copy()[:, keep_idxs]
    kept_sample_ids = [sample_ids[i] for i in keep_idxs]
    return format_otu_table(kept_sample_ids, otu_ids, filtered_table, lineages)
def setUp(self):
    # five-OTU, three-sample fixture shared by the tests below
    header = "\t".join(["#OTU ID", "S1", "S2", "S3"])
    rows = [("0", "1", "0", "2"),
            ("1", "1", "2", "0"),
            ("2", "1", "0", "0"),
            ("3", "1", "0", "2"),
            ("4", "1", "1", "2")]
    self.otu_table_as_string = ["#Test otu table", header] + \
        ["\t".join(row) for row in rows]
    _, _, self.otu_table, _ = parse_otu_table(self.otu_table_as_string)
def _filter_table_samples(otu_table_lines, min_seqs_per_sample):
    """removes samples from OTU_table that have less than min_seqs_per_sample
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_lines)
    threshold = int(min_seqs_per_sample)
    # builtin sum of a 2-D array adds rows -> per-sample column totals
    totals = sum(otu_table)
    selected = (totals >= threshold).nonzero()[0]
    res_otu_table = otu_table.copy()[:, selected]
    res_sample_ids = [sample_ids[idx] for idx in selected]
    return format_otu_table(res_sample_ids, otu_ids, res_otu_table, lineages)
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)
    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    output_dir = opts.out_fp
    output_basename = splitext(split(otu_table_fp)[1])[0]
    if not output_dir:
        output_dir = 'make_tep_output/'
    create_dir(output_dir)
    tep_fp = '%s/%s.tep' % (output_dir, output_basename) # opts.out_fp+'.tep'
    jnlp_fp = '%s/%s.jnlp' % (output_dir, output_basename)
    tepfile = open(tep_fp, 'w')
    otu_lines = open(otu_table_fp, 'U').readlines()
    sample_ids, otu_ids, otu_table, metadata = parse_otu_table(otu_lines)
    mapping_lines = open(mapping_fp, 'U')
    tree_lines = open(tree_fp, 'U')
    # Build the .tep file: tree (>>tre), optional otu metadata (>>otm),
    # otu table (>>osm), and sample mapping (>>sam) sections.
    # NOTE(review): `lines += <str>` extends the list character-by-character;
    # writelines() concatenates, so the output is unaffected, but this is
    # why `lines` mixes whole lines and single characters.
    lines = ['>>tre\n']
    lines += tree_lines.readlines()
    lines += '\n'
    if(metadata):
        lines += '>>otm\n#OTU ID\tOTU Metadata\n'
        for i in range(len(otu_ids)):
            lines += otu_ids[i] + '\t'
            for m in metadata[i]:
                lines += m + ';'
            # lines = lines[:len(lines)-1]
            lines += '\n'
    lines += '>>osm\n'
    lines += otu_lines
    lines += '\n>>sam\n'
    lines += mapping_lines.readlines()
    tepfile.writelines(lines)
    # Build the .jnlp launcher for TopiaryExplorer: codebase (local install
    # or web), then the .tep location (URL or absolute local path).
    jnlpfile = open(jnlp_fp, 'w')
    lines = [jnlp_top_block]
    if(opts.web_flag):
        lines += 'http://topiaryexplorer.sourceforge.net/app/'
    else:
        lines += 'file:'+load_qiime_config()['topiaryexplorer_project_dir']
    lines += jnlp_middle_block
    if(opts.url):
        lines += opts.url
    else:
        lines += os.path.abspath(tep_fp)
    # lines += os.path.abspath(tep_fp)
    lines += jnlp_bottom_block
    jnlpfile.writelines(lines)
def setUp(self):
    """Define a few simple tables"""
    # two-OTU, five-sample table without taxonomy
    self.otu_str = """#Full OTU Counts
#OTU ID\ta\tb\tc\td\te
1\t1\t2\t3\t4\t5
2\t5\t4\t3\t2\t1"""
    self.otu_table = parse_otu_table(StringIO(self.otu_str))
    # same table with a Consensus Lineage column
    self.otu_tax_str = """#Full OTU Counts
#OTU ID\ta\tb\tc\td\te\tConsensus Lineage
1\t1\t2\t3\t4\t5\tBacteria:Firmicutes
2\t5\t4\t3\t2\t1\tBacteria:Proteobacteria"""
    self.otu_tax_table = parse_otu_table(StringIO(self.otu_tax_str))
    # matching metadata mapping file
    self.map_str = """#SampleID\tStudy\tBodySite\tDescription
a\tDog\tStool\tx
b\tDog\tStool\ty
c\tHand\tPalm\tz
d\tWholeBody\tPalm\ta
e\tWholeBody\tStool\tb"""
    self.map_data, self.map_headers, self.map_comments =\
        parse_mapping_file(StringIO(self.map_str))
def longitudinal_otu_table_conversion_wrapper(otu_table, category_mapping,
                                              individual_column,
                                              timepoint_zero_column):
    """returns the modified otu_table"""
    # parse and convert raw counts to relative abundances
    parsed_table = convert_otu_table_relative(parse_otu_table(otu_table))
    sample_ids, otu_ids, otu_counts, consensus = parsed_table
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    # map each individual to its samples and each sample to its baseline
    samples_from_subject, sample_to_subtract = \
        get_sample_individual_info(mapping_data, header, individual_column,
                                   timepoint_zero_column)
    return make_new_otu_counts(otu_ids, sample_ids, otu_counts, consensus,
                               sample_to_subtract, samples_from_subject)
def get_otu_counts(fpath, data):
    """Reads the OTU table file into memory"""
    try:
        sample_ids, otu_ids, otu_table, lineages = parse_otu_table(
            open(fpath, 'U'))
    except (TypeError, IOError):
        raise MissingFileError, 'OTU Count file required for this analysis'
    # downstream analysis needs lineages; fail early with guidance
    if lineages == []:
        raise ValueError, '\n\nThe lineages are missing from the OTU table. If you used single_rarefaction.py to create your otu_table, make sure you pass the "--lineages_included" option.\n'
    return sample_ids, otu_ids, otu_table, lineages
def test_get_counts_by_cat(self):
    """get_counts_by_cat should return hand calculated values"""
    # un-normalized (False) case: compare against hand-computed table
    cat_otu_table, otus, taxonomy = get_counts_by_cat(\
        self.otu_sample_file.split('\n'), self.num_cats,\
        self.meta_dict, self.labels_lists_dict["Day"], "Day",
        self.num_samples_by_cat, False)
    cat_otu_table_test = []
    for l in cat_otu_table:
        cat_otu_table_test.append('\t'.join(map(str, l)))
    self.assertEqual('\n'.join(cat_otu_table_test), self.cat_otu_table)
    self.assertEqual(otus, self.otus)
    self.assertEqual(taxonomy, self.taxonomy)
    # normalized (True) case: parse both observed and expected tables and
    # compare the parsed components
    cat_otu_table, otus, taxonomy = get_counts_by_cat(\
        self.otu_sample_file.split('\n'), self.num_cats,\
        self.meta_dict, self.labels_lists_dict["Day"], "Day",
        self.num_samples_by_cat, True)
    cat_otu_table_test = []
    for l in cat_otu_table:
        cat_otu_table_test.append('\t'.join(map(str, l)))
    sams, otunames, obs, lineages = parse_otu_table(cat_otu_table_test, float)
    sams, otunames, exp, lineages = parse_otu_table(
        self.cat_otu_table_norm.split('\n'), float)
    self.assertEqual(otus, self.otus)
    self.assertEqual(taxonomy, self.taxonomy)
def get_otu_counts(fpath, data):
    """Reads the OTU table file at fpath into memory (counts as floats).

    data is accepted for interface compatibility; it is not read here.

    Returns (sample_ids, otu_ids, otu_table, lineages).
    Raises MissingFileError when the file cannot be opened and ValueError
    when the parsed table carries no lineage information.
    """
    try:
        parsed = parse_otu_table(open(fpath, 'U'), count_map_f=float)
    except (TypeError, IOError):
        raise MissingFileError('OTU Count file required for this analysis')
    sample_ids, otu_ids, otu_table, lineages = parsed
    if lineages == []:
        raise ValueError('\n\nThe lineages are missing from the OTU table. If you used single_rarefaction.py to create your otu_table, make sure you pass the "--lineages_included" option.\n')
    return sample_ids, otu_ids, otu_table, lineages
def test_parse_otu_table(self):
    """parse_otu_table functions as expected with new-style OTU table
    """
    observed = parse_otu_table(self.otu_table1.split('\n'))
    expected_counts = array([[19111, 44536, 42],
                             [1216, 3500, 6],
                             [1803, 1184, 2],
                             [1722, 4903, 17],
                             [589, 2074, 34]])
    expected = (['Fing', 'Key', 'NA'],
                ['0', '1', '2', '3', '4'],
                expected_counts,
                self.expected_lineages1)
    self.assertEqual(observed, expected)
def get_job_commands_single_otu_table(python_exe_fp, beta_diversity_fp,
                                      tree_fp, job_prefix, metrics, input_fp,
                                      output_dir, working_dir, jobs_to_start,
                                      command_prefix=None, command_suffix=None):
    """Generate beta diversity commands to split single OTU table to multiple jobs

    always passes -f to beta_diversity.py

    Returns (commands, result_filepaths).
    """
    command_prefix = command_prefix or '/bin/bash; '
    command_suffix = command_suffix or '; exit'

    commands = []
    result_filepaths = []

    # split the table's sample ids into jobs_to_start comma-separated groups
    sids = parse_otu_table(open(input_fp, 'U'))[0]
    sample_id_groups = merge_to_n_commands(sids, jobs_to_start, ',', '', '')

    # loop-invariant: per-metric output filenames depend only on the input
    # filename, so compute them once (previously recomputed each iteration;
    # the unused input_dir and sample_id_desc locals are removed)
    input_fn = split(input_fp)[1]
    output_fns = ['%s_%s' % (metric, input_fn)
                  for metric in metrics.split(',')]

    for i, sample_id_group in enumerate(sample_id_groups):
        working_dir_i = os.path.join(working_dir, str(i))
        output_dir_i = os.path.join(output_dir, str(i))
        rename_command, current_result_filepaths = get_rename_command(
            output_fns, working_dir_i, output_dir_i)
        result_filepaths += current_result_filepaths

        command = '%s %s %s -i %s -o %s -t %s -m %s -f -r %s %s %s' % \
            (command_prefix,
             python_exe_fp,
             beta_diversity_fp,
             input_fp,
             working_dir_i + '/',
             tree_fp,
             metrics,
             sample_id_group,
             rename_command,
             command_suffix)
        commands.append(command)

    return commands, result_filepaths
def test_add_summary_category_mapping(self):
    """make_new_summary_file works
    """
    otu_table = parse_otu_table(self.otu_table, int)
    mapping, header, comments = parse_mapping_file(self.mapping)
    summary, taxon_order = add_summary_mapping(otu_table, mapping, 3)

    expected_order = [('Root', 'Bacteria', 'Actinobacteria'),
                      ('Root', 'Bacteria', 'Firmicutes'),
                      ('Root', 'Bacteria', 'Other')]
    self.assertEqual(taxon_order, expected_order)

    expected_summary = {'s1': [1, 1, 1],
                        's2': [0, 3, 2],
                        's3': [2, 1, 1],
                        's4': [4, 1, 0]}
    self.assertEqual(summary, expected_summary)
def getOtuTable(self, otu_source): """Returns parsed OTU table from putative OTU source.""" #if we have a string starting with #, assume it's an OTU file, #otherwise assume it's a path # if 4-tuple, just return it if type(otu_source) == type((1, 3, 4, 44)): return otu_source if hasattr(otu_source, 'startswith') and otu_source.startswith('#'): try: return parse_otu_table(StringIO(otu_source)) except (TypeError, ValueError), e: raise OtuMissingError, \ "Tried to read OTUs from string starting with # but got "+e
def get_taxa(taxa_fname, sample_ids_kept=None):
    """Opens and returns taxon summaries

    Parameters
    taxa_fname: path to a summarized taxa table
    sample_ids_kept: optional list of sample ids; all other ids are removed

    Returns
    lineages, counts -- for summarized taxa tables the parsed "otu_ids"
    are the lineages and "otu_table" is the counts matrix.
    """
    # future: pass in open file object instead
    taxa_f = open(taxa_fname, 'U')
    try:
        sample_ids, otu_ids, otu_table, lineages = \
            parse_otu_table(taxa_f, count_map_f=float, remove_empty_rows=True)
    finally:
        # bug fix: the file handle was previously never closed
        taxa_f.close()
    if sample_ids_kept:
        # restrict the counts matrix to the requested sample columns,
        # in the order given by sample_ids_kept
        sam_idxs = [sample_ids.index(sam) for sam in sample_ids_kept]
        otu_table = otu_table[:, sam_idxs]
    return otu_ids, otu_table
def get_common_OTUs(otu_table_paths, _filter, category_info, \
                    filter_all_samples, otu_include):
    """Searches all OTU tables in dir, returns common OTUs and their taxonomies

    Applies filter within each OTU table.
    """
    common_otus = set()
    taxonomy_all_OTUs = {}

    # intersect the filtered OTU sets across every table, collecting
    # every taxonomy string seen along the way
    for table_number, otu_table_fp in enumerate(otu_table_paths):
        sys.stdout.flush()
        table_fh = open(otu_table_fp, 'U')
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(table_fh, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)
        table_fh.close()

        filtered = filter_OTUs(otu_sample_info, _filter,
                               all_samples=filter_all_samples,
                               category_mapping_info=category_info)
        if table_number == 0:
            common_otus = set(filtered)
        else:
            common_otus &= set(filtered)
        for otu_id, lineage in taxonomy_info.items():
            taxonomy_all_OTUs[otu_id] = lineage

    # restrict to the user-supplied include list, when one was given
    if otu_include is not None:
        common_otus &= set([line.strip() for line in otu_include])

    if len(common_otus) == 0:
        raise ValueError(
            "No OTUs remain after applying the filter. Try lowering the filter value (-f option)"
        )

    # drop taxonomy entries for OTUs that did not survive filtering
    # (.keys() yields a list in Python 2, so deleting while iterating is safe)
    for otu_id in taxonomy_all_OTUs.keys():
        if otu_id not in common_otus:
            del taxonomy_all_OTUs[otu_id]

    return common_otus, taxonomy_all_OTUs
def test_parse_otu_table_float_counts(self):
    """parse_otu_table should return correct result from small table"""
    # Fixture: tab-delimited table with a trailing "Consensus Lineage" column.
    # NOTE(review): delimiters written as \t escapes -- the original literal
    # whitespace appears mangled; confirm tabs match the committed fixture.
    data = """#Full OTU Counts
#OTU ID\tFing\tKey\tNA\tConsensus Lineage
0\t19111\t44536\t42\tBacteria; Actinobacteria; Actinobacteridae; Propionibacterineae; Propionibacterium
1\t1216\t3500\t6\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Lactobacillales; Lactobacillales; Streptococcaceae; Streptococcus
2\t1803\t1184\t2\tBacteria; Actinobacteria; Actinobacteridae; Gordoniaceae; Corynebacteriaceae
3\t1722\t4903\t17\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Staphylococcaceae
4\t589\t2074\t34\tBacteria; Cyanobacteria; Chloroplasts; vectors"""
    data_f = (data.split('\n'))
    # count_map_f=float should yield a float counts matrix
    obs = parse_otu_table(data_f, count_map_f=float)
    exp = (['Fing','Key','NA'],
           ['0','1','2','3','4'],
           array([[19111.,44536.,42.],[1216.,3500.,6.],[1803.,1184.,2.],\
               [1722.,4903.,17.], [589,2074.,34.]]),
           [['Bacteria','Actinobacteria','Actinobacteridae','Propionibacterineae','Propionibacterium'],
            ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Lactobacillales','Lactobacillales','Streptococcaceae','Streptococcus'],
            ['Bacteria','Actinobacteria','Actinobacteridae','Gordoniaceae','Corynebacteriaceae'],
            ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Staphylococcaceae'],
            ['Bacteria','Cyanobacteria','Chloroplasts','vectors']])
    self.assertEqual(obs, exp)
def test_parse_otu_table_file(self):
    """parse_otu_table should return correct result on fileio format object"""
    # Fixture: same tab-delimited table as the float test, but fed through
    # a StringIO file-like object instead of a list of lines.
    # NOTE(review): delimiters written as \t escapes -- the original literal
    # whitespace appears mangled; confirm tabs match the committed fixture.
    data = """#Full OTU Counts
#OTU ID\tFing\tKey\tNA\tConsensus Lineage
0\t19111\t44536\t42\tBacteria; Actinobacteria; Actinobacteridae; Propionibacterineae; Propionibacterium
1\t1216\t3500\t6\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Lactobacillales; Lactobacillales; Streptococcaceae; Streptococcus
2\t1803\t1184\t2\tBacteria; Actinobacteria; Actinobacteridae; Gordoniaceae; Corynebacteriaceae
3\t1722\t4903\t17\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Staphylococcaceae
4\t589\t2074\t34\tBacteria; Cyanobacteria; Chloroplasts; vectors"""
    data_f = StringIO(data)
    # default count_map_f yields an integer counts matrix
    obs = parse_otu_table(data_f)
    exp = (['Fing','Key','NA'],
           ['0','1','2','3','4'],
           array([[19111,44536,42],[1216,3500,6],[1803,1184,2],\
               [1722,4903,17], [589,2074,34]]),
           [['Bacteria','Actinobacteria','Actinobacteridae','Propionibacterineae','Propionibacterium'],
            ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Lactobacillales','Lactobacillales','Streptococcaceae','Streptococcus'],
            ['Bacteria','Actinobacteria','Actinobacteridae','Gordoniaceae','Corynebacteriaceae'],
            ['Bacteria','Firmicutes','Alicyclobacillaceae','Bacilli','Staphylococcaceae'],
            ['Bacteria','Cyanobacteria','Chloroplasts','vectors']])
    self.assertEqual(obs, exp)
def merge_otu_tables(otu_table_f1, otu_table_f2):
    """ Merge two otu tables with the same sample IDs

    WARNING: The OTU ids must refer to the same OTUs, which typically
    only happens when OTUs were picked against a reference database,
    as with the BLAST OTU picker.

    Returns (sample_ids, otu_ids, otu_table, lineages) for the merged
    table; lineages is None when neither input table carried taxonomy.
    """
    sample_ids1, otu_ids1, otu_table1, lineages1 =\
        parse_otu_table(otu_table_f1)
    sample_ids2, otu_ids2, otu_table2, lineages2 =\
        parse_otu_table(otu_table_f2)

    # sample ids must be disjoint: merged columns are simply concatenated
    assert set(sample_ids1) & set(sample_ids2) == set(),\
        'Overlapping sample ids detected.'
    sample_ids_result = sample_ids1 + sample_ids2
    # map each sample id to its column index in the merged table
    sample_ids_result_lookup = dict([
        (sid, i) for i, sid in enumerate(sample_ids_result)
    ])

    # Will need to add support for OTU tables wo tax info at some
    # point -- in a rush now so don't have time to add it without an
    # immediate use case.
    if lineages1 and lineages2:
        # map OTU ids to lineages -- in case of conflicts (i.e, OTU assigned)
        # different lineage in different otu tables, the lineage from
        # OTU table 1 will be taken
        lineages = True
        otu_id_to_lineage = dict(zip(otu_ids1, lineages1))
        otu_id_to_lineage.update(dict([(otu_id,lineage)\
            for otu_id,lineage in zip(otu_ids2,lineages2)\
            if otu_id not in otu_id_to_lineage]))
    elif not (lineages1 or lineages2):
        lineages = False
    else:
        # one table has taxonomy and the other does not -- unsupported
        raise ValueError, ('Taxonomic information must be provided either'
                           ' for all or none of the OTU tables')

    # Get the union of the otu IDs, preserving table-1 order first
    otu_ids_result = list(otu_ids1)
    otu_ids_lookup = {}.fromkeys(otu_ids1)
    otu_ids_result.extend([otu_id for otu_id in otu_ids2 \
        if otu_id not in otu_ids_lookup])
    # map each otu id to its row index in the merged table
    otu_ids_result_lookup = dict([(oid, i)
                                  for i, oid in enumerate(otu_ids_result)])

    # fill the merged count matrix one input table at a time; rows absent
    # from a table keep the zeros they were initialized with
    otu_table = zeros(shape=(len(otu_ids_result), len(sample_ids_result)),
                      dtype=int)
    for i, sample_id in enumerate(sample_ids1):
        #col_index = sample_ids_result.index(sample_id)
        col_index = sample_ids_result_lookup[sample_id]
        for j, otu_id in enumerate(otu_ids1):
            #row_index = otu_ids_result.index(otu_id)
            row_index = otu_ids_result_lookup[otu_id]
            otu_table[row_index, col_index] = otu_table1[j, i]
    for i, sample_id in enumerate(sample_ids2):
        #col_index = sample_ids_result.index(sample_id)
        col_index = sample_ids_result_lookup[sample_id]
        for j, otu_id in enumerate(otu_ids2):
            #row_index = otu_ids_result.index(otu_id)
            row_index = otu_ids_result_lookup[otu_id]
            otu_table[row_index, col_index] = otu_table2[j, i]

    if lineages:
        # emit lineages in merged row order
        lineages_result = [otu_id_to_lineage[otu_id]
                           for otu_id in otu_ids_result]
    else:
        lineages_result = None

    return sample_ids_result, otu_ids_result, otu_table, lineages_result
def plot_rank_abundance_graphs(sample_names, otu_table_fh, output_dir,
                               file_type='pdf',
                               absolute_counts=False,
                               x_linear_scale=False,
                               y_linear_scale=False,
                               no_legend=False,
                               log_fh=None):
    """plot rank-abundance curves for sample specified in sample_name.

    sample_names: comma separated string of sample names ('*' means all)
    otu_table_fh: open file handle to otu table
    output_dir: existing directory to which files are written
    file_type: valid matplotlib file type
    absolute_counts: plot absolute counts instead of relative abundances
    x_linear_scale: if True draw x axis in linear scale, otherwise use log
    y_linear_scale: if True draw y axis in linear scale, otherwise use log
    no_legend: if True don't draw legend
    log_fh: open file handle to log file, if not None used to log
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(otu_table_fh)

    #figure out which samples to draw
    if sample_names == '*':
        user_sample_names = sample_ids
    else:
        user_sample_names = sample_names.split(',')
        if len(user_sample_names) < 1:
            raise ValueError, "sample IDs must be comma separated list of "\
                + "sample names - found %s" % sample_names

    # do the actual drawing; each sample gets the next color in the cycle
    ax = None
    for sample_name, color in zip(user_sample_names, cycle(data_color_order)):
        try:
            index = sample_ids.index(sample_name)
        except ValueError:
            # unknown sample names are logged (when a log is given) and skipped
            if log_fh:
                log_fh.write("Warning: Sample name %s not in OTU table - skipping." % sample_name)
            continue
        ax = plot_rank_abundance_graph(otu_table[:, index], color=color,
                                       absolute=absolute_counts,
                                       label=sample_name)
        ax.set_label(sample_name)

    if ax == None:
        #ax should be defined if at least one series has been drawn
        raise ValueError("No data series drawn. Check your OTU table and sample names")

    #settings for all series
    ax.grid()
    ax.set_xlabel('Species rank')
    ax.set_ylabel('Relative abundance')

    if not x_linear_scale:
        ax.set_xscale('log')
    if not y_linear_scale:
        ax.set_yscale('log')

    if not no_legend:
        legend()

    #build output fp, if less than MAX_SAMPLES_... append the samples names
    # NOTE(review): no separator is inserted between "rank_abundance" and the
    # first sample name (e.g. "rank_abundances1_s2.pdf") -- confirm this is
    # the intended filename format
    output_fp = output_dir + "/rank_abundance"
    MAX_SAMPLES_TO_SHOW_IN_FILENAME = 6
    if len(user_sample_names) < MAX_SAMPLES_TO_SHOW_IN_FILENAME:
        output_fp += '_'.join(user_sample_names)
    output_fp += ".%s" % file_type
    savefig(output_fp, format=file_type)
# if 4-tuple, just return it if type(otu_source) == type((1, 3, 4, 44)): return otu_source if hasattr(otu_source, 'startswith') and otu_source.startswith('#'): try: return parse_otu_table(StringIO(otu_source)) except (TypeError, ValueError), e: raise OtuMissingError, \ "Tried to read OTUs from string starting with # but got "+e else: try: otu_file = open(otu_source, 'U') except (TypeError, IOError): raise OtuMissingError, \ "Couldn't read OTU file at path: %s" % otu_source result = parse_otu_table(otu_file) otu_file.close() return result def getTree(self, tree_source): """Returns parsed tree from putative tree source""" if isinstance(tree_source, PhyloNode): tree = tree_source #accept tree object directly for tests elif tree_source: try: f = open(tree_source, 'U') except (TypeError, IOError): raise TreeMissingError, \ "Couldn't read tree file at path: %s" % tree_source tree = parse_newick(f, PhyloNode) f.close()
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    uses name in metrics to name output beta diversity files
    assumes input tree is already trimmed to contain only otus present
    in otu table, doesn't call getSubTree()

    inputs:
     input_path (str)
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str)
     output_dir (str)
     rowids (comma separated str) -- when given, only the distances from
      each listed sample to all samples are computed
    """
    f = open(input_path, 'U')
    samids, otuids, otumtx, lineages = parse_otu_table(f)
    # otu mtx is otus by samples
    f.close()
    tree = None
    if tree_path:
        f = open(tree_path, 'U')
        tree = parse_newick(f, PhyloNode)
        f.close()
        if not full_tree:
            # prune the tree down to the otus present in this table
            tree = tree.getSubTree(otuids, ignore_missing=True)

    metrics_list = metrics.split(',')
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + os.path.split(input_path)[1])
        # resolve the metric name: nonphylogenetic first, phylogenetic second
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree == None:
                    stderr.write("metric %s requires a tree, but none found\n"\
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"\
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids == None:
            # standard, full way: all-vs-all distance matrix
            if is_phylogenetic:
                dissims = metric_f(otumtx.T, otuids, tree, samids)
            else:
                dissims = metric_f(otumtx.T)
            f = open(outfilepath, 'w')
            f.write(format_distance_matrix(samids, dissims))
            f.close()
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = samids.index(rowid)

                # first test whether the dissim is a fn of only the pair;
                # if not, just calc the whole matrix and take the row
                if metric_f.__name__ == 'dist_chisq' or \
                   metric_f.__name__ == 'dist_gower' or \
                   metric_f.__name__ == 'dist_hellinger' or \
                   metric_f.__name__ == 'binary_dist_chisq':
                    row_dissims.append(metric_f(otumtx.T)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # no dedicated row metric available:
                        # do element by element
                        dissims = []
                        for i in range(len(samids)):
                            if is_phylogenetic:
                                dissim = metric_f(
                                    otumtx.T[[rowidx, i], :], otuids, tree,
                                    [samids[rowidx], samids[i]])[0, 1]
                            else:
                                dissim = metric_f(otumtx.T[[rowidx, i], :])[0, 1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx.T, otuids, tree,
                                             samids, rowid)
                        row_dissims.append(dissims)

            # rows_outfilepath = os.path.join(output_dir, metric + '_' +\
            #     '_'.join(rowids_list) + '_' + os.path.split(input_path)[1])
            f = open(outfilepath, 'w')
            f.write(format_matrix(row_dissims, rowids_list, samids))
            f.close()
def test_wrapper_multiple(test, otu_table_paths, category_mapping, category,
                          threshold, _filter, otu_include=None):
    """runs statistical test to look for category/OTU associations on multiple files.

    Unlike the test_wrapper() method, this method includes all OTUs, even
    when some have zero counts.

    test: one of 'ANOVA', 'correlation' or 'g_test'
    Returns the formatted output string for the chosen test.
    """
    mapping_data, header, comments = parse_mapping_file(category_mapping)
    category_info, category_values = \
        get_category_info(mapping_data, header, category, threshold)
    # if this is the g_test, disallow otus that are present in all samples
    filter_all_samples = test == "g_test"

    # OTUs shared across every table (after filtering), plus their taxonomies
    OTU_list, taxonomy_all_OTUs = get_common_OTUs(otu_table_paths, _filter,
        category_info=category_info,
        filter_all_samples=filter_all_samples,
        otu_include=otu_include)

    # run the chosen test against each table, accumulating per-OTU results
    all_results = {}
    count = 0
    for otu_table_fp in otu_table_paths:
        count += 1
        sys.stdout.flush()
        otu_table = open(otu_table_fp, 'U')
        # ANOVA and correlation operate on relative abundances; the g_test
        # uses raw counts
        if test == 'ANOVA' or test == 'correlation':
            otu_table = convert_OTU_table_relative_abundance(otu_table)
        elif not test == 'g_test':
            raise ValueError(
                "An invalid test statistic was given. (-s option). Valid values are ANOVA, correlation, and g_test."
            )
        sample_ids, otu_ids, otu_data, lineages = \
            parse_otu_table(otu_table, float)
        otu_sample_info, num_samples, taxonomy_info = \
            get_otu_table_info(sample_ids, otu_ids, otu_data, lineages)

        if test == 'ANOVA':
            results = run_ANOVA_OTUs(OTU_list, category_info, otu_sample_info,
                category_values)
        elif test == 'correlation':
            results = run_correlation_OTUs(OTU_list, category_info,
                otu_sample_info)
        elif test == 'g_test':
            results = run_G_test_OTUs(OTU_list, category_info, otu_sample_info,
                category_values, suppress_warnings=True)
        for OTU in results.keys():
            if not all_results.has_key(OTU):
                all_results[OTU] = []
            all_results[OTU].append(results[OTU])

    # aggregate multiple results and create output string
    if test == 'ANOVA':
        all_results = aggregate_multiple_results_ANOVA(all_results)
        output = output_results_ANOVA(all_results, category_values,
            taxonomy_all_OTUs)
    elif test == 'correlation':
        all_results = aggregate_multiple_results_correlation(all_results)
        output = output_results_correlation(all_results, taxonomy_all_OTUs)
    elif test == 'g_test':
        all_results = aggregate_multiple_results_G_test(all_results)
        output = output_results_G_test(all_results, taxonomy_all_OTUs)
    return output
def main():
    """Command line entry point: validate options and parse the input files.

    NOTE(review): this chunk appears truncated -- the function visible here
    ends after the taxa-table checks; plot generation presumably follows in
    the original file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes != None and len(custom_axes.split(',')) > 1 and \
        isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.'
            % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if isdir(input_coords) == False and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))
        # use this set variable to make presence/absensce checks faster
        lookup_header = set(header)
    except:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files and not folders and if anything
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py suffixed in procrustes_results.txt
        coord_fps = [join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not
            isdir(join(abspath(input_coords), f)) and not
            f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and compare_plots == False:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa == None and len([f for f in coord_fps if f.endswith(
            '_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps if f.endswith(
                '_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues, _coords_pct = \
                    parse_coords(open(fp, 'U'))
                # pack all the data correspondingly
                coords_headers.append(_coords_headers)
                coords_data.append(_coords_data)
                coords_eigenvalues.append(_coords_eigenvalues)
                coords_pct.append(_coords_pct)
            except ValueError:
                offending_coords_fp.append(fp)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globablly shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, [])) ^ set(e))
            for e in coords_headers], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\''
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot ('
                'procustes plot) must share all the same sample identifiers'
                'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
            set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(_coords_headers) -
            set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct = \
                parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exeptions should be catched here; code will be updated then
        except ValueError:
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # number of samples ids that are shared between coords and mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
            set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers) -
            set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(open(
                taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'
                % e.message)

        # make sure there are matching sample ids with the otu table
        if not len(list(set(sids_intersection) & set(otu_sample_ids))):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
def get_counts_by_cat(lines, num_meta, meta_dict, cat_list, category,
                      num_samples_by_cat, normalize):
    """Aggregate OTU counts by metadata category.

    Parameters:
    lines: OTU table lines (fed to parse_otu_table)
    num_meta: number of metadata columns (unused; kept for interface
        compatibility)
    meta_dict: sample id -> metadata; the first category value is read from
        meta_dict[sample][0][0]
    cat_list: ordered category values; defines the output column order
    category: name of the category being summarized
    num_samples_by_cat: dict keyed by (category, value) giving the number of
        samples per category value
    normalize: when True, return percent relative abundances averaged over
        the samples in each category; otherwise return summed raw counts
        (as strings)

    Returns (cat_otu_table, otus, taxonomy).

    Note: a block of dead locals unused by this function (network/degree
    accumulators) was removed; behavior is unchanged.
    """
    samples_from_mapping = meta_dict.keys()
    norm_otu_table = []
    sample_counts = defaultdict(int)
    cat_otu_table = []
    otus = []
    taxonomy = []

    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(lines)
    label_list = sample_ids
    # only attach consensus lineage strings when the table provides them
    is_con = lineages != []

    for idx, counts in enumerate(otu_table):
        new_line = []
        label_dict = defaultdict(int)
        otus.append(otu_ids[idx])
        con = ''
        if is_con:
            con = '; '.join(lineages[idx])
        taxonomy.append(con)
        if not normalize:
            # sum raw counts into each mapped sample's first category value
            for sample_id, c in zip(label_list, counts):
                if sample_id in samples_from_mapping:
                    label_dict[meta_dict[sample_id][0][0]] += c
            for cat in cat_list:
                new_line.append(str(label_dict[cat]))
            cat_otu_table.append(new_line)
        else:
            # keep the raw row and accumulate per-sample totals for the
            # normalization pass below
            new_line.extend(counts)
            norm_otu_table.append(new_line)
            for sample_id, c in zip(label_list, counts):
                sample_counts[sample_id] += c

    if normalize:
        for row in norm_otu_table:
            new_line = []
            label_dict = defaultdict(float)
            # retained from the original: sets the decimal context precision
            getcontext().prec = 28
            for sample_id, c in zip(label_list, row):
                if sample_id in samples_from_mapping:
                    label_dict[meta_dict[sample_id][0][0]] += \
                        float(c) / (sample_counts[sample_id])
            for cat in cat_list:
                # average relative abundance per sample, as a percentage
                new_line.append(
                    round((label_dict[cat] /
                           num_samples_by_cat[(category, cat)]) * 100, 5))
            cat_otu_table.append(new_line)
    return cat_otu_table, otus, taxonomy