def test_map_to_2D_dict(self):
    """Verifies map_to_2D_dict keys every mapping row by its sample id"""
    # Column names, in the order they appear in the header line
    header = ('#SampleID', 'BIRTH_YEAR', 'DEATH_YEAR', 'SEX', 'PROFESSION',
              'HOME_STATE')
    # One tuple per sample; fields follow the header order
    rows = (('00010', '1954', '2006', 'male', 'Mechanic', 'Kansas'),
            ('00100', '1954', '1983', 'female', 'Hunter', 'Kansas'),
            ('00200', 'NA', '2009', 'female', 'Nurse', 'Minnesota'),
            ('00111', '1979', '2007', 'male', 'Hunter', 'Impala'),
            ('00112', '1983', '2006', 'male', 'Hunter', 'Impala'),
            ('00211', '1990', '2009', 'male', 'Student', 'Minnesota'))

    # Stands in for an open, tab-delimited mapping file
    test_map = StringIO(
        '#SampleID\tBIRTH_YEAR\tDEATH_YEAR\tSEX\tPROFESSION\tHOME_STATE\n'
        '00010\t1954\t2006\tmale\tMechanic\tKansas\n'
        '00100\t1954\t1983\tfemale\tHunter\tKansas\n'
        '00200\tNA\t2009\tfemale\tNurse\tMinnesota\n'
        '00111\t1979\t2007\tmale\tHunter\tImpala\n'
        '00112\t1983\t2006\tmale\tHunter\tImpala\n'
        '00211\t1990\t2009\tmale\tStudent\tMinnesota\n')

    # The expected result maps each sample id to a {column: value}
    # dictionary for that sample's row (the id column included)
    known_dict = dict((row[0], dict(zip(header, row))) for row in rows)

    # The parsed mapping must match the expected dictionary exactly
    test_dict = map_to_2D_dict(test_map)
    self.assertEqual(test_dict, known_dict)
else: with biom_open(args.input) as fp: tax_table = parse_biom_table(fp) # Checks the output directory is sane. if not args.output: parser.error('An output directory must be supplied.') elif not exists(args.output): mkdir(args.output) output_dir = args.output if args.mapping and not isfile(args.mapping): parser.error('The supplied mapping file does not exist in the path.') elif args.mapping: mapping = map_to_2D_dict(open(args.mapping, 'U')) else: mapping = args.mapping # Parses the sample IDs as a list if args.samples: samples_to_analyze = [] for sample in args.samples.split(','): samples_to_analyze.append(sample) else: samples_to_analyze = None main(taxa_table=tax_table, output_dir=output_dir, mapping=mapping, samples_to_analyze=samples_to_analyze)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates a stacked taxonomy bar plot for a single sample

    The plot has seven columns: the sample itself, the table average, four
    metadata-matched reference groups (which ones depends on sample_type),
    and Michael Pollan's sample.

    INPUTS:
        otu_table -- the OTU table; handed to summarize_common_categories
                    as its biom_table argument.

        mapping_data -- the mapping data; handed to map_to_2D_dict, whose
                    result is looked up by sample id and then by metadata
                    category name.

        cat_tables -- the per-category tables; handed to
                    parse_category_files as its raw_tables argument.

        output_dir -- the directory where the figure is saved. This
                    function does not create the directory itself.

        sample_type -- one of 'fecal', 'skin' or 'oral'. Selects the
                    Michael Pollan reference sample id, the column labels
                    and the metadata categories that are plotted.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, all sample ids in the table are used. At most
                    one sample id is supported.

        legend -- accepted for interface compatibility; the plot is
                    currently always rendered with legend=False.

        xaxis -- accepted for interface compatibility; the plot is
                    currently always rendered with x_axis=False.

        debug -- when True, column 2 of the summary table is used as the
                    Michael Pollan sample instead of looking the id up.

    OUTPUTS:
        A pdf of stacked taxonomy is written to <output_dir>/figure4.pdf.

    RAISES:
        ValueError -- if sample_type is not supported, if more than one
                    sample id is selected, if Michael Pollan's sample id is
                    not in the table (debug off), or if a sample's metadata
                    value is not one of the groups of its category.
    """
    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])
    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]
    # Entries of `order` that are filled with preset columns rather than
    # looked up in the category tables
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. cat_list holds the column labels and
    # order holds the matching metadata category for each column.
    if sample_type == 'fecal':
        michael_pollan = '10317.000007108'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CAT', 'SEX',
                 'AGE_CAT', 'MP']
    elif sample_type == 'skin':
        michael_pollan = '10317.000007113'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CAT', 'MP']
    elif sample_type == 'oral':
        michael_pollan = '10317.000007109'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CAT', 'MP']
    else:
        raise ValueError('%s is not a supported sample type.'
                         % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary, summarized with the common
    # taxa
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The prefix
    # is removed explicitly: str.strip(' p__') treats its argument as a set
    # of characters and would also eat leading or trailing 'p' and '_'
    # characters that belong to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    if len(sample_ids) > 1:
        # TODO: make the rest of the code reflect this...
        raise ValueError("SCRIPT NO LONGER SUPPORTS MULTIPLE SAMPLES")

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]

        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the
        # sample, the second column is the average and the last column is
        # Michael Pollan
        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index gets its own name so it does not shadow the sample index.
        for column, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue

            # Gets the sample metadata
            mapping_key = meta_data[cat]

            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()

            # Finds the group the sample belongs to. Only the "value not
            # in list" failure is translated; other errors propagate.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, column] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, 'figure4.pdf')

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)
def main(otu_table, mapping_data, categories, output_dir,
         samples_to_plot=None, legend=False, xaxis=True):
    """Creates stacked bar plots for an otu table

    INPUTS:
        otu_table -- the OTU table; handed to summarize_human_taxa.

        mapping_data -- the mapping data; handed to map_to_2D_dict, whose
                    result is looked up by sample id and then by metadata
                    category name.

        categories -- a dictionary keying a mapping category to the
                    corresponding sample IDs and taxonomy for a collapsed
                    biom table. NOTE(review): this argument is currently
                    overwritten by load_category_files before it is read.

        output_dir -- the directory where output files are saved. This
                    function does not create the directory itself.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the table are plotted.

        legend -- accepted but not used by this function.

        xaxis -- accepted but not used by this function.

    OUTPUTS:
        A pdf of stacked taxonomy is generated for each selected sample and
        saved in the output directory. These follow the file name format
        Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if Michael Pollan's sample id is not in the table, or
                    if a sample's metadata value is not one of the groups
                    of its category.
    """
    # Sets constants
    LEVEL = 2
    FILEPREFIX = 'Figure_4_'
    MICHAEL_POLLAN = '000007108.1075657'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'

    # Loads the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    (common_taxa, whole_sample_ids, whole_summary) = \
        summarize_human_taxa(otu_table, LEVEL)

    # Converts final taxa to a clean list of phylum names. The prefix is
    # removed explicitly: str.strip(' p__') treats its argument as a set of
    # characters and would also eat leading or trailing 'p' and '_'
    # characters that belong to the name itself.
    # NOTE(review): the cleaned list is not passed on to plot_american_gut.
    common_phyla = []
    for taxon in common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):]
        common_phyla.append(phylum.strip('[').strip(']'))
    common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    mp_sample_pos = whole_sample_ids.index(MICHAEL_POLLAN)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Loads the category dictionary
    # NOTE(review): category_fp is not defined in this function and this
    # call replaces the `categories` argument; confirm that category_fp is
    # a module-level name, or drop this line and use the argument.
    categories = load_category_files(category_fp, LEVEL)

    # The table average is the same for every sample, so it is computed
    # once instead of once per plotted sample
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample. Intended column order:
    # the sample, the average, one column per category, Michael Pollan.
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        # Preallocates a numpy array for the plotting data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))
        meta_data = map_dict[sample_id]

        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average

        # Fills one column per metadata category, starting at column 2.
        # NOTE(review): the columns follow the iteration order of
        # `categories`; confirm that order is the intended plotting order.
        for column, cat in enumerate(categories, 2):
            # Pulls metadata for the sample and category
            mapping_key = meta_data[cat]

            # Pulls taxonomic summary and group descriptions for the
            # category
            tax_summary = categories[cat]['Taxa Summary']
            group_descriptions = categories[cat]['Groups']

            # Finds the group the sample belongs to. Only the "value not
            # in list" failure is translated; other errors propagate.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, column] = tax_summary[:, mapping_col]

        # The last column always holds Michael Pollan's sample
        tax_array[:, -1] = mp_sample_taxa

        # Plots the data
        filename = pjoin(output_dir, '%s%s.pdf' % (FILEPREFIX, sample_id))
        plot_american_gut(tax_array, filename)
def test_map_to_2D_dict(self):
    """Confirms map_to_2D_dict turns a mapping file into nested dicts"""
    # Field names from the header line of the mapping file
    columns = ('#SampleID', 'BIRTH_YEAR', 'DEATH_YEAR', 'SEX',
               'PROFESSION', 'HOME_STATE')
    # The six sample records, each listed in column order
    records = (
        ('00010', '1954', '2006', 'male', 'Mechanic', 'Kansas'),
        ('00100', '1954', '1983', 'female', 'Hunter', 'Kansas'),
        ('00200', 'NA', '2009', 'female', 'Nurse', 'Minnesota'),
        ('00111', '1979', '2007', 'male', 'Hunter', 'Impala'),
        ('00112', '1983', '2006', 'male', 'Hunter', 'Impala'),
        ('00211', '1990', '2009', 'male', 'Student', 'Minnesota'),
    )

    # Builds the expected output: sample id -> {column name: value}
    known_dict = {}
    for record in records:
        known_dict[record[0]] = dict(zip(columns, record))

    # Creates a file-like object holding the tab-delimited mapping text
    test_map = StringIO(
        '#SampleID\tBIRTH_YEAR\tDEATH_YEAR\tSEX\tPROFESSION\tHOME_STATE\n'
        '00010\t1954\t2006\tmale\tMechanic\tKansas\n'
        '00100\t1954\t1983\tfemale\tHunter\tKansas\n'
        '00200\tNA\t2009\tfemale\tNurse\tMinnesota\n'
        '00111\t1979\t2007\tmale\tHunter\tImpala\n'
        '00112\t1983\t2006\tmale\tHunter\tImpala\n'
        '00211\t1990\t2009\tmale\tStudent\tMinnesota\n')

    # Parses the mapping text and compares it against the expectation
    test_dict = map_to_2D_dict(test_map)
    self.assertEqual(test_dict, known_dict)
elif not isfile(args.input): parser.error("The supplied taxonomy file does not exist in the path.") else: with biom_open(args.input) as fp: tax_table = parse_biom_table(fp) # Checks the output directory is sane. if not args.output: parser.error("An output directory must be supplied.") elif not exists(args.output): mkdir(args.output) output_dir = args.output if args.mapping and not isfile(args.mapping): parser.error("The supplied mapping file does not exist in the path.") elif args.mapping: mapping = map_to_2D_dict(open(args.mapping, "U")) else: mapping = args.mapping # Parses the sample IDs as a list if args.samples: samples_to_analyze = [] for sample in args.samples.split(","): samples_to_analyze.append(sample) else: samples_to_analyze = None main(taxa_table=tax_table, output_dir=output_dir, mapping=mapping, samples_to_analyze=samples_to_analyze)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates stacked taxonomy bar plots for samples in an otu table

    Each plot has seven columns: the sample itself, the table average, four
    metadata-matched reference groups (which ones depends on sample_type),
    and Michael Pollan's sample.

    INPUTS:
        otu_table -- the OTU table; handed to summarize_common_categories
                    as its biom_table argument.

        mapping_data -- the mapping data; handed to map_to_2D_dict, whose
                    result is looked up by sample id and then by metadata
                    category name.

        cat_tables -- the per-category tables; handed to
                    parse_category_files as its raw_tables argument.

        output_dir -- the directory where output files are saved. This
                    function does not create the directory itself.

        sample_type -- one of 'fecal', 'skin' or 'oral'. Selects the
                    Michael Pollan reference sample id, the column labels
                    and the metadata categories that are plotted.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the table are plotted.

        legend -- accepted for interface compatibility; plots are
                    currently always rendered with legend=False.

        xaxis -- accepted for interface compatibility; plots are
                    currently always rendered with x_axis=False.

        debug -- when True, column 2 of the summary table is used as the
                    Michael Pollan sample instead of looking the id up.

    OUTPUTS:
        A pdf of stacked taxonomy is generated for each selected sample and
        saved in the output directory. These follow the file name format
        Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if sample_type is not supported, if Michael Pollan's
                    sample id is not in the table (debug off), or if a
                    sample's metadata value is not one of the groups of its
                    category.
    """
    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'

    # Sets up file name constants
    FILEPREFIX = 'Figure_4_'
    FILE_END = '.pdf'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])
    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]
    # Entries of `order` that are filled with preset columns rather than
    # looked up in the category tables
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. cat_list holds the column labels and
    # order holds the matching metadata category for each column.
    if sample_type == 'fecal':
        michael_pollan = '000007108.1075657'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CATEGORY', 'SEX',
                 'AGE_CATEGORY', 'MP']
    elif sample_type == 'skin':
        michael_pollan = '7113.1075702'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CATEGORY', 'MP']
    elif sample_type == 'oral':
        michael_pollan = '7109.1075688'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CATEGORY', 'MP']
    else:
        raise ValueError('%s is not a supported sample type.'
                         % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary, summarized with the common
    # taxa
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The prefix
    # is removed explicitly: str.strip(' p__') treats its argument as a set
    # of characters and would also eat leading or trailing 'p' and '_'
    # characters that belong to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]

        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the
        # sample, the second column is the average and the last column is
        # Michael Pollan
        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index gets its own name so it does not shadow the sample index.
        for column, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue

            # Gets the sample metadata
            mapping_key = meta_data[cat]

            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()

            # Finds the group the sample belongs to. Only the "value not
            # in list" failure is translated; other errors propagate.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, column] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, '%s%s%s'
                         % (FILEPREFIX, sample_id, FILE_END))

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)