def test_map_to_2D_dict(self):
        """map_to_2D_dict keys every mapping line by its sample id"""
        # Column names, in the order used by the header line of the text
        columns = ('#SampleID', 'BIRTH_YEAR', 'DEATH_YEAR', 'SEX',
                   'PROFESSION', 'HOME_STATE')

        # One tuple per data line of the mapping text
        records = (
            ('00010', '1954', '2006', 'male', 'Mechanic', 'Kansas'),
            ('00100', '1954', '1983', 'female', 'Hunter', 'Kansas'),
            ('00200', 'NA', '2009', 'female', 'Nurse', 'Minnesota'),
            ('00111', '1979', '2007', 'male', 'Hunter', 'Impala'),
            ('00112', '1983', '2006', 'male', 'Hunter', 'Impala'),
            ('00211', '1990', '2009', 'male', 'Student', 'Minnesota'),
        )

        # Expected result: sample id -> {column name: value}, with every
        # value kept as a string
        expected = {}
        for record in records:
            expected[record[0]] = dict(zip(columns, record))

        # In-memory stand-in for an opened mapping file
        mapping_file = StringIO(
            '#SampleID\tBIRTH_YEAR\tDEATH_YEAR\tSEX\tPROFESSION\tHOME_STATE\n'
            '00010\t1954\t2006\tmale\tMechanic\tKansas\n'
            '00100\t1954\t1983\tfemale\tHunter\tKansas\n'
            '00200\tNA\t2009\tfemale\tNurse\tMinnesota\n'
            '00111\t1979\t2007\tmale\tHunter\tImpala\n'
            '00112\t1983\t2006\tmale\tHunter\tImpala\n'
            '00211\t1990\t2009\tmale\tStudent\tMinnesota\n')

        # The parsed mapping must match the expected dictionary exactly
        observed = map_to_2D_dict(mapping_file)
        self.assertEqual(observed, expected)
Example #2
0
    else:
        # Final branch of a check that begins above this excerpt: the input
        # is opened with biom_open and parsed into the taxonomy table.
        with biom_open(args.input) as fp:
            tax_table = parse_biom_table(fp)

    # An output directory is mandatory; it is created when it does not exist
    # yet.
    if not args.output:
        parser.error('An output directory must be supplied.')
    elif not exists(args.output):
        mkdir(args.output)

    output_dir = args.output

    # The mapping file is optional. When a path is given it must point at an
    # existing file, which is parsed with map_to_2D_dict; otherwise `mapping`
    # keeps whatever falsy value args.mapping holds.
    # NOTE(review): the handle returned by open() is never closed, and the
    # 'U' mode was removed from open() in Python 3.11 -- confirm the target
    # interpreter before porting.
    if args.mapping and not isfile(args.mapping):
        parser.error('The supplied mapping file does not exist in the path.')
    elif args.mapping:
        mapping = map_to_2D_dict(open(args.mapping, 'U'))
    else:
        mapping = args.mapping

    # Splits the comma separated sample ids into a list; None is passed on
    # when no sample ids were supplied.
    if args.samples:
        samples_to_analyze = []
        for sample in args.samples.split(','):
            samples_to_analyze.append(sample)
    else:
        samples_to_analyze = None

    # NOTE(review): confirm that these keyword names match the signature of
    # the `main` this script is meant to call.
    main(taxa_table=tax_table,
         output_dir=output_dir,
         mapping=mapping,
         samples_to_analyze=samples_to_analyze)
def main(otu_table,
         mapping_data,
         cat_tables,
         output_dir,
         sample_type='fecal',
         samples_to_plot=None,
         legend=False,
         xaxis=True,
         debug=False):
    """Creates a stacked bar plot comparing one sample to reference groups

    INPUTS:
        otu_table -- the OTU table; passed to summarize_common_categories as
                    its biom_table.

        mapping_data -- the mapping data, in the form map_to_2D_dict accepts.
                    The parsed result is indexed by sample id and then by
                    metadata category.

        cat_tables -- the per-category tables; passed to parse_category_files
                    as raw_tables. The parsed result must hold a 'Summary'
                    table and a 'Groups' entry (supporting .tolist()) for
                    every metadata category plotted for the sample type.

        output_dir -- the directory the figure is written to. It is not
                    created by this function.

        sample_type -- 'fecal', 'skin' or 'oral'. Selects the metadata
                    categories, the bar labels and the Michael Pollan sample
                    id used for comparison.

        samples_to_plot -- a list holding the sample id to plot. If no value
                    is passed, every sample in the OTU table is selected,
                    which is only accepted when that is a single sample.

        legend -- accepted for interface compatibility; not used. The chart
                    is always rendered without a legend.

        xaxis -- accepted for interface compatibility; not used. The chart
                    is always rendered without an x axis.

        debug -- when true, the third column of the taxonomy summary stands
                    in for Michael Pollan's sample instead of looking the
                    sample id up.

    OUTPUTS:
        A stacked taxonomy bar chart for the selected sample is rendered to
        the file figure4.pdf in the output directory.

    RAISES:
        ValueError -- if sample_type is not supported, if more than one
                    sample is selected, or if the sample's metadata value
                    for a category is not among that category's groups.
    """

    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098], [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804], [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961], [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471], [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])

    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05], [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]

    # Entries of `order` whose columns are filled in directly rather than
    # from a metadata category
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. `cat_list` holds the bar labels and
    # `order` the matching column sources, position by position.
    if sample_type == 'fecal':
        michael_pollan = '10317.000007108'
        cat_list = [
            'You', 'Average', 'Similar Diet', ' Similar BMI', 'Same Gender',
            'Similar Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'DIET_TYPE', 'BMI_CAT', 'SEX', 'AGE_CAT', 'MP'
        ]

    elif sample_type == 'skin':
        michael_pollan = '10317.000007113'
        cat_list = [
            'You', 'Average', 'Similar Cosmetic Use', 'Same Dominant Hand',
            'Same Gender', 'Same Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'COSMETICS_FREQUENCY', 'DOMINANT_HAND', 'SEX',
            'AGE_CAT', 'MP'
        ]

    elif sample_type == 'oral':
        michael_pollan = '10317.000007109'
        cat_list = [
            'You', 'Average', 'Similar Diet', 'Flossing Frequency',
            'Same Gender', 'Same Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY', 'SEX',
            'AGE_CAT', 'MP'
        ]

    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary, summarized with the common taxa
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The prefix is
    # removed explicitly: str.strip treats its argument as a set of
    # characters, so strip(' p__') would also eat any leading or trailing
    # 'p' or '_' that belongs to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):].strip()
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    if len(sample_ids) > 1:
        # TODO: make the rest of the code reflect this...
        raise ValueError("SCRIPT NO LONGER SUPPORTS MULTIPLE SAMPLES")

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]
        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the sample,
        # the second column is the average and the last column is Michael
        # Pollan
        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index has its own name so the sample index above is not clobbered.
        for col, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue
            # Gets the sample metadata
            mapping_key = meta_data[cat]
            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()
            # Finds the group the sample belongs to; list.index signals a
            # missing value with ValueError, which is the only error
            # translated here.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.' %
                                 (mapping_key, cat))
            tax_array[:, col] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, 'figure4.pdf')

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)
def main(otu_table, mapping_data, categories, output_dir,
         samples_to_plot=None, legend=False, xaxis=True):
    """Creates stacked bar plots for an otu table

    INPUTS:
        otu_table -- the OTU table; passed to summarize_human_taxa.

        mapping_data -- the mapping data, in the form map_to_2D_dict accepts.
                    The parsed result is indexed by sample id and then by
                    metadata category.

        categories -- a dictionary keying a mapping category to the
                    corresponding groups and taxonomy summary. NOTE: the
                    value passed in is currently replaced by the result of
                    load_category_files (see the review note in the body).

        output_dir -- the location of the directory where output files are
                    saved. It is not created by this function.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the OTU table are plotted.

        legend -- accepted for interface compatibility; not used.

        xaxis -- accepted for interface compatibility; not used.

    OUTPUTS:
        A pdf of stacked taxonomy will be generated for each sample and saved
        in the output directory. These will follow the file name format
        Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if a sample's metadata value for a category is not
                    among that category's groups.
    """
    # Sets constants
    LEVEL = 2
    FILEPREFIX = 'Figure_4_'
    MICHAEL_POLLAN = '000007108.1075657'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'
    # The first two columns hold the sample and the table average, so the
    # metadata categories start at column 2.
    FIRST_CATEGORY_COL = 2

    # Loads the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    (common_taxa, whole_sample_ids, whole_summary) = \
        summarize_human_taxa(otu_table, LEVEL)

    # Converts final taxa to a clean list of phylum names. The prefix is
    # removed explicitly: str.strip treats its argument as a set of
    # characters, so strip(' p__') would also eat any leading or trailing
    # 'p' or '_' that belongs to the name itself.
    common_phyla = []
    for taxon in common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):].strip()
        common_phyla.append(phylum.strip('[').strip(']'))
    common_taxa = common_phyla

    # Checks that the correct sample ids are plotted. An identity test is
    # used because `== None` is evaluated element-wise on array-like input.
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    mp_sample_pos = whole_sample_ids.index(MICHAEL_POLLAN)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Loads the category dictionary
    # NOTE(review): this discards the `categories` argument, and
    # `category_fp` is not defined in this function, so it must exist at
    # module level for the call to succeed -- confirm which source of
    # category data is intended.
    categories = load_category_files(category_fp, LEVEL)

    # The table average is the same for every sample, so it is computed once
    table_average = mean(whole_summary, 1)

    # Generates a figure for each sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        # Preallocates a numpy array for the plotting data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))
        meta_data = map_dict[sample_id]

        # The first column is the sample and the second the table average
        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average

        # Fills one column per metadata category, in iteration order
        for col, cat in enumerate(categories, FIRST_CATEGORY_COL):
            # Pulls metadata for the sample and category
            mapping_key = meta_data[cat]
            # Pulls taxonomic summary and group descriptions for the category
            tax_summary = categories[cat]['Taxa Summary']
            group_descriptions = categories[cat]['Groups']
            # Finds the group the sample belongs to; list.index signals a
            # missing value with ValueError, which is the only error
            # translated here.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, col] = tax_summary[:, mapping_col]

        # The last column is always Michael Pollan's sample
        tax_array[:, -1] = mp_sample_taxa

        # Plots the data
        filename = pjoin(output_dir, '%s%s.pdf' % (FILEPREFIX, sample_id))
        plot_american_gut(tax_array, filename)
Example #5
0
    def test_map_to_2D_dict(self):
        """A six-sample mapping file parses into a dictionary of row dicts"""
        def row(sample_id, birth, death, sex, profession, state):
            # Builds the expected inner dictionary for one mapping line
            return {'#SampleID': sample_id, 'BIRTH_YEAR': birth,
                    'DEATH_YEAR': death, 'SEX': sex,
                    'PROFESSION': profession, 'HOME_STATE': state}

        # Expected rows, one per data line of the mapping text below
        rows = [row('00010', '1954', '2006', 'male', 'Mechanic', 'Kansas'),
                row('00100', '1954', '1983', 'female', 'Hunter', 'Kansas'),
                row('00200', 'NA', '2009', 'female', 'Nurse', 'Minnesota'),
                row('00111', '1979', '2007', 'male', 'Hunter', 'Impala'),
                row('00112', '1983', '2006', 'male', 'Hunter', 'Impala'),
                row('00211', '1990', '2009', 'male', 'Student', 'Minnesota')]

        # The outer dictionary is keyed by the sample id of each row
        expected = dict((entry['#SampleID'], entry) for entry in rows)

        # In-memory stand-in for an opened mapping file
        mapping_file = StringIO(
            '#SampleID\tBIRTH_YEAR\tDEATH_YEAR\tSEX\tPROFESSION\tHOME_STATE\n'
            '00010\t1954\t2006\tmale\tMechanic\tKansas\n'
            '00100\t1954\t1983\tfemale\tHunter\tKansas\n'
            '00200\tNA\t2009\tfemale\tNurse\tMinnesota\n'
            '00111\t1979\t2007\tmale\tHunter\tImpala\n'
            '00112\t1983\t2006\tmale\tHunter\tImpala\n'
            '00211\t1990\t2009\tmale\tStudent\tMinnesota\n')

        # The parsed mapping must match the expected dictionary exactly
        self.assertEqual(map_to_2D_dict(mapping_file), expected)
    # Continues a check on args.input that begins above this excerpt: a path
    # that is not an existing file is rejected, otherwise the file is opened
    # with biom_open and parsed into the taxonomy table.
    elif not isfile(args.input):
        parser.error("The supplied taxonomy file does not exist in the path.")
    else:
        with biom_open(args.input) as fp:
            tax_table = parse_biom_table(fp)

    # An output directory is mandatory; it is created when it does not exist
    # yet.
    if not args.output:
        parser.error("An output directory must be supplied.")
    elif not exists(args.output):
        mkdir(args.output)

    output_dir = args.output

    # The mapping file is optional. When a path is given it must point at an
    # existing file, which is parsed with map_to_2D_dict; otherwise `mapping`
    # keeps whatever falsy value args.mapping holds.
    # NOTE(review): the handle returned by open() is never closed, and the
    # "U" mode was removed from open() in Python 3.11 -- confirm the target
    # interpreter before porting.
    if args.mapping and not isfile(args.mapping):
        parser.error("The supplied mapping file does not exist in the path.")
    elif args.mapping:
        mapping = map_to_2D_dict(open(args.mapping, "U"))
    else:
        mapping = args.mapping

    # Splits the comma separated sample ids into a list; None is passed on
    # when no sample ids were supplied.
    if args.samples:
        samples_to_analyze = []
        for sample in args.samples.split(","):
            samples_to_analyze.append(sample)
    else:
        samples_to_analyze = None

    # NOTE(review): confirm that these keyword names match the signature of
    # the `main` this script is meant to call.
    main(taxa_table=tax_table, output_dir=output_dir, mapping=mapping, samples_to_analyze=samples_to_analyze)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates stacked bar plots comparing samples to reference groups

    INPUTS:
        otu_table -- the OTU table; passed to summarize_common_categories as
                    its biom_table.

        mapping_data -- the mapping data, in the form map_to_2D_dict accepts.
                    The parsed result is indexed by sample id and then by
                    metadata category.

        cat_tables -- the per-category tables; passed to parse_category_files
                    as raw_tables. The parsed result must hold a 'Summary'
                    table and a 'Groups' entry (supporting .tolist()) for
                    every metadata category plotted for the sample type.

        output_dir -- the location of the directory where output files are
                    saved. It is not created by this function.

        sample_type -- 'fecal', 'skin' or 'oral'. Selects the metadata
                    categories, the bar labels and the Michael Pollan sample
                    id used for comparison.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the OTU table are plotted.

        legend -- accepted for interface compatibility; not used. The charts
                    are always rendered without a legend.

        xaxis -- accepted for interface compatibility; not used. The charts
                    are always rendered without an x axis.

        debug -- when true, the third column of the taxonomy summary stands
                    in for Michael Pollan's sample instead of looking the
                    sample id up.

    OUTPUTS:
        A pdf of stacked taxonomy will be generated for each sample and saved
        in the output directory. These will follow the file name format
        Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if sample_type is not supported, or if a sample's
                    metadata value for a category is not among that
                    category's groups.
    """

    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    PHYLUM_PREFIX = 'p__'

    # Sets up file name constants
    FILEPREFIX = 'Figure_4_'
    FILE_END = '.pdf'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])

    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]

    # Entries of `order` whose columns are filled in directly rather than
    # from a metadata category
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. `cat_list` holds the bar labels and
    # `order` the matching column sources, position by position.
    if sample_type == 'fecal':
        michael_pollan = '000007108.1075657'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CATEGORY', 'SEX',
                 'AGE_CATEGORY', 'MP']

    elif sample_type == 'skin':
        michael_pollan = '7113.1075702'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CATEGORY', 'MP']

    elif sample_type == 'oral':
        michael_pollan = '7109.1075688'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CATEGORY', 'MP']

    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary, summarized with the common taxa
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The prefix is
    # removed explicitly: str.strip treats its argument as a set of
    # characters, so strip(' p__') would also eat any leading or trailing
    # 'p' or '_' that belongs to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip()
        if phylum.startswith(PHYLUM_PREFIX):
            phylum = phylum[len(PHYLUM_PREFIX):].strip()
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]
        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the sample,
        # the second column is the average and the last column is Michael
        # Pollan
        tax_array[:, 0] = whole_summary[:, idx]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index has its own name so the sample index above is not clobbered.
        for col, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue
            # Gets the sample metadata
            mapping_key = meta_data[cat]
            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()
            # Finds the group the sample belongs to; list.index signals a
            # missing value with ValueError, which is the only error
            # translated here.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, col] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, '%s%s%s'
                         % (FILEPREFIX, sample_id, FILE_END))

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)