Ejemplo n.º 1
0
def main(tax_table, output_dir, samples_to_analyze=None):
    """Generates a pie chart of the most abundant taxa in each sample

    INPUTS:
        tax_table -- a biom formatted taxonomy table at the desired level of
                    resolution

        output_dir -- the location of the directory where output files should
                    be stored.

        samples_to_analyze -- a list of sample ids to plot. If no value is
                    passed, then all samples in the biom table are analyzed.

    OUTPUTS:
        A pdf of the pie chart is generated for each sample and saved to the
        output directory. Each chart shows at most NUM_SHOW - 1 ranked taxa
        plus a final "Other" wedge holding the remaining fraction. Files
        follow the naming convention piechart_<SAMPLEID>.pdf.

    RAISES:
        ValueError -- when none of the requested samples is present in the
                    taxonomy table.
    """

    # Creates the text around the file name
    FILENAME_BEFORE = 'piechart_'
    FILENAME_AFTER = '.pdf'

    # Handles string cleaning
    RENDER = 'LATEX'
    UNCLASSIFIED = False

    # Sets up the rare threshold handed to sample_rare_unique and the
    # cumulative abundance cutoff handed to calculate_abundance
    RARE_THRESH = 0.0
    SUM_MIN = 1

    # Sets up axis parameters
    AXIS_LENGTH = 7.25
    AXIS_BORDER = 0.01
    AXIS_TITLE = 0
    AXIS_LEGEND = 7
    # Modifies the axis limits
    AX_LIMS = [-1.05, 1.05]
    # Sets up constants for getting the colormap and plotting
    MAP_NAME = 'BrBG'
    NUM_SHOW = 12
    # A float denominator keeps this a true division even where "/" is
    # integer division (Python 2 without __future__.division)
    OTHER_SHADE = 85 / 255.0
    OTHER_COLOR = array([[OTHER_SHADE, OTHER_SHADE, OTHER_SHADE]])
    # Sets up plotting parameters
    FIG_LEGEND = True
    FIG_COLOR_EDGE = False
    FIG_LEG_FRAME = False
    FIG_LEG_OFFSET = [0.95, 0.025, 1.0, 0.95]
    # Sets up the legend font
    LEG_FONT = FontProperties()
    LEG_FONT.set_size(28)
    LEG_FONT.set_family('sans-serif')
    # Sets the general font properties
    use_latex = True
    rc_font_family = 'sans-serif'
    rc_font = ['Helvetica', 'Arial']

    # Sets up the colormap: one color per displayed taxon, then the fixed
    # color for the "Other" wedge
    colormap = translate_colors((NUM_SHOW - 1), MAP_NAME)
    colormap = vstack((colormap, OTHER_COLOR))

    # Sets up plotting constants
    (axis_dims,
     fig_dims) = calculate_dimensions_rectangle(axis_width=AXIS_LENGTH,
                                                axis_height=AXIS_LENGTH,
                                                border=AXIS_BORDER,
                                                title=AXIS_TITLE,
                                                legend=AXIS_LEGEND)

    # Walks over a taxa tree and prioritizes based on taxonomy
    (tree, all_taxa) = build_tree_from_taxontable(tax_table)

    # Sets up samples for which tables are being generated
    if samples_to_analyze is not None:
        samples_to_test = samples_to_analyze
    else:
        samples_to_test = all_taxa.keys()

    # Checks the samples exist. The emptiness test is made on the filtered
    # dictionary: the requested ids are known to be non-empty inside this
    # branch, so testing them again could never raise.
    if samples_to_test:
        samples_to_test = set(samples_to_test)
        all_taxa = {k: v for k, v in all_taxa.items()
                    if k in samples_to_test}
        if not all_taxa:
            raise ValueError("No samples!")

    def filt_fun(v, i, md):
        # Keeps observations with a non-zero total
        return v.sum() > 0

    # Walks over the table. Only the sample id yielded by the generator is
    # used; the table is re-filtered from tax_table below.
    for samp, _filtered, _rare, _unique in sample_rare_unique(
            tree, tax_table, all_taxa, RARE_THRESH):
        filtered_table = tax_table.filterObservations(filt_fun)
        sample_data = filtered_table.sampleData(samp)
        taxa = filtered_table.ObservationIds

        # Calculates abundance and limits to the top n taxa.
        abund_rank = calculate_abundance(sample=sample_data,
                                         taxa=taxa,
                                         sum_min=SUM_MIN)

        abund_rank = abund_rank[:(NUM_SHOW - 1)]

        # Cleans the greengenes strings and adds an "Other" Category for
        # missing taxa
        [sample_tax, sample_freq] = [list(a) for a in zip(*abund_rank)]
        clean_tax = [
            clean_greengenes_string(tax, RENDER, unclassified=UNCLASSIFIED)
            for tax in sample_tax
        ]
        clean_tax.append('Other')
        sample_freq.append(1 - sum(sample_freq))

        # Sets up the sample filename
        filename = pjoin(output_dir,
                         '%s%s%s' % (FILENAME_BEFORE, samp, FILENAME_AFTER))

        # Creates the pie chart
        render_single_pie(data_vec=sample_freq,
                          group_names=clean_tax,
                          axis_dims=axis_dims,
                          fig_dims=fig_dims,
                          file_out=filename,
                          legend=FIG_LEGEND,
                          colors=colormap,
                          show_edge=FIG_COLOR_EDGE,
                          legend_frame=FIG_LEG_FRAME,
                          rc_font=rc_font,
                          legend_offset=FIG_LEG_OFFSET,
                          rc_fam=rc_font_family,
                          legend_font=LEG_FONT,
                          use_latex=use_latex,
                          x_lims=AX_LIMS,
                          y_lims=AX_LIMS)
Ejemplo n.º 2
0
def main(tax_table, output_dir, samples_to_analyze=None):
    """Generates a pie chart of the most abundant taxa in each sample

    INPUTS:
        tax_table -- a biom formatted taxonomy table at the desired level of
                    resolution

        output_dir -- the location of the directory where output files should
                    be stored.

        samples_to_analyze -- a list of sample ids to plot. If no value is
                    passed, then all samples in the biom table are analyzed.

    OUTPUTS:
        One pdf pie chart per sample is saved to the output directory. A
        chart holds at most NUM_SHOW - 1 ranked taxa followed by an "Other"
        wedge for the remaining fraction. Files follow the naming
        convention piechart_<SAMPLEID>.pdf.

    RAISES:
        ValueError -- when none of the requested samples is present in the
                    taxonomy table.
    """

    # Creates the text around the file name
    FILENAME_BEFORE = 'piechart_'
    FILENAME_AFTER = '.pdf'

    # Handles string cleaning
    RENDER = 'LATEX'
    UNCLASSIFIED = False

    # Sets up the rare threshold handed to sample_rare_unique and the
    # cumulative abundance cutoff handed to calculate_abundance
    RARE_THRESH = 0.0
    SUM_MIN = 1

    # Sets up axis parameters
    AXIS_LENGTH = 7.25
    AXIS_BORDER = 0.01
    AXIS_TITLE = 0
    AXIS_LEGEND = 7
    # Modifies the axis limits
    AX_LIMS = [-1.05, 1.05]
    # Sets up constants for getting the colormap and plotting
    MAP_NAME = 'BrBG'
    NUM_SHOW = 12
    # The float denominator guarantees a true division even when "/" means
    # integer division (Python 2 without __future__.division)
    OTHER_COLOR = array([[85/255.0, 85/255.0, 85/255.0]])
    # Sets up plotting parameters
    FIG_LEGEND = True
    FIG_COLOR_EDGE = False
    FIG_LEG_FRAME = False
    FIG_LEG_OFFSET = [0.95, 0.025, 1.0, 0.95]
    # Sets up the legend font
    LEG_FONT = FontProperties()
    LEG_FONT.set_size(28)
    LEG_FONT.set_family('sans-serif')
    # Sets the general font properties
    use_latex = True
    rc_font_family = 'sans-serif'
    rc_font = ['Helvetica', 'Arial']

    # Sets up the colormap: the taxa colors first, the "Other" color last
    colormap = translate_colors((NUM_SHOW-1), MAP_NAME)
    colormap = vstack((colormap, OTHER_COLOR))

    # Sets up plotting constants
    (axis_dims, fig_dims) = calculate_dimensions_rectangle(
        axis_width=AXIS_LENGTH, axis_height=AXIS_LENGTH, border=AXIS_BORDER,
        title=AXIS_TITLE, legend=AXIS_LEGEND)

    # Walks over a taxa tree and prioritizes based on taxonomy
    (tree, all_taxa) = build_tree_from_taxontable(tax_table)

    # Sets up samples for which tables are being generated
    if samples_to_analyze is not None:
        samples_to_test = samples_to_analyze
    else:
        samples_to_test = all_taxa.keys()

    # Checks the samples exist. The requested ids cannot be empty inside
    # this branch, so the failure condition is an empty filtered dictionary.
    if samples_to_test:
        samples_to_test = set(samples_to_test)
        all_taxa = {k: v for k, v in all_taxa.items()
                    if k in samples_to_test}
        if not all_taxa:
            raise ValueError("No samples!")

    def filt_fun(v, i, md):
        # Keeps observations with a non-zero total
        return v.sum() > 0

    # Walks over the table. Only the sample id from the generator is used;
    # the table is filtered again from tax_table for every sample.
    for samp, _filtered, _rare, _unique in sample_rare_unique(tree,
                                                              tax_table,
                                                              all_taxa,
                                                              RARE_THRESH):
        filtered_table = tax_table.filterObservations(filt_fun)
        sample_data = filtered_table.sampleData(samp)
        taxa = filtered_table.ObservationIds

        # Calculates abundance and limits to the top n taxa.
        abund_rank = calculate_abundance(sample=sample_data,
                                         taxa=taxa,
                                         sum_min=SUM_MIN)

        abund_rank = abund_rank[:(NUM_SHOW-1)]

        # Cleans the greengenes strings and adds an "Other" Category for
        # missing taxa
        [sample_tax, sample_freq] = [list(a) for a in zip(*abund_rank)]
        clean_tax = [clean_greengenes_string(tax, RENDER,
                                             unclassified=UNCLASSIFIED)
                     for tax in sample_tax]
        clean_tax.append('Other')
        sample_freq.append(1-sum(sample_freq))

        # Sets up the sample filename
        filename = pjoin(output_dir, '%s%s%s' % (FILENAME_BEFORE, samp,
                                                 FILENAME_AFTER))

        # Creates the pie chart
        render_single_pie(data_vec=sample_freq,
                          group_names=clean_tax,
                          axis_dims=axis_dims,
                          fig_dims=fig_dims,
                          file_out=filename,
                          legend=FIG_LEGEND,
                          colors=colormap,
                          show_edge=FIG_COLOR_EDGE,
                          legend_frame=FIG_LEG_FRAME,
                          rc_font=rc_font,
                          legend_offset=FIG_LEG_OFFSET,
                          rc_fam=rc_font_family,
                          legend_font=LEG_FONT,
                          use_latex=use_latex,
                          x_lims=AX_LIMS,
                          y_lims=AX_LIMS)
    def test_calculate_abundance(self):
        """Verifies the ranked abundance lists at several sum_min cutoffs"""
        # Mismatched sample and taxa lengths must be rejected
        with self.assertRaises(ValueError):
            calculate_abundance(self.sample[0:2], self.taxa)

        # Complete expected ranking. The shorter expectations below are
        # leading slices of this list.
        full_ranking = [
            ['k__Bacteria; p__[Proteobacteria]', 0.7],
            ['k__Bacteria; p__Actinobacteria; '
             'c__Coriobacteriia; o__Coriobacteriales; '
             'f__Coriobacteriaceae; g__', 0.1427],
            ['k__Bacteria; p__Proteobacteria; '
             'c__Gammaproteobacteria; o__Enterobacteriales; '
             'f__Enterbacteriaceae; g__Escherichia', 0.1],
            ['k__Archaea; p__Crenarchaeota; '
             'c__Thaumarchaeota; o__Cenarchaeales; '
             'f__Cenarchaeaceae; g__Nitrosopumilus', 0.03],
            ['k__Bacteria', 0.02],
            ['k__Bacteria; p__Proteobacteria; '
             'c__Gammaproteobacteria', 0.005],
            ['k__Bacteria; p__Proteobacteria; '
             'c__Gammaproteobacteria; o__Enterobacteriales; '
             'f__Enterbacteriaceae', 0.002],
            ['k__Bacteria; p__Proteobacteria; '
             'c__Gammaproteobacteria; o__Enterobacteriales; '
             'f__Enterbacteriaceae; g__Escherichia; s__coli',
             0.0002],
            ['k__Bacteria; p__Proteobacteria; '
             'c__Gammaproteobacteria; o__Enterobacteriales',
             0.0001],
            ['k__Bacteria; p__Actinobacteria; '
             'c__Actinobacteria; o__Actinomycetales; '
             'f__Dietziaceae; g__', 0],
        ]

        # Pairs the extra keyword arguments of each call with the ranking
        # expected back: default cutoff, 0.99 and 1.000, in that order
        cases = [
            ({}, full_ranking[:4]),
            ({'sum_min': 0.99}, full_ranking[:5]),
            ({'sum_min': 1.000}, full_ranking[:]),
        ]

        for extra_kwargs, known_abundance in cases:
            test_abundance = calculate_abundance(sample=self.sample,
                                                 taxa=self.taxa,
                                                 **extra_kwargs)
            self.assertEqual(test_abundance, known_abundance)
Ejemplo n.º 4
0
def main(taxa_table, output_dir, mapping=None, samples_to_analyze=None):
    """Creates LaTeX formatted significant OTU lists

    INPUTS:
        taxa_table -- a taxonomy table. It must provide the `filter`, `data`,
                    `ids` and `index` methods called below (presumably a
                    biom table -- confirm against the caller).

        output_dir -- a directory where the final files should be saved.

        mapping -- a 2D dictionary of mapping data where the sample id is keyed
                    to a dictionary of metadata. The COLLECTION_DATE,
                    COLLECTION_TIME and BODY_HABITAT fields are read.
                    DEFAULT: None

        samples_to_analyze -- a list of samples_to_analyze which should be used
                    to generate data. If None, all the samples will be used.
                    DEFAULT: None

    OUTPUTS:
        Writes `macros.tex` in the output directory. The file holds LaTeX
        macros for the barcode, sample type, sample date and time, a table
        of the most abundant taxa, a table of the most enriched taxa, and a
        sentence listing rare and unique taxa. Unique taxa are tagged with
        the 'COLOR' format in that list.

    RAISES:
        ValueError -- when none of the requested samples is present in the
                    taxonomy table.
    """
    # Sets up the way samples should be converted
    SAMPLE_CONVERTER = {'feces': 'fecal',
                        'oral_cavity': 'oral',
                        'oral cavity': 'oral',
                        'skin': 'skin'}

    # Empty row used to pad the enrichment table to a fixed length
    DUMMY = ['', '', '', '']

    # Sets table constants
    RENDERING = "LATEX"
    RARE_THRESH = 0.1

    SUM_MIN = 1

    FORMAT_SIGNIFIGANCE = ['%1.2f', "%1.2f", "%i", "SKIP"]
    SIGNIFIGANCE_HUNDRED = [True, True, False, False]
    MACRO_CATS_SIGNIFICANCE = ['enrichTaxon', 'enrichSampl', 'enrichPopul',
                               'enrichFold']
    MACRO_FORM_SIGNIFICANCE = [lambda x: clean_greengenes_string(x,
                               render_mode='LATEX'),
                               lambda x: x,
                               lambda x: x,
                               lambda x: x]

    FORMAT_ABUNDANCE = ["%1.1f"]
    ABUNDANCE_HUNDRED = [True]
    MACRO_CATS_ABUNDANCE = ['abundTaxon', 'abundSampl']
    MACRO_FORM_ABUNDANCE = [lambda x: clean_greengenes_string(x,
                            render_mode='LATEX'), lambda x: x]

    DATE_FIELD = 'COLLECTION_DATE'
    DATE_FORMAT_SHORT = '%m/%d/%y'
    DATE_FORMAT_LONG = '%m/%d/%Y'

    UNKNOWNS = set(['None', 'NONE', 'none', 'NA', 'na', 'UNKNOWN', 'unknown'])
    DATE_OUT = '%B %d, %Y'
    TIME_FIELD = 'COLLECTION_TIME'

    # Maximum number of taxa listed in each table and in the rare list
    NUM_TAXA_SHOW = 5

    # Builds the taxonomy tree for the table and identifies the
    # rare/unique taxa in each sample
    tree, all_taxa = build_tree_from_taxontable(taxa_table)

    # Sets up samples for which tables are being generated
    if samples_to_analyze is not None:
        samples_to_test = samples_to_analyze
    else:
        samples_to_test = all_taxa.keys()

    # Restricts the taxa to the requested samples. The requested ids cannot
    # be empty inside this branch, so the failure condition is an empty
    # filtered dictionary.
    if samples_to_test:
        samples_to_test = set(samples_to_test)
        all_taxa = {k: v for k, v in all_taxa.items()
                    if k in samples_to_test}
        if not all_taxa:
            raise ValueError("No samples!")

    # NOTE(review): the output path does not depend on the sample, so when
    # several samples are processed each one overwrites the previous file.
    # Confirm this is intended.
    file_name = pjoin(output_dir, 'macros.tex')

    def filt_fun(v, i, md):
        # Keeps observations with a non-zero total
        return v.sum() > 0

    # Generates lists and tables for each sample
    for samp, filtered_table, rare, unique in sample_rare_unique(tree,
                                                                 taxa_table,
                                                                 all_taxa,
                                                                 RARE_THRESH):
        filtered_table = filtered_table.filter(filt_fun, axis='observation',
                                               inplace=False)
        abund_table = taxa_table.filter(filt_fun, axis='observation',
                                        inplace=False)

        # Gets sample information for the whole table
        abund_sample = abund_table.data(samp)
        abund_taxa = abund_table.ids(axis='observation')

        # Gets sample information for other filtered samples
        filt_taxa = filtered_table.ids(axis='observation')
        population = array([filtered_table.data(i, axis='observation') for i in
                            filtered_table.ids(axis='observation')])

        sample_position = filtered_table.index(samp, axis='sample')
        filt_sample = filtered_table.data(samp)

        # Drops this sample's column so the population holds only the others
        population = delete(population, sample_position, 1)

        # Converts the lists into greengenes strings for later processing
        greengenes_rare = [';'.join(taxon) for taxon in rare]
        greengenes_unique = [';'.join(taxon) for taxon in unique]

        # Formats the rare and unique lists: unique taxa come first
        rare_combined = greengenes_unique + greengenes_rare
        rare_format = (['COLOR'] * len(greengenes_unique) +
                       ['REG'] * len(greengenes_rare))

        number_rare_tax = len(rare_combined)
        num_rare = len(rare)
        num_unique = len(unique)

        rare_formatted = \
            convert_taxa_to_list(rare_combined[0:NUM_TAXA_SHOW],
                                 tax_format=rare_format,
                                 render_mode=RENDERING,
                                 comma=True)

        if num_unique > 0:
            unique_string = ' and \\textcolor{red}{%i unique}' % num_unique
        else:
            unique_string = ''

        if number_rare_tax == 0:
            rare_formatted = "There were no rare or unique taxa found in "\
                "your sample."

        elif number_rare_tax <= NUM_TAXA_SHOW:
            rare_formatted = 'Your sample contained the following rare%s '\
                'taxa: %s.' % (unique_string, rare_formatted)

        else:
            rare_formatted = 'Your sample contained %i rare%s taxa, '\
                'including the following: %s.' \
                % (num_rare, unique_string,
                   rare_formatted)

        # Calculates abundance rank
        abundance = calculate_abundance(abund_sample, abund_taxa,
                                        sum_min=SUM_MIN)

        # Generates formatted abundance table
        formatted_abundance = convert_taxa(abundance[0:NUM_TAXA_SHOW],
                                           formatting_keys=FORMAT_ABUNDANCE,
                                           hundredx=ABUNDANCE_HUNDRED)

        abundance_formatted = \
            build_latex_macro(formatted_abundance,
                              categories=MACRO_CATS_ABUNDANCE,
                              format=MACRO_FORM_ABUNDANCE)

        (high, low) = calculate_tax_rank_1(sample=filt_sample,
                                           population=population,
                                           taxa=filt_taxa,
                                           critical_value=0.05)

        if len(high) == 0:
            formatted_high = [['', '', '', '']]*NUM_TAXA_SHOW
        else:
            formatted_high = convert_taxa(high[0:NUM_TAXA_SHOW],
                                          formatting_keys=FORMAT_SIGNIFIGANCE,
                                          hundredx=SIGNIFIGANCE_HUNDRED)
            # Pads with empty rows up to the fixed table length
            num_missing = NUM_TAXA_SHOW - len(high)
            if num_missing > 0:
                formatted_high.extend([DUMMY] * num_missing)

        high_formatted = build_latex_macro(formatted_high,
                                           categories=MACRO_CATS_SIGNIFICANCE,
                                           format=MACRO_FORM_SIGNIFICANCE)

        # Handles date parsing: tries the two digit year first and falls
        # back to the four digit year
        if mapping is not None and mapping[samp][DATE_FIELD] not in UNKNOWNS:
            try:
                sample_date = format_date(mapping[samp],
                                          date_field=DATE_FIELD,
                                          d_form_in=DATE_FORMAT_SHORT,
                                          format_out=DATE_OUT)
            except Exception:
                # format_date is defined elsewhere; the exact exception it
                # raises on a format mismatch is not visible here
                sample_date = format_date(mapping[samp],
                                          date_field=DATE_FIELD,
                                          d_form_in=DATE_FORMAT_LONG,
                                          format_out=DATE_OUT)
        else:
            sample_date = 'unknown'

        # Removes the leading zero of a zero padded day ("May 05, 2014").
        # Dates without a padded day are left untouched.
        if ',' in sample_date:
            zero_pos = sample_date.index(',') - 2
            if sample_date[zero_pos] == '0':
                sample_date = ''.join([sample_date[:zero_pos],
                                       sample_date[zero_pos+1:]])

        # Handles time parsing
        if mapping is not None and mapping[samp][TIME_FIELD] not in UNKNOWNS:
            sample_time = mapping[samp][TIME_FIELD].lower()
        else:
            sample_time = 'unknown'

        # Handles sample type parsing. The body habitat is read as
        # "<prefix>:<habitat>" and the part after the first colon is used.
        if mapping is not None:
            sample_type_prelim = mapping[samp]['BODY_HABITAT'].split(':')[1]
            if sample_type_prelim in SAMPLE_CONVERTER:
                sample_type = SAMPLE_CONVERTER[sample_type_prelim]
            elif sample_type_prelim in UNKNOWNS:
                sample_type = 'unknown'
            else:
                sample_type = sample_type_prelim.lower()
        else:
            sample_type = 'unknown'

        # Saves the file
        with open(file_name, 'w') as file_for_editing:
            file_for_editing.write('%% Barcode\n\\def\\barcode{%s}\n\n'
                                   % samp.split('.')[0])
            file_for_editing.write('%% Sample Type\n\\def\\sampletype{%s}\n\n'
                                   % sample_type)
            file_for_editing.write('%% Sample Date\n\\def\\sampledate{%s}\n'
                                   '\\def\\sampletime{%s}\n\n\n'
                                   % (sample_date, sample_time))
            file_for_editing.write('%% Abundance Table\n%s\n\n\n'
                                   % abundance_formatted)
            file_for_editing.write('%% Enrichment Table\n%s\n\n\n'
                                   % high_formatted)
            file_for_editing.write('%% Rare List\n\\def\\rareList{%s}\n'
                                   % rare_formatted)
def main(taxa_table, output_dir, samples_to_analyze=None):
    """Creates LaTeX formatted significant OTU lists

    INPUTS:
        taxa_table -- a taxonomy table. It is handed to
                    build_tree_from_taxontable and sample_rare_unique
                    (presumably a biom table -- confirm against the caller).

        output_dir -- a directory where the final files should be saved.

        samples_to_analyze -- a list of sample ids which should be used to
                    generate data. If this is left empty, all the samples in
                    the table will be used.

    OUTPUTS:
        Writes one file per sample, named macros_<SAMPLEID>.tex, in the
        output directory. Each file holds LaTeX macros for a table of the
        most abundant taxa, a table of the most enriched taxa, and a
        sentence listing rare and unique taxa. Unique taxa are tagged with
        the 'COLOR' format in that list. The sample id and its count of
        non-zero observations are also printed to standard output.

    RAISES:
        ValueError -- when none of the requested samples is present in the
                    taxonomy table.
    """

    # Sets table constants
    RARE_THRESHHOLD = 0.1
    RENDERING = "LATEX"
    FORMAT_SIGNIFIGANCE = ['%1.1f', "%1.2f", "%i", "SKIP"]
    SIGNIFIGANCE_HUNDRED = [True, True, False, False]
    # Empty row used to pad the enrichment table to a fixed length
    DUMMY = ['', '', '', '']
    FORMAT_ABUNDANCE = ["%1.1f"]
    ABUNDANCE_HUNDRED = [True]
    # NOTE(review): 'enrichFoldd' looks like a typo for 'enrichFold'. It is
    # left unchanged because the LaTeX template consuming it is not visible
    # here.
    MACRO_CATS_SIGNIFICANCE = ['enrichTaxon', 'enrichSampl', 'enrichPopul',
                               'enrichFoldd']
    MACRO_CATS_ABUNDANCE = ['abundTaxon', 'abundSampl']

    FILE_PRECURSER = 'macros_'
    FILE_EXTENSION = '.tex'

    # Maximum number of taxa listed in each table and in the rare list
    NUMBER_OF_TAXA_SHOWN = 5

    # Builds the taxonomy tree for the table and identifies the
    # rare/unique taxa in each sample
    tree, all_taxa = build_tree_from_taxontable(taxa_table)

    # Sets up samples for which tables are being generated
    if samples_to_analyze is not None:
        samples_to_test = samples_to_analyze
    else:
        samples_to_test = all_taxa.keys()

    # Restricts the taxa to the requested samples. The requested ids cannot
    # be empty inside this branch, so the failure condition is an empty
    # filtered dictionary.
    if samples_to_test:
        samples_to_test = set(samples_to_test)
        all_taxa = {k: v for k, v in all_taxa.items()
                    if k in samples_to_test}
        if not all_taxa:
            raise ValueError("No samples!")

    def filt_fun(v, i, md):
        # Keeps observations with a non-zero total
        return v.sum() > 0

    # Generates lists and tables for each sample
    for samp, filtered_table, rare, unique in sample_rare_unique(
            tree, taxa_table, all_taxa, RARE_THRESHHOLD):
        filtered_table = filtered_table.filterObservations(filt_fun)

        # Gets sample information for other samples
        taxa = filtered_table.ObservationIds
        population = array([filtered_table.observationData(i) for i in
                            filtered_table.ObservationIds])

        sample_position = filtered_table.getSampleIndex(samp)
        sample = filtered_table.sampleData(samp)
        # Single-argument form prints the same text under Python 2 and 3
        print('%s %s' % (samp, sum(sample > 0)))

        # Drops this sample's column so the population holds only the others
        population = delete(population, sample_position, 1)

        # Converts the lists into greengenes strings for later processing
        greengenes_rare = ['; '.join(taxon) for taxon in rare]
        greengenes_unique = ['; '.join(taxon) for taxon in unique]

        # Formats the rare and unique lists: unique taxa come first
        rare_combined = greengenes_unique + greengenes_rare
        rare_format = (['COLOR'] * len(greengenes_unique) +
                       ['REG'] * len(greengenes_rare))

        number_rare_tax = len(rare_combined)
        num_rare = len(rare)
        num_unique = len(unique)

        rare_formatted = convert_taxa_to_list(
            rare_combined[0:NUMBER_OF_TAXA_SHOWN],
            tax_format=rare_format,
            render_mode=RENDERING,
            comma=True)

        # unique_string carries its own leading " and ", so the templates
        # below must not add a conjunction or a space before it
        if num_unique > 0:
            unique_string = ' and \\textcolor{red}{%i unique}' % num_unique
        else:
            unique_string = ''

        if number_rare_tax == 0:
            rare_formatted = "There were no rare or unique taxa found in "\
                "your sample."

        elif number_rare_tax <= NUMBER_OF_TAXA_SHOWN:
            rare_formatted = 'Your sample contained the following rare%s '\
                'taxa: %s.' % (unique_string, rare_formatted)

        else:
            rare_formatted = 'Your sample contained %i rare%s taxa, '\
                'including the following: %s.' \
                % (num_rare, unique_string,
                   rare_formatted)

        # Calculates abundance rank
        abundance = calculate_abundance(sample, taxa,
                                        abundance_threshhold=1)

        # Generates formatted abundance table
        formatted_abundance = convert_taxa(abundance[0:NUMBER_OF_TAXA_SHOWN],
                                           formatting_keys=FORMAT_ABUNDANCE,
                                           hundredx=ABUNDANCE_HUNDRED)

        abundance_formatted = generate_latex_macro(
            formatted_abundance, categories=MACRO_CATS_ABUNDANCE)

        (high, low) = calculate_tax_rank_1(sample=sample,
                                           population=population,
                                           taxa=taxa,
                                           critical_value=0.05)

        # Formats the known high taxa
        formatted_high = convert_taxa(high[0:NUMBER_OF_TAXA_SHOWN],
                                      formatting_keys=FORMAT_SIGNIFIGANCE,
                                      hundredx=SIGNIFIGANCE_HUNDRED)

        # Pads with empty rows up to the fixed table length
        num_missing = NUMBER_OF_TAXA_SHOWN - len(high)
        if num_missing > 0:
            formatted_high.extend([DUMMY] * num_missing)

        high_formatted = generate_latex_macro(
            formatted_high, categories=MACRO_CATS_SIGNIFICANCE)

        file_name = pjoin(output_dir, '%s%s%s' % (FILE_PRECURSER, samp,
                                                  FILE_EXTENSION))

        # Saves the file
        with open(file_name, 'w') as file_for_editing:
            file_for_editing.write('%% Abundance Table\n%s\n\n\n'
                                   % abundance_formatted)
            file_for_editing.write('%% Enrichment Table\n%s\n\n\n'
                                   % high_formatted)
            file_for_editing.write('%% Rare List\n\\def\\rareList{%s}\n'
                                   % rare_formatted)
Ejemplo n.º 6
0
    def test_calculate_abundance(self):
        """Compare calculate_abundance output with hand-built tables.

        A two-element slice of the sample paired with the full taxa list
        must raise ValueError. The default call and the calls with
        ``sum_min`` set to 0.99 and 1.000 are each checked against a known
        list of [taxon, abundance] pairs.
        """
        # Mismatched sample and taxa inputs must be rejected
        with self.assertRaises(ValueError):
            calculate_abundance(self.sample[0:2], self.taxa)

        # Full expected table. The two shorter expected tables are its
        # first four and first five rows respectively.
        ranked = [
            ['k__Bacteria; p__[Proteobacteria]', 0.7],
            ['k__Bacteria; p__Actinobacteria; c__Coriobacteriia; '
             'o__Coriobacteriales; f__Coriobacteriaceae; g__', 0.1427],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae; g__Escherichia',
             0.1],
            ['k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; '
             'o__Cenarchaeales; f__Cenarchaeaceae; g__Nitrosopumilus',
             0.03],
            ['k__Bacteria', 0.02],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria',
             0.005],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae', 0.002],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae; '
             'g__Escherichia; s__coli', 0.0002],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales', 0.0001],
            ['k__Bacteria; p__Actinobacteria; c__Actinobacteria; '
             'o__Actinomycetales; f__Dietziaceae; g__', 0],
        ]
        expected_default = ranked[:4]
        expected_99 = ranked[:5]
        expected_everything = ranked[:]

        # Default call, no sum_min supplied
        observed = calculate_abundance(sample=self.sample, taxa=self.taxa)
        self.assertEqual(observed, expected_default)

        # sum_min of 0.99
        observed = calculate_abundance(sample=self.sample,
                                       taxa=self.taxa,
                                       sum_min=0.99)
        self.assertEqual(observed, expected_99)

        # sum_min of 1.000
        observed = calculate_abundance(sample=self.sample,
                                       taxa=self.taxa,
                                       sum_min=1.000)
        self.assertEqual(observed, expected_everything)
    def test_calculate_abundance(self):
        """Check calculate_abundance at the default and two thresholds.

        The default call and the calls with ``abundance_threshhold`` set
        to 0.99 and 1.000 are each compared with a known list of
        [taxon, abundance] pairs.
        """
        # Rows expected from the default call
        expected_default = [
            ['k__Bacteria; p__[Proteobacteria]', 0.7],
            ['k__Bacteria; p__Actinobacteria; c__Coriobacteriia; '
             'o__Coriobacteriales; f__Coriobacteriaceae; g__', 0.1427],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae; g__Escherichia',
             0.1],
            ['k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; '
             'o__Cenarchaeales; f__Cenarchaeaceae; g__Nitrosopumilus',
             0.03],
        ]

        # The 0.99 table is the default table plus one further row
        expected_99 = expected_default + [['k__Bacteria', 0.02]]

        # The 1.000 table extends the 0.99 table with five more rows
        expected_all = expected_99 + [
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria',
             0.005],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae', 0.002],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales; f__Enterbacteriaceae; '
             'g__Escherichia; s__coli', 0.0002],
            ['k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
             'o__Enterobacteriales', 0.0001],
            ['k__Bacteria; p__Actinobacteria; c__Actinobacteria; '
             'o__Actinomycetales; f__Dietziaceae; g__', 0],
        ]

        # Extra keyword arguments paired with the table each call must
        # return; the calls and assertions run in this order.
        cases = (
            ({}, expected_default),
            ({'abundance_threshhold': 0.99}, expected_99),
            ({'abundance_threshhold': 1.000}, expected_all),
        )
        for overrides, expected in cases:
            observed = calculate_abundance(sample=self.sample,
                                           taxa=self.taxa,
                                           **overrides)
            self.assertEqual(observed, expected)