def alpha_diversity_by_sample_type(adiv_fs, mapping_f,
                                   mapping_category='Sample_Type'):
    """Group alpha diversity values by a mapping category and box-plot them.

    Arguments:
        adiv_fs - iterable of alpha diversity files (iterables of lines).
            Each file is read as tab-separated lines; blank lines are
            ignored, the first remaining line is skipped as a header, and
            every other line must hold exactly two fields: a sample ID and
            an alpha diversity value
        mapping_f - metadata mapping file, parsed with
            parse_mapping_file_to_dict
        mapping_category - mapping column whose value is used to group the
            samples

    Returns a tuple (plotting_data, plot_fig). plotting_data is a sorted
    list of (median, 'category (n=count)', values) tuples, one per category
    value; plot_fig is the figure returned by generate_box_plots, with the
    boxes in the same order as plotting_data.

    Raises KeyError if a sample ID found in an alpha diversity file has no
    entry in the mapping data.
    """
    mapping_dict, _ = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        rows = [line.strip().split('\t') for line in adiv_f if line.strip()]

        # rows[0] is the header line.
        for samp_id, adiv in rows[1:]:
            sample_type = sample_type_map[samp_id]
            # TODO do we need to normalize this? how?
            sample_type_to_adiv[sample_type].append(float(adiv))

    # Leading with the median makes the plain tuple sort order the boxes by
    # increasing median.
    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v) for k, v in
                     sample_type_to_adiv.items()]
    plotting_data.sort()

    plot_fig = generate_box_plots(
        [dist[2] for dist in plotting_data],
        x_tick_labels=[dist[1] for dist in plotting_data],
        x_label=mapping_category,
        y_label='Alpha Diversity',
        title='Alpha Diversity by %s' % mapping_category)
    # Lay out the figure we return, rather than whichever figure pyplot
    # currently considers active.
    plot_fig.tight_layout()
    return plotting_data, plot_fig
Ejemplo n.º 2
0
    def test_generate_box_plots_box_colors(self):
        """Box colors are accepted when some or all distributions are empty."""
        cases = [
            # Every distribution is empty; one color per box.
            ([[], [], []], ['blue', 'red', 'yellow']),
            # Every distribution is empty; a single color string.
            ([[], [], []], 'pink'),
            # Only some of the distributions are empty.
            ([[], [1, 2, 3.5], []], ['blue', 'red', 'yellow']),
        ]
        for distributions, colors in cases:
            figure = generate_box_plots(distributions, box_colors=colors)
            axes = figure.get_axes()[0]
            self.assertEqual(len(axes.get_xticklabels()), 3)
Ejemplo n.º 3
0
    def test_generate_box_plots_box_colors(self):
        """Box colors are accepted when some or all distributions are empty."""
        all_empty = [[], [], []]
        some_empty = [[], [1, 2, 3.5], []]
        per_box_colors = ['blue', 'red', 'yellow']

        # Same three calls, in the same order: all empty with one color per
        # box, all empty with a single color string, then partially empty.
        for distributions, colors in ((all_empty, per_box_colors),
                                      (all_empty, 'pink'),
                                      (some_empty, per_box_colors)):
            figure = generate_box_plots(distributions, box_colors=colors)
            first_axes = figure.get_axes()[0]
            self.assertEqual(len(first_axes.get_xticklabels()), 3)
Ejemplo n.º 4
0
 def test_generate_box_plots(self):
     """A typical call yields a figure carrying the requested labels."""
     expected_ticks = [1, 4, 10]
     tick_labels = ["Data 1", "Data 2", "Data 3"]

     # Pass a copy of the tick positions so the expectation checked at the
     # end is independent of anything the call does with its argument.
     figure = generate_box_plots(self.ValidTypicalBoxData,
                                 list(expected_ticks), tick_labels, "Test",
                                 "x-axis label", "y-axis label")
     axes = figure.get_axes()[0]

     self.assertEqual(axes.get_title(), "Test")
     self.assertEqual(axes.get_xlabel(), "x-axis label")
     self.assertEqual(axes.get_ylabel(), "y-axis label")
     self.assertEqual(len(axes.get_xticklabels()), 3)
     self.assertFloatEqual(axes.get_xticks(), expected_ticks)
Ejemplo n.º 5
0
 def test_generate_box_plots(self):
     """A typical call yields a figure carrying the requested labels."""
     title = "Test"
     x_axis_label = "x-axis label"
     y_axis_label = "y-axis label"

     plot = generate_box_plots(self.ValidTypicalBoxData, [1, 4, 10],
                               ["Data 1", "Data 2", "Data 3"], title,
                               x_axis_label, y_axis_label)
     first_axes = plot.get_axes()[0]

     # Title and axis labels come back unchanged.
     self.assertEqual(first_axes.get_title(), title)
     self.assertEqual(first_axes.get_xlabel(), x_axis_label)
     self.assertEqual(first_axes.get_ylabel(), y_axis_label)
     # One tick label per distribution, at the requested positions.
     self.assertEqual(len(first_axes.get_xticklabels()), 3)
     self.assertFloatEqual(first_axes.get_xticks(), [1, 4, 10])
Ejemplo n.º 6
0
def alpha_diversity_by_sample_type(adiv_fs,
                                   mapping_f,
                                   mapping_category='Sample_Type',
                                   min_num_samples=11,
                                   category_values_to_exclude=None):
    """Group alpha diversity values by a mapping category and box-plot them.

    Will exclude 'NA' category value by default if
    category_values_to_exclude is not provided.

    Arguments:
        adiv_fs - iterable of alpha diversity files (iterables of lines).
            Each file is read as tab-separated lines; blank lines are
            ignored, the first remaining line is skipped as a header, and
            every other line must hold exactly two fields: a sample ID and
            an alpha diversity value
        mapping_f - metadata mapping file, parsed with
            parse_mapping_file_to_dict
        mapping_category - mapping column whose value is used to group the
            samples
        min_num_samples - category values with fewer alpha diversity values
            than this are left out of the results
        category_values_to_exclude - category values to leave out of the
            results; defaults to ['NA']

    Samples with no entry in the mapping data are grouped under 'Unknown',
    and the 'Unknown' group is always left out of the results (this also
    applies to a category value that is literally 'Unknown').

    Returns a tuple (plotting_data, plot_fig). plotting_data is a sorted
    list of (median, 'category (n=count)', values) tuples; plot_fig is the
    figure returned by generate_box_plots, resized to 12x12 inches. If
    tight_layout() raises ValueError, a message is printed and the figure is
    returned without the tightened layout.
    """
    if category_values_to_exclude is None:
        category_values_to_exclude = ['NA']

    mapping_dict, _ = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        rows = [line.strip().split('\t') for line in adiv_f if line.strip()]

        # rows[0] is the header line.
        for samp_id, adiv in rows[1:]:
            sample_type = sample_type_map.get(samp_id, 'Unknown')
            # TODO do we need to normalize this? how?
            adiv = float(adiv)
            sample_type_to_adiv[sample_type].append(adiv)

    # Leading with the median makes the plain tuple sort order the boxes by
    # increasing median.
    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v)
                     for k, v in sample_type_to_adiv.items()
                     if k != 'Unknown' and k not in category_values_to_exclude
                     and len(v) >= min_num_samples]
    plotting_data.sort()

    plot_fig = generate_box_plots(
        [dist[2] for dist in plotting_data],
        x_tick_labels=[dist[1] for dist in plotting_data],
        x_label=mapping_category,
        y_label='Alpha Diversity',
        title='Alpha Diversity by %s' % mapping_category)
    plot_fig.set_size_inches(12, 12)
    try:
        plot_fig.tight_layout()
    except ValueError:
        # Parenthesized single-argument form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("tight_layout() failed. Try making the plot figure larger "
              "with Figure.set_size_inches(). The labels will be cut off "
              "otherwise.")
    return plotting_data, plot_fig
Ejemplo n.º 7
0
    def test_generate_box_plots_empty_distributions(self):
        """Empty distributions still produce a fully labeled figure."""
        scenarios = (
            # One of the three distributions is empty.
            [[1, 2, 3], [], [4, 5, 6]],
            # All distributions are empty.
            [[], [], []],
        )
        for distributions in scenarios:
            figure = generate_box_plots(distributions, [1, 4, 10],
                                        ["Data 1", "Data 2", "Data 3"],
                                        "Test", "x-axis label",
                                        "y-axis label")
            axes = figure.get_axes()[0]
            self.assertEqual(axes.get_title(), "Test")
            self.assertEqual(axes.get_xlabel(), "x-axis label")
            self.assertEqual(axes.get_ylabel(), "y-axis label")
            self.assertEqual(len(axes.get_xticklabels()), 3)
            self.assertFloatEqual(axes.get_xticks(), [1, 4, 10])
Ejemplo n.º 8
0
    def test_generate_box_plots_empty_distributions(self):
        """Empty distributions still produce a fully labeled figure."""
        partly_empty = [[1, 2, 3], [], [4, 5, 6]]
        entirely_empty = [[], [], []]

        # The same expectations hold whether one or all of the
        # distributions are empty.
        for box_data in (partly_empty, entirely_empty):
            plot = generate_box_plots(box_data, [1, 4, 10],
                                      ["Data 1", "Data 2", "Data 3"], "Test",
                                      "x-axis label", "y-axis label")
            first_axes = plot.get_axes()[0]
            self.assertEqual(first_axes.get_title(), "Test")
            self.assertEqual(first_axes.get_xlabel(), "x-axis label")
            self.assertEqual(first_axes.get_ylabel(), "y-axis label")
            self.assertEqual(len(first_axes.get_xticklabels()), 3)
            self.assertFloatEqual(first_axes.get_xticks(), [1, 4, 10])
Ejemplo n.º 9
0
def alpha_diversity_by_sample_type(adiv_fs, mapping_f,
                                   mapping_category='Sample_Type',
                                   min_num_samples=11,
                                   category_values_to_exclude=None):
    """Box-plot alpha diversity values grouped by a mapping category.

    Will exclude 'NA' category value by default if
    category_values_to_exclude is not provided.

    Arguments:
        adiv_fs - iterable of alpha diversity files (iterables of lines).
            Blank lines are ignored, the first remaining line of each file
            is skipped as a header, and every other line must split on tabs
            into exactly a sample ID and an alpha diversity value
        mapping_f - metadata mapping file, parsed with
            parse_mapping_file_to_dict
        mapping_category - mapping column whose value groups the samples
        min_num_samples - minimum number of values a category value needs in
            order to be kept
        category_values_to_exclude - category values to drop; ['NA'] when
            not provided

    Sample IDs missing from the mapping data are collected under 'Unknown';
    that group (and so any category value literally named 'Unknown') never
    appears in the results.

    Returns (plotting_data, plot_fig): a sorted list of
    (median, 'category (n=count)', values) tuples and the 12x12 inch figure
    returned by generate_box_plots. A ValueError from tight_layout() is
    reported with a printed message and does not prevent the return.
    """
    if category_values_to_exclude is None:
        category_values_to_exclude = ['NA']

    mapping_dict, _ = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]

    sample_type_to_adiv = defaultdict(list)
    for adiv_f in adiv_fs:
        # Drop blank lines, then the header line.
        adiv_data = [line.strip().split('\t')
                     for line in adiv_f if line.strip()][1:]

        for samp_id, adiv in adiv_data:
            sample_type = sample_type_map.get(samp_id, 'Unknown')
            # TODO do we need to normalize this? how?
            adiv = float(adiv)
            sample_type_to_adiv[sample_type].append(adiv)

    # The median comes first in each tuple so that sorting orders the boxes
    # by increasing median.
    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v) for k, v in
                     sample_type_to_adiv.items()
                     if k != 'Unknown' and k not in
                     category_values_to_exclude and
                     len(v) >= min_num_samples]
    plotting_data.sort()

    plot_fig = generate_box_plots(
        [dist[2] for dist in plotting_data],
        x_tick_labels=[dist[1] for dist in plotting_data],
        x_label=mapping_category, y_label='Alpha Diversity',
        title='Alpha Diversity by %s' % mapping_category)
    plot_fig.set_size_inches(12, 12)
    try:
        plot_fig.tight_layout()
    except ValueError:
        # A parenthesized single string prints identically with the Python 2
        # print statement and the Python 3 print function.
        print("tight_layout() failed. Try making the plot figure larger "
              "with Figure.set_size_inches(). The labels will be cut off "
              "otherwise.")
    return plotting_data, plot_fig
Ejemplo n.º 10
0
def create_plot(raw_data, x_label, y_label, title):
    """Create a 12x12 inch box plot figure from pre-built plotting data.

    Arguments:
        raw_data - sequence of indexable records; index 1 of each record is
            used as the x tick label and index 2 as the distribution to plot
        x_label - x-axis label
        y_label - y-axis label
        title - plot title

    Returns the figure produced by generate_box_plots. If tight_layout()
    raises ValueError, a message is printed and the figure is returned
    without the tightened layout.
    """
    plot = generate_box_plots(
            [e[2] for e in raw_data],
            x_tick_labels=[e[1] for e in raw_data],
            x_label=x_label,
            y_label=y_label,
            title=title)
    plot.set_size_inches(12, 12)
    try:
        plot.tight_layout()
    except ValueError:
        # Parenthesized single-argument form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("tight_layout() failed. Try making the plot figure larger "
              "with Figure.set_size_inches(). The labels will be cut off "
              "otherwise.")
    return plot
Ejemplo n.º 11
0
def _generate_alpha_diversity_boxplots(collated_adiv_dir, map_fp,
                                       split_category, comparison_category,
                                       rarefaction_depth, output_dir):
    """Generates per-body-site self vs. other alpha diversity boxplots.

    Creates a plot for each input collated alpha diversity file (i.e. metric)
    in collated_adiv_dir. Returns a list of plot filenames that were created in
    output_dir.

    Arguments:
        collated_adiv_dir - path to directory containing one or more collated
            alpha diversity files (every '*.txt' file in it is used)
        map_fp - filepath to metadata mapping file
        split_category - category to split on, e.g. body site. A boxplot will
            be created for each category value (e.g. tongue, palm, etc.)
        comparison_category - category to split on within each of the split
            categories (e.g. self, other)
        rarefaction_depth - rarefaction depth to use when pulling data from
            rarefaction files
        output_dir - directory to write output plot images to
    """
    # Close the input files as soon as they have been read instead of
    # leaving the handles open.
    with open(map_fp, 'U') as map_f:
        metadata_map = MetadataMap.parseMetadataMap(map_f)
    collated_adiv_fps = glob(join(collated_adiv_dir, '*.txt'))
    plot_title = 'Alpha diversity (%d seqs/sample)' % rarefaction_depth

    # Generate a plot for each collated alpha diversity metric file.
    created_files = []
    for collated_adiv_fp in collated_adiv_fps:
        # The metric name is the file name without its extension; it is also
        # used to name the output image.
        adiv_metric = splitext(basename(collated_adiv_fp))[0]

        with open(collated_adiv_fp, 'U') as collated_adiv_f:
            x_tick_labels, dists = _collect_alpha_diversity_boxplot_data(
                    collated_adiv_f, metadata_map, rarefaction_depth,
                    split_category, comparison_category)

        plot_figure = generate_box_plots(dists,
                                         x_tick_labels=x_tick_labels,
                                         title=plot_title,
                                         x_label='Grouping',
                                         y_label=format_title(adiv_metric))
        plot_fp = join(output_dir, '%s.png' % adiv_metric)
        plot_figure.savefig(plot_fp)
        created_files.append(basename(plot_fp))

    return created_files
Ejemplo n.º 12
0
def make_distance_boxplots(dm_f,
                           map_f,
                           fields,
                           width=None,
                           height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False,
                           y_min=0.0,
                           y_max=1.0,
                           whisker_length=1.5,
                           box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None,
                           sort=False):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.

    When width is None, the figure width is derived separately for each
    field from that field's number of boxplots and box_width.

    Raises ValueError if no fields are provided, if a field is not in the
    mapping file header, if the figure width or height is not greater than
    zero, or if no boxplot data was collected for a field.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, _ = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Every boxplot other than the "individual within" ones gets box_color,
    # or None when coloring by a field was requested.
    color_by_field = color_individual_within_by_field is not None
    if color_by_field:
        default_color = None
    else:
        default_color = box_color

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        if not suppress_all_within:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                             map_header, map_data, field, within=True))
            plot_labels.append("All within %s" % field)
            plot_colors.append(default_color)

        if not suppress_all_between:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                             map_header, map_data, field, within=False))
            plot_labels.append("All between %s" % field)
            plot_colors.append(default_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header, dm_data,
                                                 map_header, map_data, field,
                                                 within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_by_field:
                colors, color_mapping = _color_field_states(
                        format_mapping_file(map_header, map_data).split('\n'),
                        dm_header, field, field_states,
                        color_individual_within_by_field)
                plot_colors.extend(colors)
                # list() keeps these plain lists on Python 3 as well, where
                # dict.values()/keys() return views.
                legend = (list(color_mapping.values()),
                          list(color_mapping.keys()))
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header, dm_data,
                    map_header, map_data, field, within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                plot_colors.append(default_color)

        assert (len(plot_data) == len(plot_labels) and
                len(plot_labels) == len(plot_colors)), (
                "The number of boxplot labels and colors do not match the "
                "number of boxplots.")

        if not plot_data:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

        # We now have our data and labels ready, so plot them!
        if sort:
            plot_data, plot_labels, plot_colors = \
                    _sort_distributions_by_median(plot_data, plot_labels,
                                                  plot_colors)

        # Derive the width for THIS field only. Assigning the derived value
        # back to `width` would make every later field reuse the first
        # field's width even when it has a different number of boxplots.
        if width is None:
            figure_width = len(plot_data) * box_width + 2
        else:
            figure_width = width
        if figure_width <= 0 or height <= 0:
            raise ValueError("The specified width and height of the plot "
                             "must be greater than zero.")

        plot_figure = generate_box_plots(plot_data,
                x_tick_labels=plot_labels, title="%s Distances" % field,
                x_label="Grouping", y_label="Distance",
                x_tick_labels_orientation='vertical', y_min=y_min,
                y_max=y_max, whisker_length=whisker_length,
                box_width=box_width, box_colors=plot_colors,
                figure_width=figure_width, figure_height=height,
                legend=legend)

        results.append((field, plot_figure, plot_data, plot_labels,
                        plot_colors))

    return results
Ejemplo n.º 13
0
def make_distance_boxplots(dm_f,
                           map_f,
                           fields,
                           width=None,
                           height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False,
                           y_min=0.0,
                           y_max=1.0,
                           whisker_length=1.5,
                           box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None,
                           sort=False):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.

    When width is None, the figure width is derived separately for each
    field from that field's number of boxplots and box_width.

    Raises ValueError if no fields are provided, if a field is not in the
    mapping file header, if the figure width or height is not greater than
    zero, or if no boxplot data was collected for a field.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, _ = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # Every boxplot other than the "individual within" ones gets box_color,
    # or None when coloring by a field was requested.
    color_by_field = color_individual_within_by_field is not None
    if color_by_field:
        default_color = None
    else:
        default_color = box_color

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        if not suppress_all_within:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                             map_header, map_data, field, within=True))
            plot_labels.append("All within %s" % field)
            plot_colors.append(default_color)

        if not suppress_all_between:
            plot_data.append(get_all_grouped_distances(dm_header, dm_data,
                             map_header, map_data, field, within=False))
            plot_labels.append("All between %s" % field)
            plot_colors.append(default_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header, dm_data,
                                                 map_header, map_data, field,
                                                 within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_by_field:
                colors, color_mapping = _color_field_states(
                        format_mapping_file(map_header, map_data).split('\n'),
                        dm_header, field, field_states,
                        color_individual_within_by_field)
                plot_colors.extend(colors)
                # list() keeps these plain lists on Python 3 as well, where
                # dict.values()/keys() return views.
                legend = (list(color_mapping.values()),
                          list(color_mapping.keys()))
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header, dm_data,
                    map_header, map_data, field, within=False)

            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                plot_colors.append(default_color)

        assert (len(plot_data) == len(plot_labels) and
                len(plot_labels) == len(plot_colors)), (
                "The number of boxplot labels and colors do not match the "
                "number of boxplots.")

        if not plot_data:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

        # We now have our data and labels ready, so plot them!
        if sort:
            plot_data, plot_labels, plot_colors = \
                    _sort_distributions_by_median(plot_data, plot_labels,
                                                  plot_colors)

        # Derive the width for THIS field only. Assigning the derived value
        # back to `width` would make every later field reuse the first
        # field's width even when it has a different number of boxplots.
        if width is None:
            figure_width = len(plot_data) * box_width + 2
        else:
            figure_width = width
        if figure_width <= 0 or height <= 0:
            raise ValueError("The specified width and height of the plot "
                             "must be greater than zero.")

        plot_figure = generate_box_plots(plot_data,
                x_tick_labels=plot_labels, title="%s Distances" % field,
                x_label="Grouping", y_label="Distance",
                x_tick_labels_orientation='vertical', y_min=y_min,
                y_max=y_max, whisker_length=whisker_length,
                box_width=box_width, box_colors=plot_colors,
                figure_width=figure_width, figure_height=height,
                legend=legend)

        results.append((field, plot_figure, plot_data, plot_labels,
                        plot_colors))

    return results
Ejemplo n.º 14
0
def _sorted_box_plot_by_category(values_by_category, x_label, y_label, title):
    """Build median-sorted plotting data and the matching box plot figure.

    values_by_category maps a category value to its list of numbers. Returns
    (plotting_data, plot_fig), where plotting_data is a sorted list of
    (median, 'category (n=count)', values) tuples.
    """
    plotting_data = [(median(v), '%s (n=%d)' % (k, len(v)), v)
                     for k, v in values_by_category.items()]
    plotting_data.sort()
    plot_fig = generate_box_plots(
            [e[2] for e in plotting_data],
            x_tick_labels=[e[1] for e in plotting_data],
            x_label=x_label,
            y_label=y_label,
            title=title)
    plot_fig.tight_layout()
    return plotting_data, plot_fig


def generate_new_diversity_plots(otu_table_fs, gg_f, mapping_f,
                                 mapping_category='Sample_Type'):
    """Box-plot, per mapping category value, how novel each sample's OTUs are.

    An OTU is treated as novel when its ID is not among the sequence IDs
    (first whitespace-delimited token of each label) read from gg_f.

    Arguments:
        otu_table_fs - iterable of OTU table files, each parsed with
            parse_biom_table
        gg_f - reference sequences, parsed with MinimalFastaParser
        mapping_f - metadata mapping file, parsed with
            parse_mapping_file_to_dict
        mapping_category - mapping column whose value groups the samples

    Returns a 4-tuple (percent_failures_data, percent_failures_plot,
    num_new_otus_data, num_new_otus_plot). Each *_data item is a sorted list
    of (median, 'category (n=count)', values) tuples and each *_plot item is
    the figure returned by generate_box_plots for that data.

    Raises KeyError if a sample ID from an OTU table has no entry in the
    mapping data.
    """
    mapping_dict, _ = parse_mapping_file_to_dict(mapping_f)
    sample_type_map = {}
    for samp_id in mapping_dict:
        sample_type_map[samp_id] = mapping_dict[samp_id][mapping_category]

    # Built once as a set so it is not re-created for every OTU table.
    gg_otus = set(seq_id.split()[0]
                  for seq_id, _ in MinimalFastaParser(gg_f))

    # Track by sample ID, which allows multiple OTU tables (even with
    # overlapping sample IDs) to be supported.
    success_counts = defaultdict(int)
    failure_counts = defaultdict(int)
    new_otus = defaultdict(list)
    for otu_table_f in otu_table_fs:
        otu_table = parse_biom_table(otu_table_f)
        novel_otus = set(otu_table.ObservationIds) - gg_otus

        for counts, otu_id, md in otu_table.iterObservations():
            if otu_id in novel_otus:
                for samp_id, count in zip(otu_table.SampleIds, counts):
                    failure_counts[samp_id] += count
                    if count > 0:
                        new_otus[samp_id].append(otu_id)
            else:
                for samp_id, count in zip(otu_table.SampleIds, counts):
                    success_counts[samp_id] += count

    percent_failures_result = defaultdict(list)
    num_new_otus_result = defaultdict(list)
    # Set union works on both Python 2 and 3 (keys() + keys() is 2-only).
    for samp_id in set(success_counts) | set(failure_counts):
        samp_type = sample_type_map[samp_id]
        failure_count = failure_counts[samp_id]
        success_count = success_counts[samp_id]
        # float() forces true division; with integer counts on Python 2 the
        # ratio would otherwise be truncated to 0.
        # NOTE(review): a sample whose total count is zero still divides by
        # zero here, as before - confirm whether such samples can occur.
        percent_failures = (float(failure_count) /
                            (success_count + failure_count)) * 100.0
        percent_failures_result[samp_type].append(percent_failures)
        num_new_otus_result[samp_type].append(len(set(new_otus[samp_id])))

    percent_failures_data, percent_failures_plot = \
            _sorted_box_plot_by_category(
                    percent_failures_result,
                    x_label=mapping_category,
                    y_label='% Novel Seqs',
                    title='%% Novel Seqs by %s' % mapping_category)

    num_new_otus_data, num_new_otus_plot = \
            _sorted_box_plot_by_category(
                    num_new_otus_result,
                    x_label=mapping_category,
                    y_label='Number of Novel OTUs',
                    title='Number of Novel OTUs by %s' % mapping_category)

    return percent_failures_data, percent_failures_plot, num_new_otus_data, \
           num_new_otus_plot