def generate_correlation_box_plots(query_pearson_spearman, subject_pearson_spearman, metric, x_label="Method"):
    """ Generate box plots for correlation coefficient

        query_pearson_spearman: pearson and spearman data as returned
         from compute_pearson_spearman for query data
        subject_pearson_spearman: pearson and spearman data as returned
         from compute_pearson_spearman for subject data
        metric: metric to generate plots for (choices are "pearson",
         "spearman"); matched case-insensitively
        x_label: x axis label for the plot (default: "Method")

        Each record is grouped by its element at index 2 (used as the x tick
         label); the plotted value is read from index 4 for "pearson" and
         index 6 for "spearman". The y axis is fixed to [-1.0, 1.0].

        Raises KeyError if metric is not a known choice, and ValueError if
         both inputs are empty. The result of generate_box_plots is not
         returned.
    """
    metric_lookup = {"pearson": (4, "r"), "spearman": (6, "rho")}

    try:
        metric_idx, y_label = metric_lookup[metric.lower()]
    except KeyError:
        # Sort the choices so the message does not depend on dict ordering.
        available_metric_desc = ", ".join(sorted(metric_lookup))
        error_msg = "Unknown metric: %s. Available choices are: %s" % (metric, available_metric_desc)
        # Call-style raise is valid on both Python 2 and Python 3.
        raise KeyError(error_msg)

    # Query records are added before subject records within each method.
    distributions_by_method = defaultdict(list)
    for records in (query_pearson_spearman, subject_pearson_spearman):
        for e in records:
            distributions_by_method[e[2]].append(e[metric_idx])

    if not distributions_by_method:
        # Without this guard the zip(*...) unpacking below fails with an
        # unhelpful "need more than 0 values to unpack" ValueError.
        raise ValueError(
            "No data to plot: query_pearson_spearman and "
            "subject_pearson_spearman are both empty."
        )

    x_tick_labels, distributions = zip(*distributions_by_method.items())
    generate_box_plots(
        distributions, x_tick_labels=x_tick_labels, x_label=x_label, y_label=y_label, y_min=-1.0, y_max=1.0
    )
Ejemplo n.º 2
0
def generate_alpha_diversity_boxplots(rarefaction_lines,
                                      mapping_lines,
                                      category,
                                      depth=None):
    """ Generate box plots of average alpha diversity per category value

        rarefaction_lines: lines passed to parse_rarefaction
        mapping_lines: lines passed to get_category_value_to_sample_ids
        category: category forwarded to get_category_value_to_sample_ids
         and get_per_sample_average_diversities
        depth: forwarded to get_per_sample_average_diversities
         (default: None)

        One box is drawn per category value, labelled "<value> (n=<count>)"
         where count is the number of diversities collapsed into that value.
         Boxes are ordered by sorted category value so the plot layout is
         reproducible.

        Returns the result of generate_box_plots.
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)

    category_value_to_sample_ids = \
     get_category_value_to_sample_ids(mapping_lines,
                                      category)

    per_sample_average_diversities = \
     get_per_sample_average_diversities(rarefaction_data,
                                        category,
                                        depth)

    per_category_value_average_diversities = \
     collapse_sample_diversities_by_category_value(category_value_to_sample_ids,
                                                   per_sample_average_diversities)

    # Iterate the category values in sorted order rather than arbitrary
    # dict order; sorting the keys alone avoids ever comparing the values.
    x_tick_labels = []
    distributions = []
    for cat in sorted(per_category_value_average_diversities):
        avg_diversities = per_category_value_average_diversities[cat]
        x_tick_labels.append("%s (n=%d)" % (cat, len(avg_diversities)))
        distributions.append(avg_diversities)

    return generate_box_plots(distributions, x_tick_labels=x_tick_labels)
def generate_alpha_diversity_boxplots(rarefaction_lines,
                                      mapping_lines,
                                      category,
                                      depth=None):
    """ Generate box plots of average alpha diversity per category value

        rarefaction_lines: lines passed to parse_rarefaction
        mapping_lines: lines passed to get_category_value_to_sample_ids
        category: category forwarded to get_category_value_to_sample_ids
         and get_per_sample_average_diversities
        depth: forwarded to get_per_sample_average_diversities
         (default: None)

        One box is drawn per category value, in sorted order, labelled
         "<value> (n=<count>)" where count is the number of diversities
         collapsed into that value.

        Returns the result of generate_box_plots.
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)

    category_value_to_sample_ids = \
     get_category_value_to_sample_ids(mapping_lines,
                                      category)

    per_sample_average_diversities = \
     get_per_sample_average_diversities(rarefaction_data,
                                        category,
                                        depth)

    per_category_value_average_diversities = \
     collapse_sample_diversities_by_category_value(category_value_to_sample_ids,
                                                   per_sample_average_diversities)

    # sort the data alphabetically; sorted() works whether items() returns
    # a list or a view, unlike calling .sort() on the result of items()
    sorted_per_category_value_average_diversities = \
     sorted(per_category_value_average_diversities.items())

    x_tick_labels = []
    distributions = []
    for cat, avg_diversities in sorted_per_category_value_average_diversities:
        x_tick_labels.append("%s (n=%d)" % (cat, len(avg_diversities)))
        distributions.append(avg_diversities)

    return generate_box_plots(distributions,
                              x_tick_labels=x_tick_labels)
 def test_generate_box_plots(self):
     """generate_box_plots() output carries the requested title, labels and ticks."""
     x_positions = [1, 4, 10]
     tick_names = ["Data 1", "Data 2", "Data 3"]
     figure = generate_box_plots(self.ValidTypicalBoxData, x_positions,
                                 tick_names, "Test", "x-axis label",
                                 "y-axis label")

     # All checks are made against the first axes of the figure.
     first_axes = figure.get_axes()[0]
     self.assertEqual(first_axes.get_title(), "Test")
     self.assertEqual(first_axes.get_xlabel(), "x-axis label")
     self.assertEqual(first_axes.get_ylabel(), "y-axis label")

     tick_label_count = len(first_axes.get_xticklabels())
     self.assertEqual(tick_label_count, 3)
     self.assertFloatEqual(first_axes.get_xticks(), [1, 4, 10])
 def test_generate_box_plots(self):
     """Verify title, axis labels and x ticks of the generate_box_plots() figure."""
     figure = generate_box_plots(
         self.ValidTypicalBoxData, [1, 4, 10],
         ["Data 1", "Data 2", "Data 3"], "Test", "x-axis label",
         "y-axis label")
     axes = figure.get_axes()[0]

     # Text properties are checked in order: title, x label, y label.
     text_checks = (
         (axes.get_title, "Test"),
         (axes.get_xlabel, "x-axis label"),
         (axes.get_ylabel, "y-axis label"),
     )
     for read_text, expected_text in text_checks:
         self.assertEqual(read_text(), expected_text)

     self.assertEqual(len(axes.get_xticklabels()), 3)
     self.assertFloatEqual(axes.get_xticks(), [1, 4, 10])
def generate_prf_box_plots(query_prf, subject_prf, metric, x_label="Method"):
    """ Generate box plots for precision, recall, or f-measure

        query_prf: precision, recall, and f-measure values as returned
         from compute_prfs for query data
        subject_prf: precision, recall, and f-measure values as returned
         from compute_prfs for subject data
        metric: metric to generate plots for (choices are "precision",
         "recall", and "f-measure", or the abbreviations "p", "r", "f");
         matched case-insensitively
        x_label: x axis label for the plot (default: "Method")

        Each record is grouped by its element at index 2 (used as the x tick
         label); the plotted value is read from index 4 (precision), 5
         (recall) or 6 (f-measure). The y axis is fixed to [0.0, 1.0].

        Raises KeyError if metric is not a known choice, and ValueError if
         both inputs are empty. The result of generate_box_plots is not
         returned.
    """
    metric_lookup = {
        "precision": (4, "Precision"),
        "p": (4, "Precision"),
        "recall": (5, "Recall"),
        "r": (5, "Recall"),
        "f-measure": (6, "F-measure"),
        "f": (6, "F-measure"),
    }
    try:
        metric_idx, y_label = metric_lookup[metric.lower()]
    except KeyError:
        # Sort the choices so the message does not depend on dict ordering.
        available_metric_desc = ", ".join(sorted(metric_lookup))
        error_msg = "Unknown metric: %s. Available choices are: %s" % (metric, available_metric_desc)
        # Call-style raise is valid on both Python 2 and Python 3.
        raise KeyError(error_msg)

    # Subject records are added before query records within each method.
    distributions_by_method = defaultdict(list)
    for records in (subject_prf, query_prf):
        for e in records:
            distributions_by_method[e[2]].append(e[metric_idx])

    if not distributions_by_method:
        # Without this guard the zip(*...) unpacking below fails with an
        # unhelpful "need more than 0 values to unpack" ValueError.
        raise ValueError("No data to plot: query_prf and subject_prf are both empty.")

    x_tick_labels, distributions = zip(*distributions_by_method.items())
    generate_box_plots(
        distributions, x_tick_labels=x_tick_labels, x_label=x_label, y_label=y_label, y_min=0.0, y_max=1.0
    )
Ejemplo n.º 7
0
def main():
    """Generate distance box plots for each requested mapping-file field.

    Options come from parse_command_line_parameters(**script_info). For
    every field given with -f the function collects the within/between
    distance groupings that were not suppressed, optionally orders them by
    increasing median, and saves "<field>_Distances.<imagetype>" in the
    output directory. Unless suppressed, the output of all_pairs_t_test is
    written to "<field>_Stats.xls"; with opts.save_raw_data the plotted
    distributions are written tab-delimited to "<field>_Distances.xls".

    Invalid options or unreadable inputs are reported through
    option_parser.error() (presumably terminating the script, as
    optparse's error() does -- confirm against the parser in use).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        # 'except Exception' rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not reported as a bad -o value.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file. The files are opened in
    # 'with' blocks so the handles are closed even when parsing fails.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The missing-option check has to happen before the value is split;
    # otherwise a missing -f fails with an AttributeError on None.
    if opts.fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")

    # Split on commas, trim whitespace, then drop surrounding quotes.
    fields = [strip(field).strip('"').strip("'")
              for field in opts.fields.split(',')]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number. 'auto' is passed on as None.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header,
                                          dist_matrix,
                                          mapping_header,
                                          mapping,
                                          field,
                                          within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix,
                                                 mapping_header,
                                                 mapping,
                                                 field,
                                                 within=True)
            # Each grouping supplies two names (indices 0 and 1) and the
            # distances between them (index 2).
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix,
                                                  mapping_header,
                                                  mapping,
                                                  field,
                                                  within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
                "of boxplot labels does not match the number of " +\
                "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median. The sort
                # is stable, so equal medians keep their original order.
                sorted_data = sorted(
                    [(label, distribution, median(distribution))
                     for label, distribution in zip(plot_labels, plot_data)],
                    key=itemgetter(2))
                plot_labels = [entry[0] for entry in sorted_data]
                plot_data = [entry[1] for entry in sorted_data]

            width = opts.width
            height = opts.height
            if width is None:
                # Scale the default width with the number of boxes.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation='vertical',
                y_min=y_min,
                y_max=y_max,
                whisker_length=opts.whisker_length,
                box_width=opts.box_width,
                box_color=opts.box_color,
                figure_width=width,
                figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp,
                                format=opts.imagetype,
                                transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            # The stats file is created before the tests run, as before; the
            # 'with' block closes it even if all_pairs_t_test raises.
            sig_tests_fp = join(opts.output_dir, "%s_Stats.xls" % field)
            with open(sig_tests_fp, 'w') as sig_tests_f:
                sig_tests_results = all_pairs_t_test(
                    plot_labels,
                    plot_data,
                    tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations)
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    # Spaces in labels become underscores in the first column.
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
Ejemplo n.º 8
0
def main():
    """Generate distance box plots for each requested mapping-file field.

    Options come from parse_command_line_parameters(**script_info). For
    every field given with -f the function collects the within/between
    distance groupings that were not suppressed, optionally orders them by
    increasing median, and saves "<field>_Distances.<imagetype>" in the
    output directory. Unless suppressed, the output of all_pairs_t_test is
    written to "<field>_Stats.xls"; with opts.save_raw_data the plotted
    distributions are written tab-delimited to "<field>_Distances.xls".

    Invalid options or unreadable inputs are reported through
    option_parser.error() (presumably terminating the script, as
    optparse's error() does -- confirm against the parser in use).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        # 'except Exception' rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not reported as a bad -o value.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file. The files are opened in
    # 'with' blocks so the handles are closed even when parsing fails.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error("This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error("This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The missing-option check has to happen before the value is split;
    # otherwise a missing -f fails with an AttributeError on None.
    fields = opts.fields
    if fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")

    # Split on commas, trim whitespace, then drop surrounding quotes.
    fields = [strip(field) for field in fields.split(',')]
    fields = [field.strip('"').strip("'") for field in fields]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error("The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file."
                % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number. 'auto' is passed on as None.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(get_all_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(get_all_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=True)
            # Each grouping supplies two names (indices 0 and 1) and the
            # distances between them (index 2).
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
                "of boxplot labels does not match the number of " +\
                "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median. list.sort
                # is stable, so equal medians keep their original order.
                sorted_data = []
                for label, distribution in zip(plot_labels, plot_data):
                    sorted_data.append((label, distribution,
                        median(distribution)))
                sorted_data.sort(key=itemgetter(2))
                plot_labels = [entry[0] for entry in sorted_data]
                plot_data = [entry[1] for entry in sorted_data]

            width = opts.width
            height = opts.height
            if width is None:
                # Scale the default width with the number of boxes.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(plot_data,
                    x_tick_labels=plot_labels, title="%s Distances" % field,
                    x_label="Grouping", y_label="Distance",
                    x_tick_labels_orientation='vertical', y_min=y_min,
                    y_max=y_max, whisker_length=opts.whisker_length,
                    box_width=opts.box_width, box_color=opts.box_color,
                    figure_width=width, figure_height=height)

            output_plot_fp = join(opts.output_dir, "%s_Distances.%s"
                                       % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                    transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            # The stats file is created before the tests run, as before; the
            # 'with' block closes it even if all_pairs_t_test raises.
            with open(join(
                    opts.output_dir, "%s_Stats.xls" % field), 'w') as sig_tests_f:
                sig_tests_results = all_pairs_t_test(plot_labels, plot_data,
                        tail_type=opts.tail_type,
                        num_permutations=opts.num_permutations)
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls"
                                    % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    # Spaces in labels become underscores in the first column.
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")