def generate_correlation_box_plots(query_pearson_spearman, subject_pearson_spearman, metric, x_label="Method"):
    """Generate box plots for a correlation coefficient.

    query_pearson_spearman: pearson and spearman data as returned from
        compute_pearson_spearman for query data
    subject_pearson_spearman: pearson and spearman data as returned from
        compute_pearson_spearman for subject data
    metric: metric to generate plots for (choices are "pearson",
        "spearman"); matched case-insensitively
    x_label: x axis label for the plot (default: "Method")

    Raises KeyError if metric is not a known choice, and ValueError if
    both inputs are empty (there would be nothing to plot).
    """
    # metric name -> (index of the value within each record, y axis label)
    metric_lookup = {"pearson": (4, "r"), "spearman": (6, "rho")}
    try:
        metric_idx, y_label = metric_lookup[metric.lower()]
    except KeyError:
        # Sorted so the message does not depend on dict ordering.
        available_metric_desc = ", ".join(sorted(metric_lookup))
        error_msg = "Unknown metric: %s. Available choices are: %s" % (metric, available_metric_desc)
        raise KeyError(error_msg)

    # Group the selected metric by record field 2 (the box plot category),
    # query records first, then subject records.
    distributions_by_method = defaultdict(list)
    for e in query_pearson_spearman:
        distributions_by_method[e[2]].append(e[metric_idx])
    for e in subject_pearson_spearman:
        distributions_by_method[e[2]].append(e[metric_idx])

    if not distributions_by_method:
        # zip(*[]) below would otherwise fail with an opaque unpacking error.
        raise ValueError("No data to plot: both query and subject results are empty.")

    x_tick_labels, distributions = zip(*distributions_by_method.items())

    generate_box_plots(
        distributions,
        x_tick_labels=x_tick_labels,
        x_label=x_label,
        y_label=y_label,
        y_min=-1.0,
        y_max=1.0
    )
def generate_alpha_diversity_boxplots(rarefaction_lines, mapping_lines, category, depth=None):
    """Return box plots of per-category-value average diversities.

    rarefaction_lines: lines handed to parse_rarefaction
    mapping_lines: lines handed to get_category_value_to_sample_ids
    category: mapping category whose values define the boxes
    depth: forwarded to get_per_sample_average_diversities (default: None)

    Each box is labelled "<category value> (n=<number of values>)". Boxes
    appear in the iteration order of the collapsed mapping; no sorting is
    applied here. The return value is whatever generate_box_plots returns.
    """
    parsed_rarefaction = parse_rarefaction(rarefaction_lines)
    sample_ids_by_value = get_category_value_to_sample_ids(mapping_lines, category)
    sample_averages = get_per_sample_average_diversities(parsed_rarefaction, category, depth)
    diversities_by_value = collapse_sample_diversities_by_category_value(sample_ids_by_value, sample_averages)

    # Snapshot the pairs once so labels and distributions stay aligned.
    pairs = list(diversities_by_value.items())
    labels = ["%s (n=%d)" % (value, len(diversities)) for value, diversities in pairs]
    boxes = [diversities for _, diversities in pairs]

    return generate_box_plots(boxes, x_tick_labels=labels)
def generate_alpha_diversity_boxplots(rarefaction_lines, mapping_lines, category, depth=None):
    """Return box plots of per-category-value average diversities, sorted.

    rarefaction_lines: lines handed to parse_rarefaction
    mapping_lines: lines handed to get_category_value_to_sample_ids
    category: mapping category whose values define the boxes
    depth: forwarded to get_per_sample_average_diversities (default: None)

    Boxes are ordered by category value and labelled
    "<category value> (n=<number of values>)". The return value is
    whatever generate_box_plots returns.
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)

    category_value_to_sample_ids = \
        get_category_value_to_sample_ids(mapping_lines, category)

    per_sample_average_diversities = \
        get_per_sample_average_diversities(rarefaction_data, category, depth)

    per_category_value_average_diversities = \
        collapse_sample_diversities_by_category_value(category_value_to_sample_ids,
                                                      per_sample_average_diversities)

    # Sort the data by category value. sorted() works whether items()
    # returns a list or a view; calling .sort() on the result of items()
    # only works when it is a list.
    sorted_per_category_value_average_diversities = \
        sorted(per_category_value_average_diversities.items())

    x_tick_labels = []
    distributions = []
    for cat, avg_diversities in sorted_per_category_value_average_diversities:
        x_tick_labels.append("%s (n=%d)" % (cat, len(avg_diversities)))
        distributions.append(avg_diversities)

    return generate_box_plots(distributions, x_tick_labels=x_tick_labels)
def test_generate_box_plots(self):
    """generate_box_plots() should return a valid Figure object.

    Checks the title, axis labels, number of x tick labels and x tick
    positions on the first axes of the returned figure.
    """
    figure = generate_box_plots(
        self.ValidTypicalBoxData,
        [1, 4, 10],
        ["Data 1", "Data 2", "Data 3"],
        "Test",
        "x-axis label",
        "y-axis label",
    )
    axes = figure.get_axes()[0]

    # Text properties, checked in the same order as before.
    text_checks = (
        (axes.get_title, "Test"),
        (axes.get_xlabel, "x-axis label"),
        (axes.get_ylabel, "y-axis label"),
    )
    for getter, expected in text_checks:
        self.assertEqual(getter(), expected)

    tick_labels = axes.get_xticklabels()
    self.assertEqual(len(tick_labels), 3)
    self.assertFloatEqual(axes.get_xticks(), [1, 4, 10])
def generate_prf_box_plots(query_prf, subject_prf, metric, x_label="Method"):
    """Generate box plots for precision, recall, or f-measure.

    query_prf: precision, recall, and f-measure values as returned
        from compute_prfs for query data
    subject_prf: precision, recall, and f-measure values as returned
        from compute_prfs for subject data
    metric: metric to generate plots for (choices are "precision",
        "recall", and "f-measure", or their abbreviations "p", "r", "f");
        matched case-insensitively
    x_label: x axis label for the plot (default: "Method")

    Raises KeyError if metric is not a known choice, and ValueError if
    both inputs are empty (there would be nothing to plot).
    """
    # metric name -> (index of the value within each record, y axis label)
    metric_lookup = {
        "precision": (4, "Precision"),
        "p": (4, "Precision"),
        "recall": (5, "Recall"),
        "r": (5, "Recall"),
        "f-measure": (6, "F-measure"),
        "f": (6, "F-measure"),
    }
    try:
        metric_idx, y_label = metric_lookup[metric.lower()]
    except KeyError:
        # Sorted so the message does not depend on dict ordering.
        available_metric_desc = ", ".join(sorted(metric_lookup))
        error_msg = "Unknown metric: %s. Available choices are: %s" % (metric, available_metric_desc)
        raise KeyError(error_msg)

    # Group the selected metric by record field 2 (the box plot category),
    # subject records first, then query records.
    distributions_by_method = defaultdict(list)
    for e in subject_prf:
        distributions_by_method[e[2]].append(e[metric_idx])
    for e in query_prf:
        distributions_by_method[e[2]].append(e[metric_idx])

    if not distributions_by_method:
        # zip(*[]) below would otherwise fail with an opaque unpacking error.
        raise ValueError("No data to plot: both query and subject results are empty.")

    x_tick_labels, distributions = zip(*distributions_by_method.items())

    generate_box_plots(
        distributions,
        x_tick_labels=x_tick_labels,
        x_label=x_label,
        y_label=y_label,
        y_min=0.0,
        y_max=1.0
    )
def main():
    """Script entry point: write distance boxplots for each requested field.

    Parses the command line via parse_command_line_parameters, loads the
    distance matrix and mapping file, and for every field given with -f
    writes "<field>_Distances.<imagetype>" into the output directory.
    Unless suppressed, "<field>_Stats.xls" (all_pairs_t_test results) is
    also written, and "<field>_Distances.xls" (raw plot data) when
    opts.save_raw_data is set. Invalid input is reported through
    option_parser.error().
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file. The files are opened in
    # 'with' blocks so the handles are closed even when parsing fails.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The missing-option check has to happen before the value is split;
    # otherwise a missing -f fails on .split() and never reaches the check.
    if opts.fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")
    # Strip surrounding whitespace, then any surrounding quote characters.
    fields = [field.strip().strip('"').strip("'")
              for field in opts.fields.split(',')]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=True))
            plot_labels.append("All within %s" % field)

        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=False))
            plot_labels.append("All between %s" % field)

        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix, mapping_header,
                                                 mapping, field, within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix, mapping_header,
                                                  mapping, field, within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
            "of boxplot labels does not match the number of " +\
            "boxplots."

        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median. sorted()
                # is stable, so ties keep their original relative order.
                sorted_data = sorted(zip(plot_labels, plot_data),
                                     key=lambda pair: median(pair[1]))
                plot_labels = [label for label, _ in sorted_data]
                plot_data = [distribution for _, distribution in sorted_data]

            width = opts.width
            height = opts.height
            if width is None:
                # Derive the figure width from the number of boxes.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data, x_tick_labels=plot_labels,
                title="%s Distances" % field, x_label="Grouping",
                y_label="Distance", x_tick_labels_orientation='vertical',
                y_min=y_min, y_max=y_max, whisker_length=opts.whisker_length,
                box_width=opts.box_width, box_color=opts.box_color,
                figure_width=width, figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                                transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            sig_tests_fp = join(opts.output_dir, "%s_Stats.xls" % field)
            with open(sig_tests_fp, 'w') as sig_tests_f:
                sig_tests_results = all_pairs_t_test(
                    plot_labels, plot_data, tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations)
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
def main():
    """Script entry point: write distance boxplots for each requested field.

    Parses the command line via parse_command_line_parameters, loads the
    distance matrix and mapping file, and for every field given with -f
    writes "<field>_Distances.<imagetype>" into the output directory.
    Unless suppressed, "<field>_Stats.xls" (all_pairs_t_test results) is
    also written, and "<field>_Distances.xls" (raw plot data) when
    opts.save_raw_data is set. Invalid input is reported through
    option_parser.error().
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
            "specified with the -o option.")

    # Parse the distance matrix and mapping file. Using 'with' guarantees
    # the input handles are released whether or not parsing succeeds.
    try:
        with open(opts.distance_matrix_fp, 'U') as distmat_f:
            dist_matrix_header, dist_matrix = parse_distmat(distmat_f)
    except Exception:
        option_parser.error("This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
    except QiimeParseError:
        option_parser.error("This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Reject a missing -f before touching the value: splitting first would
    # raise on None and make this check unreachable.
    if opts.fields is None:
        option_parser.error("You must provide at least one field using the -f "
            "option.")
    fields = []
    for raw_field in opts.fields.split(','):
        # Drop surrounding whitespace, then surrounding quote characters.
        fields.append(raw_field.strip().strip('"').strip("'"))

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error("The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file."
                % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(get_all_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(get_all_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                    dist_matrix, mapping_header, mapping, field, within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
                "of boxplot labels does not match the number of " +\
                "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median.
                sorted_data = []
                for label, distribution in zip(plot_labels, plot_data):
                    sorted_data.append((label, distribution,
                        median(distribution)))
                sorted_data.sort(key=itemgetter(2))
                plot_labels = [entry[0] for entry in sorted_data]
                plot_data = [entry[1] for entry in sorted_data]

            width = opts.width
            height = opts.height
            if width is None:
                # Derive the figure width from the number of boxes.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(plot_data,
                    x_tick_labels=plot_labels, title="%s Distances" % field,
                    x_label="Grouping", y_label="Distance",
                    x_tick_labels_orientation='vertical', y_min=y_min,
                    y_max=y_max, whisker_length=opts.whisker_length,
                    box_width=opts.box_width, box_color=opts.box_color,
                    figure_width=width, figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                    transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            # 'with' closes the stats file even if the tests raise.
            with open(join(opts.output_dir, "%s_Stats.xls" % field),
                      'w') as sig_tests_f:
                sig_tests_results = all_pairs_t_test(plot_labels, plot_data,
                        tail_type=opts.tail_type,
                        num_permutations=opts.num_permutations)
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")