def test_color_mapping(self):
    """
    Verify the group-to-color mapping that is built from the mapping file
    for use in visualizations.

    Two cases are checked: a color column is named explicitly ("Color"),
    and no color column is passed at all.

    :return: Returns OK if test goals were achieved, otherwise raises error.
    """
    # Case 1: colors are taken from the named color column.
    expected_from_column = {"Control": "#008000", "Fast": "#0000CC"}
    observed_from_column = ut.color_mapping(self.map_data, self.map_header,
                                            "Treatment", "Color")
    self.assertEqual(
        expected_from_column, observed_from_column,
        msg="Color-group mapping not computed accurately. "
            "Please check category and color columns.")

    # Case 2: no color column is handed to color_mapping.
    expected_default = {"Control": "#8DD3C7", "Fast": "#FFFFB3"}
    observed_default = ut.color_mapping(self.map_data, self.map_header,
                                        "Treatment")
    self.assertEqual(
        expected_default, observed_default,
        msg="With no color column given, the color-group mapping not "
            "computed accurately.")
def test_color_mapping(self):
    """
    Exercise ut.color_mapping with and without an explicit color column and
    compare the returned group-to-color dictionary with the expected one.

    :return: Returns OK if test goals were achieved, otherwise raises error.
    """
    # Each case: (trailing arguments, expected mapping, failure message).
    cases = (
        (("Treatment", "Color"),
         {"Control": "#008000", "Fast": "#0000CC"},
         "Color-group mapping not computed accurately. Please check "
         "category and color columns."),
        (("Treatment",),
         {"Control": "#8DD3C7", "Fast": "#FFFFB3"},
         "With no color column given, the color-group mapping not "
         "computed accurately."),
    )
    for column_args, expected, failure_msg in cases:
        observed = ut.color_mapping(self.map_data, self.map_header,
                                    *column_args)
        self.assertEqual(expected, observed, msg=failure_msg)
def main():
    """
    Plot the overlap between the core OTU sets of the sample groups.

    Groups and their colors are derived from the mapping file. Each group is
    paired positionally with one core file (regular core files, or the TSV
    core files when no regular ones were given), and the combined set of
    OTUs is handed to plot_overlaps().

    Exits through sys.exit() with a message when the mapping file cannot be
    read or when the grouping category is not a column of the mapping file.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # An unknown category previously escaped as an uncaught ValueError from
    # header.index(); report it as an ordinary usage error instead.
    if args.group_by not in header:
        msg = "\nError: Specified mapping category '{}' not found.\n"
        sys.exit(msg.format(args.group_by))

    # map groups to colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Fall back to the TSV core files when no regular core files were given.
    tsv = args.core_files is None
    core_files = args.tsv_core_files if tsv else args.core_files

    # map each core file to its matching category in the mapping file
    group_cores = OrderedDict()
    for group, fp in zip(class_colors, core_files):
        if tsv:
            group_cores[group] = load_tsv_core(fp, args.skipheader)
        else:
            core = load_core_file(fp)
            # Drop unclassified entries and show names with spaces.
            group_cores[group] = [name.replace("_", " ")
                                  for name in core.values()
                                  if not name.startswith("Unclassified")]

    # create the overlap set of OTUs and plot
    overlap = set().union(*group_cores.values())
    plot_overlaps(overlap, group_cores, class_colors, out_fp=args.out_fp,
                  fig_size=args.figsize, title=args.title,
                  filter_common=args.filtercommon)
def main():
    """
    Run t-SNE on a precomputed distance matrix and show a 2-D scatter plot
    of the first two embedded dimensions, colored by sample group.

    Exits through sys.exit() with a message when the distance matrix file or
    the mapping file cannot be read.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))

    # mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))

    # Look the category column up once instead of once per sample.
    category_idx = header.index(args.group_by)
    y = [map_data[sid][category_idx] for sid in dm_data.index]
    cond_colors = util.color_mapping(map_data, header, args.group_by,
                                     args.color_by)

    # Prep input data for t-SNE. Every column is wanted, so take the values
    # directly; selecting with range(n) looks up the integer labels 0..n-1,
    # which are not the sample-ID column labels read from the file.
    X = dm_data.values
    X_tsne = TSNE(n_components=3, metric="precomputed").fit_transform(X)

    # Plot t-SNE result (only the first two embedded dimensions are drawn)
    plt.figure(figsize=(14, 8))
    for cond, sid, xy in zip(y, dm_data.index, X_tsne):
        plt.scatter(x=xy[0], y=xy[1], s=150, c=cond_colors[cond], alpha=0.85,
                    edgecolors="k")
        if args.annotate:
            # The label is passed positionally: the keyword for it was
            # renamed between matplotlib releases.
            plt.annotate(sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                         textcoords="offset points", ha="center",
                         va="center", alpha=1, style="italic")
    if args.plot_title is not None:
        plt.title(args.plot_title, fontsize=16, weight="bold")

    # Empty scatter artists act as legend handles, one per group.
    legend_handles = [plt.scatter([], [], c=cond_colors[cond], s=150,
                                  edgecolors="k")
                      for cond in cond_colors]
    plt.legend(legend_handles, ["{}".format(cond) for cond in cond_colors],
               loc="best", scatterpoints=3, frameon=True, framealpha=1,
               fontsize=14)
    plt.xlabel("t-SNE 1", fontsize=16)
    plt.ylabel("t-SNE 2", fontsize=16)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.grid()
    plt.show()
def main():
    """
    Plot the overlap between the core OTU sets of the sample groups.

    Groups and their colors are derived from the mapping file. Each group is
    paired positionally with one core file (regular core files, or the TSV
    core files when no regular ones were given), and the combined set of
    OTUs is handed to plot_overlaps().

    Exits through sys.exit() with a message when the mapping file cannot be
    read or when the grouping category is not a column of the mapping file.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # An unknown category previously escaped as an uncaught ValueError from
    # header.index(); report it as an ordinary usage error instead.
    if args.group_by not in header:
        msg = "\nError: Specified mapping category '{}' not found.\n"
        sys.exit(msg.format(args.group_by))

    # map groups to colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Fall back to the TSV core files when no regular core files were given.
    tsv = args.core_files is None
    core_files = args.tsv_core_files if tsv else args.core_files

    # map each core file to its matching category in the mapping file
    group_cores = OrderedDict()
    for group, fp in zip(class_colors, core_files):
        if tsv:
            group_cores[group] = load_tsv_core(fp, args.skipheader)
        else:
            core = load_core_file(fp)
            # Drop unclassified entries and show names with spaces.
            group_cores[group] = [name.replace("_", " ")
                                  for name in core.values()
                                  if not name.startswith("Unclassified")]

    # create the overlap set of OTUs and plot
    overlap = set().union(*group_cores.values())
    plot_overlaps(overlap, group_cores, class_colors, out_fp=args.out_fp,
                  fig_size=args.figsize, title=args.title,
                  filter_common=args.filtercommon)
def main():
    """
    Draw one PCoA bubble chart per OTU ID, sizing each sample's point by the
    relative abundance of that OTU in the sample (times args.scale_by).

    Exits through sys.exit() with a message when an input file cannot be
    opened, when the output directory cannot be created, or when the
    grouping category is not a column of the mapping file.
    """
    import errno  # local import: only needed to classify mkdir failures

    args = handle_program_options()

    # Fail early, with a specific message, for each unreadable input file.
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            # Inspect the error that was actually raised. The former check,
            # `os.errno == 2`, compared a module object with an integer and
            # was never true, so this message could not be reached.
            if oe.errno == errno.ENOENT:
                msg = ("One or more directories in the path provided for " +
                       "--output-dir ({}) do not exist. If you are specifying " +
                       "a new directory for output, please ensure all other " +
                       "directories in the path currently exist.")
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ("An error occurred trying to create the output " +
                       "directory ({}) with message: {}")
                sys.exit(msg.format(args.output_dir, oe.strerror))

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file
    otus = set()
    with open(args.otu_ids_fp, "rU") as nciF:
        for line in nciF:
            line = line.strip()
            if line:  # a blank line is not an OTU ID
                otus.add(line)

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = get_relative_abundance(biomtbl)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Produce one PCoA bubble chart per OTU ID listed in the OTU IDs file.

    Point sizes are the arcsine-square-root transformed relative abundance
    of the OTU in each sample multiplied by args.scale_by. Exits through
    sys.exit() when an input file cannot be opened or the grouping category
    is missing from the mapping file header.
    """
    args = handle_program_options()

    # Each input must be openable; the first failure ends the program.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for input_fp, err_template in required_inputs:
        try:
            with open(input_fp):
                pass
        except IOError as ioe:
            sys.exit(err_template.format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # BIOM table and unifrac principal coordinates
    biomtbl = biom.load_table(args.otu_table)
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # One OTU ID per line, surrounding whitespace removed
    with open(args.otu_ids_fp, "rU") as otu_ids_f:
        otus = {raw_line.strip() for raw_line in otu_ids_f.readlines()}

    # Categories and colors from the mapping file
    header, imap = util.parse_map_file(args.mapping)
    if args.group_by not in header:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_idx = header.index(args.group_by)
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)

    rel_abd = bc.arcsine_sqrt_transform(bc.relative_abundance(biomtbl))

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        taxonomy = biomtbl.metadata(otuid, axis="observation")["taxonomy"]
        otuname = oc.otu_name(taxonomy)

        cat_data = {}
        for cat in category_ids:
            cat_data[cat] = {"pc1": [], "pc2": [], "size": []}

        for sid in unifrac["pcd"]:
            bucket = cat_data[imap[sid][category_idx]]
            try:
                bubble_size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            bucket["pc1"].append(float(unifrac["pcd"][sid][0]))
            bucket["pc2"].append(float(unifrac["pcd"][sid][1]))
            bucket["size"].append(bubble_size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Run LDA on either a distance matrix or on transformed relative
    abundances from a BIOM table, then plot the result in 2-D or 3-D.

    A "Condition" column holding each sample's group is inserted as the
    first column of the LDA input. Exits through sys.exit() with a message
    when the mapping file, distance matrix or BIOM file cannot be read.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Obtain group colors. A plain `if` replaces the former
    # assert/AssertionError pair: asserts are removed under `python -O`,
    # which silently disabled the default-palette branch.
    if args.colors is None:
        categories = {entry[category_idx] for entry in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        # next() works on Python 2 and 3, unlike the .next() method.
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Index labels are converted to str before the mapping lookup.
        sample_labels = [str(sid) for sid in lda_input.index]
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        lda_input = pd.DataFrame(rel_abd).T
        sample_labels = lda_input.index

    # Steps shared by both kinds of input.
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in sample_labels])
    sampleids = sample_labels if args.annotate_points else None
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Plot LDA; the 3-D plot takes two extra keyword arguments.
    plot_kwargs = dict(style=args.ggplot2_style, fig_size=args.figsize,
                       label_pad=args.label_padding,
                       font_size=args.font_size, sids=sampleids,
                       pt_size=args.point_size, out_fp=args.out_fp)
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)
def main():
    """
    Plot principal coordinates (2-D or 3-D) with samples colored by group.

    Groups come from util.gather_categories() over the comma-separated
    categories in args.group_by. The figure is saved to args.out_fp when
    given, otherwise shown. Exits through sys.exit() with a message when
    the coordinates file or the mapping file cannot be opened.
    """
    args = handle_program_options()

    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # The coordinates file is parsed by util.parse_unifrac() below; the raw
    # line-by-line read that used to happen here was never used.
    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict((condition, {"pc1": [], "pc2": [], "pc3": []})
                             for condition in data_gather)

    if not args.colors:
        bcolors = itertools.cycle(Set3_12.hex_colors)
        # next() works on Python 2 and 3, unlike the .next() method.
        colors = [next(bcolors) for _ in categories]
    else:
        color_map = util.color_mapping(imap, map_header, args.group_by,
                                       args.colors)
        # A list is needed because colors are indexed by position below.
        # NOTE(review): this pairs colors with groups by position only;
        # confirm color_mapping's order matches gather_categories' order.
        colors = list(color_map.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Work on a copy so that args.pc_order is not modified.
    pco = list(args.pc_order)
    if args.dimensions == 3:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        else:
            # The sample belongs to no gathered group. Previously it was
            # filed under the preceding sample's group, or raised NameError
            # when it was the first sample; now it is left out.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"

    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # float() matches the handling of the x and y axis labels below.
        ax.set_zlabel(axis_str.format(3, float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")

            # Annotate PCoA sample points with their sample IDs.
            if args.annotate_points:
                for x, y in zip(categories[cat]["pc1"],
                                categories[cat]["pc2"]):
                    ax.annotate(
                        x[0], xy=(x[1], y[1]),
                        xytext=(-10, -15),
                        textcoords="offset points",
                        ha="center", va="center",
                    )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none",
                    bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
def main():
    """
    Run LDA on either a distance matrix or on transformed relative
    abundances from a BIOM table, then plot the result in 2-D or 3-D.

    A "Condition" column holding each sample's group is inserted as the
    first column of the LDA input. Exits through sys.exit() with a message
    when the mapping file, distance matrix or BIOM file cannot be read.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Obtain group colors. A plain `if` replaces the former
    # assert/AssertionError pair: asserts are removed under `python -O`,
    # which silently disabled the default-palette branch.
    if args.colors is None:
        categories = {entry[category_idx] for entry in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        # next() works on Python 2 and 3, unlike the .next() method.
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Index labels are converted to str before the mapping lookup.
        sample_labels = [str(sid) for sid in lda_input.index]
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        lda_input = pd.DataFrame(rel_abd).T
        sample_labels = lda_input.index

    # Steps shared by both kinds of input.
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in sample_labels])
    sampleids = sample_labels if args.annotate_points else None
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Plot LDA; the 3-D plot takes two extra keyword arguments.
    plot_kwargs = dict(style=args.ggplot2_style, fig_size=args.figsize,
                       label_pad=args.label_padding,
                       font_size=args.font_size, sids=sampleids,
                       pt_size=args.point_size, out_fp=args.out_fp)
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)
def main():
    """
    Run exact t-SNE on a precomputed distance matrix, rescale the embedding
    to the unit range per dimension and plot the first two dimensions,
    colored by sample group.

    The figure is saved to args.out_fp when given, otherwise shown. Exits
    through sys.exit() with a message when the distance matrix file or the
    mapping file cannot be read.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        dm_data_sids = dm_data.index
        # Every column is wanted, so take the values directly; selecting
        # with range(n) looks up the integer labels 0..n-1, which are not
        # the sample-ID column labels read from the file.
        dm_data = pairwise_distances(dm_data.values, metric="precomputed")
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))

    # Mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))
    category_idx = header.index(args.group_by)
    y = [map_data[sid][category_idx] for sid in dm_data_sids]

    # Get colors for all categories
    if not args.color_by:
        categories = set(y)
        bcolors = itertools.cycle(Set1_9.hex_colors)
        # next() works on Python 2 and 3, unlike the .next() method.
        cond_colors = {c: next(bcolors) for c in categories}
    else:
        cond_colors = util.color_mapping(map_data, header, args.group_by,
                                         args.color_by)

    # Prep input data for t-SNE
    X_tsne = TSNE(n_components=3, perplexity=args.perplexity,
                  metric="precomputed", method="exact", verbose=2,
                  random_state=0, angle=0.8)
    X_new = X_tsne.fit_transform(dm_data)
    print("KL divergence after optimization: {}\n".format(
        X_tsne.kl_divergence_))
    # Min-max rescale each embedded dimension.
    x_min, x_max = np.min(X_new, 0), np.max(X_new, 0)
    X_new = (X_new - x_min) / (x_max - x_min)

    # Plot t-SNE result. The axes are created once, before the loop:
    # requesting subplot 111 per sample relied on matplotlib handing back
    # the same axes each time and left `ax` undefined for an empty input.
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111)
    for cond, sid, xy in zip(y, dm_data_sids, X_new):
        ax.scatter(x=xy[0], y=xy[1], s=args.point_size, c=cond_colors[cond],
                   alpha=0.9, edgecolors="k")
        if args.annotate:
            # The label is passed positionally: the keyword for it was
            # renamed between matplotlib releases.
            ax.annotate(sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                        textcoords="offset points", ha="center",
                        va="center", alpha=1, style="italic")
    if args.plot_title is not None:
        ax.set_title(args.plot_title, fontsize=16, weight="bold")

    # Empty scatter artists act as legend handles, one per group.
    legend_handles = [plt.scatter([], [], c=cond_colors[cond], s=150,
                                  edgecolors="k")
                      for cond in cond_colors]
    plt.legend(legend_handles, ["{}".format(cond) for cond in cond_colors],
               loc="best", scatterpoints=3, frameon=True, framealpha=1,
               fontsize=14)
    ax.set_xlabel("t-SNE 1", fontsize=14)
    ax.set_ylabel("t-SNE 2", fontsize=14)
    plt.tight_layout()
    if args.ggplot2_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if args.out_fp:
        plt.savefig(args.out_fp, facecolor=fc, edgecolor="none", dpi=300,
                    pad_inches=0.1, bbox_inches="tight")
    else:
        plt.show()
def main():
    """
    Calculate alpha diversity per sample group, plot it and, unless
    suppressed, print significance test results.

    For every requested metric the diversity values are computed with
    calc_diversity(), optionally written out, and plotted with
    plot_group_diversity(). Mann-Whitney U is printed for exactly two groups
    and Kruskal-Wallis H for more than two.

    :return: the newline-joined list of available metric names when
             args.show_available_metrics is set, otherwise None.
    """
    args = handle_program_options()

    # Supported metrics: everything alpha exports except names containing
    # "_ci" and "faith_pd".
    metrics = [m for m in alpha.__all__ if "_ci" not in m]
    try:
        metrics.remove("faith_pd")
    except ValueError:
        pass
    if args.show_available_metrics:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("\nAvailable alpha diversity metrics:")
        return "\n".join(metrics)

    # check that the output dir exists, create it if not
    msg = putil.ensure_dir(args.output_dir)
    # if an error occurs, print and exit
    if msg:
        sys.exit(msg)

    # parse mapping file
    try:
        header, sample_map = putil.parse_map_file(args.map_file)
    except Exception as ioe:
        err_msg = "\nError while processing the mapping file: {}\n"
        sys.exit(err_msg.format(ioe))

    # parse BIOM table
    try:
        biom_tbl = biom.load_table(args.biom_fp)
    except Exception as ioe:
        err_msg = "\nError loading BIOM table file: {}\n"
        sys.exit(err_msg.format(ioe))

    # group samples by category
    if args.category not in header:
        sys.exit("Category '{}' not found".format(args.category))

    cat_idx = header.index(args.category)
    cat_vals = {entry[cat_idx] for entry in sample_map.values()}

    plot_title = args.plot_title

    colors = putil.color_mapping(sample_map, header, args.category,
                                 args.color_by)

    # Perform diversity calculations and density plotting
    for method, x_label in izip_longest(args.diversity, args.x_label):
        if x_label is None:
            x_label = method.title()
        if method not in alpha.__all__:
            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
        if method not in metrics:
            sys.exit("Currently, PhyloToAST does not support {} metric."
                     .format(method))
        # The name was validated against alpha.__all__ above, so a plain
        # attribute lookup replaces eval() of user-supplied text.
        metric = getattr(alpha, method)
        div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                              cat_vals, cat_idx)

        if args.save_calculations:
            write_diversity_metrics(div_calc, sample_ids,
                                    args.save_calculations)

        plot_group_diversity(div_calc, colors, plot_title, x_label,
                             args.output_dir, args.image_type)

        # calculate and print significance testing results
        if not args.suppress_stats:
            print("Diversity significance testing: {}".format(x_label))
            if len(cat_vals) == 2:
                print_MannWhitneyU(div_calc)
            elif len(cat_vals) > 2:
                print_KruskalWallisH(div_calc)
            print("")  # blank separator line
def main():
    """
    Calculate alpha diversity per sample group, plot it and, unless
    suppressed, print significance test results.

    For every requested metric the diversity values are computed with
    calc_diversity(), optionally written out, and plotted with
    plot_group_diversity(). Mann-Whitney U is printed for exactly two groups
    and Kruskal-Wallis H for more than two.

    :return: the newline-joined list of available metric names when
             args.show_available_metrics is set, otherwise None.
    """
    args = handle_program_options()

    # Supported metrics: everything alpha exports except names containing
    # "_ci" and "faith_pd".
    metrics = [m for m in alpha.__all__ if "_ci" not in m]
    try:
        metrics.remove("faith_pd")
    except ValueError:
        pass
    if args.show_available_metrics:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("\nAvailable alpha diversity metrics:")
        return "\n".join(metrics)

    # check that the output dir exists, create it if not
    msg = putil.ensure_dir(args.output_dir)
    # if an error occurs, print and exit
    if msg:
        sys.exit(msg)

    # parse mapping file
    try:
        header, sample_map = putil.parse_map_file(args.map_file)
    except Exception as ioe:
        err_msg = "\nError while processing the mapping file: {}\n"
        sys.exit(err_msg.format(ioe))

    # parse BIOM table
    try:
        biom_tbl = biom.load_table(args.biom_fp)
    except Exception as ioe:
        err_msg = "\nError loading BIOM table file: {}\n"
        sys.exit(err_msg.format(ioe))

    # group samples by category
    if args.category not in header:
        sys.exit("Category '{}' not found".format(args.category))

    cat_idx = header.index(args.category)
    cat_vals = {entry[cat_idx] for entry in sample_map.values()}

    plot_title = args.plot_title

    colors = putil.color_mapping(sample_map, header, args.category,
                                 args.color_by)

    # Perform diversity calculations and density plotting
    for method, x_label in izip_longest(args.diversity, args.x_label):
        if x_label is None:
            x_label = method.title()
        if method not in alpha.__all__:
            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
        if method not in metrics:
            sys.exit("Currently, PhyloToAST does not support {} metric."
                     .format(method))
        # The name was validated against alpha.__all__ above, so a plain
        # attribute lookup replaces eval() of user-supplied text.
        metric = getattr(alpha, method)
        div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                              cat_vals, cat_idx)

        if args.save_calculations:
            write_diversity_metrics(div_calc, sample_ids,
                                    args.save_calculations)

        plot_group_diversity(div_calc, colors, plot_title, x_label,
                             args.output_dir, args.image_type)

        # calculate and print significance testing results
        if not args.suppress_stats:
            print("Diversity significance testing: {}".format(x_label))
            if len(cat_vals) == 2:
                print_MannWhitneyU(div_calc)
            elif len(cat_vals) > 2:
                print_KruskalWallisH(div_calc)
            print("")  # blank separator line
def main():
    """
    Plot principal coordinates (2-D or 3-D) with samples colored by group.

    Groups come from util.gather_categories() over the comma-separated
    categories in args.group_by. The figure is saved to args.out_fp when
    given, otherwise shown. Exits through sys.exit() with a message when
    the coordinates file or the mapping file cannot be opened.
    """
    args = handle_program_options()

    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # The coordinates file is parsed by util.parse_unifrac() below; the raw
    # line-by-line read that used to happen here was never used.
    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict((condition, {"pc1": [], "pc2": [], "pc3": []})
                             for condition in data_gather)

    if not args.colors:
        bcolors = itertools.cycle(Set3_12.hex_colors)
        # next() works on Python 2 and 3, unlike the .next() method.
        colors = [next(bcolors) for _ in categories]
    else:
        color_map = util.color_mapping(imap, map_header, args.group_by,
                                       args.colors)
        # A list is needed because colors are indexed by position below.
        # NOTE(review): this pairs colors with groups by position only;
        # confirm color_mapping's order matches gather_categories' order.
        colors = list(color_map.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Work on a copy so that args.pc_order is not modified.
    pco = list(args.pc_order)
    if args.dimensions == 3:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        else:
            # The sample belongs to no gathered group. Previously it was
            # filed under the preceding sample's group, or raised NameError
            # when it was the first sample; now it is left out.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"

    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # float() matches the handling of the x and y axis labels below.
        ax.set_zlabel(axis_str.format(3, float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")

            # Annotate PCoA sample points with their sample IDs.
            if args.annotate_points:
                for x, y in zip(categories[cat]["pc1"],
                                categories[cat]["pc2"]):
                    ax.annotate(
                        x[0], xy=(x[1], y[1]),
                        xytext=(-10, -15),
                        textcoords="offset points",
                        ha="center", va="center",
                    )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none",
                    bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
def main():
    """
    Produce one PCoA bubble chart per OTU ID listed in the OTU IDs file.

    Point sizes are the arcsine-square-root transformed relative abundance
    of the OTU in each sample multiplied by args.scale_by. Exits through
    sys.exit() when an input file cannot be opened or the grouping category
    is missing from the mapping file header.
    """
    args = handle_program_options()

    # Each input must be openable; the first failure ends the program.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for input_fp, err_template in required_inputs:
        try:
            with open(input_fp):
                pass
        except IOError as ioe:
            sys.exit(err_template.format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # BIOM table and unifrac principal coordinates
    biomtbl = biom.load_table(args.otu_table)
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # One OTU ID per line, surrounding whitespace removed
    with open(args.otu_ids_fp, "rU") as otu_ids_f:
        otus = {raw_line.strip() for raw_line in otu_ids_f.readlines()}

    # Categories and colors from the mapping file
    header, imap = util.parse_map_file(args.mapping)
    if args.group_by not in header:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_idx = header.index(args.group_by)
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)

    rel_abd = bc.arcsine_sqrt_transform(bc.relative_abundance(biomtbl))

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        taxonomy = biomtbl.metadata(otuid, axis="observation")["taxonomy"]
        otuname = oc.otu_name(taxonomy)

        cat_data = {}
        for cat in category_ids:
            cat_data[cat] = {"pc1": [], "pc2": [], "size": []}

        for sid in unifrac["pcd"]:
            bucket = cat_data[imap[sid][category_idx]]
            try:
                bubble_size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            bucket["pc1"].append(float(unifrac["pcd"][sid][0]))
            bucket["pc2"].append(float(unifrac["pcd"][sid][1]))
            bucket["size"].append(bubble_size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Run and plot LDA on a unifrac distance matrix or on relative abundances
    from a BIOM file, and optionally save one LDA bubble plot per OTU name.

    In the bubble plots each sample sits at its first two LDA coordinates
    and is sized by the OTU's relative abundance times args.scale_by. Exits
    through sys.exit() with a message when the mapping file, distance
    matrix, OTU name list or BIOM file cannot be read.
    """
    args = handle_program_options()

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Parse and read mapping file and obtain group colors
    header, imap = util.parse_map_file(args.map_fp)
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)
    category_idx = header.index(args.group_by)

    if args.input_data_type == "unifrac_dm":
        try:
            with open(args.unifrac_file):
                pass
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        lda_input = pd.read_csv(args.unifrac_file, sep="\t", index_col=0)
    else:
        # Load biom file and calculate relative abundance
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))
        lda_input = pd.DataFrame(rel_abd).T

    # Steps shared by both kinds of input.
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)
    # Plot LDA
    plot_LDA(X_lda, y_lda, class_colors, exp_var, style=args.ggplot2_style,
             out_fp=args.out_fp)

    if not args.bubble:
        return

    # Get otus for LDA bubble plots. The file is read whole and split on any
    # line ending: the former per-line loop overwrote the list on every line
    # (keeping only the last one) and left it undefined for an empty file.
    try:
        with open(args.bubble) as otu_names_f:
            bubble_otus = [name.strip()
                           for name in otu_names_f.read().splitlines()
                           if name.strip()]
    except IOError as ioe:
        err_msg = "\nError in OTU name list file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        rel_abd = get_relative_abundance(args.biom_file)
    except ValueError as ve:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ve))

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuname in bubble_otus:
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuname] * args.scale_by
            except KeyError as ke:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=(12, 9))
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        plot_data[cat]["size"], label=cat,
                        color=class_colors[cat],
                        alpha=0.85, marker="o", edgecolor="k")
        mpl.rc("font", family="Arial")  # define font for figure text
        mpl.rc("xtick", labelsize=12)  # increase X axis ticksize
        mpl.rc("ytick", labelsize=12)  # increase Y axis ticksize
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic")
        plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(
            exp_var[0]*100), fontsize=12)
        plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(
            exp_var[1]*100), fontsize=12)
        lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=12)
        # Change the legend marker size manually
        for i in range(len(class_colors)):
            lgnd.legendHandles[i]._sizes = [75]

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(os.path.join(args.output_dir,
                                 "_".join(otuname.split())) + "." +
                    args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def main():
    """
    Draw one LDA bubble plot per requested OTU ID.

    Steps: read the mapping file and group colors, read the OTU IDs of
    interest, load the BIOM table and compute arcsine-square-root transformed
    relative abundances, run LDA on either the supplied distance matrix or
    the transformed abundances, then save a scatter plot per OTU whose bubble
    sizes are the sample's transformed abundance multiplied by
    args.scale_by. A second legend shows reference bubble sizes rounded to
    the nearest 50.

    :return: None. Exits via sys.exit() with a message on input file errors.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Get otus for LDA bubble plots.
    # Read the file as plain text, one ID per line: newer pandas rejects
    # sep="\n" in read_csv, and pandas would also infer all-numeric IDs as
    # integers, which then no longer match string IDs.
    try:
        with open(args.otu_ids_fp) as otu_fh:
            bubble_otus = {line.strip() for line in otu_fh if line.strip()}
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # Reference sizes for the bubble-size legend: five evenly spaced values
    # across the non-zero abundances, scaled like the bubbles themselves
    abd_val = {abd
               for sample_abd in rel_abd.values()
               for abd in sample_abd.values()
               if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50 (the smallest reference value is dropped)
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
    else:
        lda_input = pd.DataFrame(rel_abd).T
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Axis labels are the same for every OTU. Fall back to a plain label
    # when the explained variance for an axis is unavailable, without
    # swallowing KeyboardInterrupt/SystemExit as a bare except would.
    label_errors = (IndexError, KeyError, TypeError, ValueError)
    try:
        x_label = "LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100)
    except label_errors:
        x_label = "LD1"
    try:
        y_label = "LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100)
    except label_errors:
        y_label = "LD2"

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        display_name = " ".join(otuname.split("_"))
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # No abundance entry for this sample/OTU pair: report it and
                # leave the sample out of this OTU's plot.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat, points in plot_data.items():
            plt.scatter(points["x"], points["y"], s=points["size"], label=cat,
                        color=class_colors[cat], alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(display_name, style="italic", fontsize=13)
        plt.xlabel(x_label, fontsize=13, labelpad=15)
        plt.ylabel(y_label, fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        for i in range(len(class_colors)):
            lgnd1.legendHandles[i]._sizes = [80]  # Change the legend marker size manually
        # Add the legend manually to the current plot; the second
        # plt.legend() call below would otherwise replace it
        plt.gca().add_artist(lgnd1)

        # Empty scatters provide the handles for the bubble-size legend
        size_handles = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
                        for s1 in bubble_range]
        plt.legend(size_handles, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True,
                   labelspacing=2, fontsize=13, loc=4, scatterpoints=1,
                   borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(display_name))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)