def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Draw a PCoA bubble chart for one OTU and write it to disk.

    :type cat_data: dict
    :param cat_data: Maps each category to a dict holding the "pc1", "pc2"
                     and "size" sequences that are passed to scatter.
    :type otu_name: str
    :param otu_name: OTU name; underscores become spaces in the title and
                     whitespace becomes underscores in the output filename.
    :type unifrac: dict
    :param unifrac: Must provide "varexp", whose first two entries are used
                    as the explained variation of PC1 and PC2.
    :type names: list
    :param names: Legend labels, one per category in cat_data iteration order.
    :type colors: dict
    :param colors: Maps each category to its marker color.
    :param xr: (min, max) of the X data; axis limits are 1.5x these values.
    :param yr: (min, max) of the Y data; axis limits are 1.5x these values.
    :type outDir: str
    :param outDir: Directory the figure is written to.
    :type save_as: str
    :param save_as: Output format, also used as the file extension.
    :param plot_style: If truthy, apply the ggplot2 style and a grey face color.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # One scatter series per category; a filled rectangle of the same color
    # serves as that category's legend entry.
    swatches = []
    for group in cat_data:
        series = cat_data[group]
        group_color = colors[group]
        plt.scatter(series["pc1"], series["pc2"], series["size"],
                    color=group_color, alpha=0.85, marker="o",
                    edgecolor="black")
        swatches.append(plt.Rectangle((0, 0), 1, 1, fc=group_color))
    ax.legend(swatches, names, loc="best")

    plt.title(" ".join(otu_name.split("_")), style="italic")

    pc2_var = float(unifrac["varexp"][1])
    plt.ylabel("PC2 - Percent variation explained {:.2f}%".format(pc2_var))
    pc1_var = float(unifrac["varexp"][0])
    plt.xlabel("PC1 - Percent variation explained {:.2f}%".format(pc1_var))

    # Pad the supplied data ranges by 50% on each side.
    x_low, x_high = round(xr[0]*1.5, 1), round(xr[1]*1.5, 1)
    plt.xlim(x_low, x_high)
    y_low, y_high = round(yr[0]*1.5, 1), round(yr[1]*1.5, 1)
    plt.ylim(y_low, y_high)

    if plot_style:
        gu.ggplot2_style(ax)
    fc = "0.8" if plot_style else "none"

    out_name = "_".join(otu_name.split())
    out_path = os.path.join(outDir, out_name) + "." + save_as
    fig.savefig(out_path, facecolor=fc, edgecolor="none", format=save_as,
                bbox_inches="tight", pad_inches=0.2)
    plt.close(fig)
def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Plot PCoA principal coordinates scaled by the relative abundances of
    otu_name.

    :type cat_data: dict
    :param cat_data: Maps each category to a dict holding the "pc1", "pc2"
                     and "size" sequences that are passed to scatter.
    :type otu_name: str
    :param otu_name: OTU name; underscores become spaces in the title and
                     whitespace becomes underscores in the output filename.
    :type unifrac: dict
    :param unifrac: Must provide "varexp", whose first two entries are used
                    as the explained variance of PC1 and PC2.
    :param names: Unused by this implementation; kept so existing callers
                  keep working.
    :type colors: dict
    :param colors: Maps each category to its marker color.
    :param xr: (min, max) of the X data; axis limits are 1.5x these values.
    :param yr: (min, max) of the Y data; axis limits are 1.5x these values.
    :type outDir: str
    :param outDir: Directory the figure is written to.
    :type save_as: str
    :param save_as: Output format, also used as the file extension.
    :param plot_style: If truthy, apply the ggplot2 style and a grey face color.
    """
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111)

    for cat in cat_data:
        plt.scatter(cat_data[cat]["pc1"], cat_data[cat]["pc2"],
                    cat_data[cat]["size"], color=colors[cat], alpha=0.85,
                    marker="o", edgecolor="black", label=cat)

    lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=13)
    # Give every legend marker the same size regardless of bubble size.
    # Iterate over the handles that were actually drawn (one per plotted
    # category) rather than over `colors`, which may hold more entries.
    # `legendHandles` was renamed `legend_handles` in newer matplotlib.
    handles = getattr(lgnd, "legend_handles", None)
    if handles is None:
        handles = lgnd.legendHandles
    for handle in handles:
        handle.set_sizes([80])

    plt.title(" ".join(otu_name.split("_")), style="italic")
    plt.ylabel("PC2 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][1])))
    plt.xlabel("PC1 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][0])))
    # Pad the supplied data ranges by 50% on each side.
    plt.xlim(round(xr[0]*1.5, 1), round(xr[1]*1.5, 1))
    plt.ylim(round(yr[0]*1.5, 1), round(yr[1]*1.5, 1))

    if plot_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    fig.savefig(os.path.join(outDir, "_".join(otu_name.split())) + "." + save_as,
                facecolor=fc, edgecolor="none", format=save_as,
                bbox_inches="tight", pad_inches=0.2)
    plt.close(fig)
def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Plot PCoA principal coordinates scaled by the relative abundances of
    otu_name.

    :type cat_data: dict
    :param cat_data: Maps each category to a dict holding the "pc1", "pc2"
                     and "size" sequences that are passed to scatter.
    :type otu_name: str
    :param otu_name: OTU name; underscores become spaces in the title and
                     whitespace becomes underscores in the output filename.
    :type unifrac: dict
    :param unifrac: Must provide "varexp", whose first two entries are used
                    as the explained variance of PC1 and PC2.
    :param names: Not used here; retained for interface compatibility.
    :type colors: dict
    :param colors: Maps each category to its marker color.
    :param xr: (min, max) of the X data; axis limits are 1.5x these values.
    :param yr: (min, max) of the Y data; axis limits are 1.5x these values.
    :type outDir: str
    :param outDir: Directory the figure is written to.
    :type save_as: str
    :param save_as: Output format, also used as the file extension.
    :param plot_style: If truthy, apply the ggplot2 style and a grey face color.
    """
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111)

    for cat in cat_data:
        plt.scatter(cat_data[cat]["pc1"],
                    cat_data[cat]["pc2"],
                    cat_data[cat]["size"],
                    color=colors[cat],
                    alpha=0.85,
                    marker="o",
                    edgecolor="black",
                    label=cat)

    lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=13)
    # Normalize the legend marker size. Walk the legend's own handles so the
    # loop cannot run past the number of plotted categories (the previous
    # `range(len(colors))` did when `colors` had extra keys), and use the
    # public set_sizes() instead of the private `_sizes` attribute.
    legend_handles = getattr(lgnd, "legend_handles", None)
    if legend_handles is None:
        # Older matplotlib releases only expose the camelCase name.
        legend_handles = lgnd.legendHandles
    for handle in legend_handles:
        handle.set_sizes([80])

    plt.title(" ".join(otu_name.split("_")), style="italic")
    plt.ylabel("PC2 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][1])))
    plt.xlabel("PC1 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][0])))
    # Pad the supplied data ranges by 50% on each side.
    plt.xlim(round(xr[0] * 1.5, 1), round(xr[1] * 1.5, 1))
    plt.ylim(round(yr[0] * 1.5, 1), round(yr[1] * 1.5, 1))

    if plot_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    fig.savefig(os.path.join(outDir, "_".join(otu_name.split())) + "." +
                save_as,
                facecolor=fc,
                edgecolor="none",
                format=save_as,
                bbox_inches="tight",
                pad_inches=0.2)
    plt.close(fig)
def plot_LDA(X_lda, y_lda, class_colors, exp_var, style, out_fp=""):
    """
    Plot transformed LDA data.

    :type X_lda: numpy array
    :param X_lda: LDA-transformed samples, one row per sample. Column 0 is
                  plotted on X and column 1 (when present) on Y.
    :type y_lda: numpy array
    :param y_lda: Category of every row of X_lda.
    :type class_colors: dict
    :param class_colors: Maps each category to its marker color; its
                         iteration order fixes the plotting order.
    :param exp_var: Explained variance ratio per discriminant (fractions,
                    multiplied by 100 for the axis labels).
    :param style: If truthy, apply the ggplot2 style and a grey face color.
    :type out_fp: str
    :param out_fp: Output file path. If empty, the figure is shown instead.
    """
    fig = plt.figure(figsize=(15, 10))
    ax = fig.add_subplot(111)
    single_axis = X_lda.shape[1] == 1

    for i, target_name in enumerate(class_colors):
        in_group = y_lda == target_name
        cat_x = X_lda[:, 0][in_group]
        if single_axis:
            # Only one discriminant: place each category on its own
            # horizontal line (y = 1, 2, ...) so the groups stay separable.
            cat_y = np.ones((cat_x.shape[0], 1)) + i
        else:
            cat_y = X_lda[:, 1][in_group]
        plt.scatter(x=cat_x, y=cat_y, label=target_name,
                    color=class_colors[target_name], alpha=0.85, s=250,
                    edgecolors="k")

    mpl.rc("font", family="Arial")  # define font for figure text
    mpl.rc('xtick', labelsize=12)  # increase X axis ticksize
    mpl.rc('ytick', labelsize=12)  # increase Y axis ticksize
    if single_axis:
        plt.ylim((0.5, 2.5))

    plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
               fontsize=16)
    # With a single discriminant there is no second explained-variance
    # entry; indexing it unconditionally raised IndexError.
    if len(exp_var) > 1:
        plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                   fontsize=16)
    else:
        plt.ylabel("LD2", fontsize=16)

    leg = plt.legend(loc="best", frameon=True, framealpha=1, fontsize=16)
    leg.get_frame().set_edgecolor('k')

    if style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        plt.savefig(out_fp, facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.1)
    else:
        plt.show()
def plot_LDA(X_lda, y_lda, class_colors, exp_var, style, fig_size, label_pad,
             font_size, sids, dim=2, zangles=None, pt_size=250, out_fp=""):
    """
    Plot transformed LDA data.

    :type X_lda: numpy array
    :param X_lda: LDA-transformed samples, one row per sample and one column
                  per linear discriminant.
    :type y_lda: numpy array
    :param y_lda: Category of every row of X_lda.
    :type class_colors: dict
    :param class_colors: Maps each category to its marker color; its
                         iteration order fixes the plotting order.
    :param exp_var: Explained variance ratio per discriminant (fractions,
                    multiplied by 100 for the axis labels). Axes without an
                    entry are labelled with the bare discriminant name.
    :param style: If truthy (and dim is 2), apply the ggplot2 style and a
                  grey face color.
    :param fig_size: Figure size passed to plt.figure().
    :param label_pad: Padding between each axis and its label.
    :param font_size: Font size of the axis labels.
    :param sids: Sample IDs used to annotate points in 2D figures, or None
                 for no annotations.
    :type dim: int
    :param dim: 3 for a 3D figure, anything else for a 2D figure.
    :param zangles: (azimuth, elevation) for the 3D view; None keeps the
                    default matplotlib view.
    :param pt_size: Marker size.
    :type out_fp: str
    :param out_fp: Output file path. If empty, the figure is shown instead.
    """
    # A real list is needed below: dict views have no .index() on Python 3.
    cats = list(class_colors)
    fig = plt.figure(figsize=fig_size)

    def _axis_label(component):
        """Return the label for the 1-based discriminant `component`."""
        try:
            return "LD{} (Percent Explained Variance: {:.3f}%)".format(
                component, exp_var[component - 1] * 100)
        except (IndexError, KeyError, TypeError, ValueError):
            # No usable explained variance for this axis.
            return "LD{}".format(component)

    if dim == 3:
        # Explicit check instead of `assert`, which is stripped under -O.
        if X_lda.shape[1] < 3:
            sys.exit("\nLinear Discriminant Analysis requires at least 4 groups of "
                     "samples to create a 3D figure. Please update group information or "
                     "use the default 2D view of the results.\n")
        if sids is not None:
            print("\nPoint annotations are available only for 2D figures.\n")
        ax = fig.add_subplot(111, projection="3d")
        if zangles is not None:
            ax.view_init(elev=zangles[1], azim=zangles[0])
        ax.set_zlabel(_axis_label(3), fontsize=font_size, labelpad=label_pad)
        for target_name in cats:
            in_group = y_lda == target_name
            ax.scatter(xs=X_lda[:, 0][in_group], ys=X_lda[:, 1][in_group],
                       zs=X_lda[:, 2][in_group], label=target_name,
                       c=class_colors[target_name], alpha=0.85, s=pt_size,
                       edgecolors="k", zdir="z")
    else:
        ax = fig.add_subplot(111)
        single_axis = X_lda.shape[1] == 1
        for i, target_name in enumerate(cats):
            in_group = y_lda == target_name
            cat_x = X_lda[:, 0][in_group]
            if single_axis:
                # Only one discriminant: spread categories over y = 1, 2, ...
                cat_y = np.ones((cat_x.shape[0], 1)) + i
            else:
                cat_y = X_lda[:, 1][in_group]
            ax.scatter(x=cat_x, y=cat_y, label=target_name, alpha=0.85,
                       s=pt_size, color=class_colors[target_name],
                       edgecolors="k")

        # Annotate data points with sample IDs
        if sids is not None:
            for sample, point, group in zip(sids, X_lda, y_lda):
                if len(point) < 2:
                    # Match the synthetic y position used for 1-D data above.
                    point = (point[0], cats.index(group) + 1)
                # Text is passed positionally: its keyword was renamed from
                # `s` to `text` across matplotlib releases.
                ax.annotate(sample, xy=point[:2], xytext=(0, -15),
                            ha="center", va="center",
                            textcoords="offset points")
        if single_axis:
            plt.ylim((0.5, 2.5))

    ax.set_xlabel(_axis_label(1), fontsize=font_size, labelpad=label_pad)
    ax.set_ylabel(_axis_label(2), fontsize=font_size, labelpad=label_pad)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1,
                     fontsize=15)
    leg.get_frame().set_edgecolor('k')

    if dim == 2 and style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        plt.savefig(out_fp, facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.1)
    else:
        plt.show()
def main():
    """
    Plot a 2D or 3D PCoA figure from a principal coordinates file.

    Reads the command line options, groups the samples of the coordinates
    file by the mapping-file categories named in --group_by, scatters one
    series per category and then saves the figure to args.out_fp (or shows
    it when no output path was given). Exits with an error message if the
    coordinates or mapping file cannot be opened.
    """
    args = handle_program_options()

    # Fail early, with a readable message, if either input is unreadable.
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    map_header, imap = util.parse_map_file(args.map_fp)
    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict([(condition, {"pc1": [], "pc2": [], "pc3": []})
                              for condition in data_gather.keys()])

    if not args.colors:
        bcolors = itertools.cycle(Set3_12.hex_colors)
        # next() builtin instead of the Python 2-only .next() method.
        colors = [next(bcolors) for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header, args.group_by,
                                    args.colors)
        # A list is required: dict views cannot be indexed on Python 3.
        # NOTE(review): colors are matched to categories by position, which
        # assumes color_mapping() yields them in the same order - confirm.
        colors = list(colors.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Work on a copy so the parsed command line options are not mutated.
    pco = list(args.pc_order)
    if args.dimensions == 3:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        cat = None
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        if cat is None:
            # Sample belongs to no category. Previously it was silently
            # filed under the preceding sample's category (or raised
            # NameError when it came first).
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))
        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"

    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # float() keeps this consistent with the X/Y labels below; the
        # format spec cannot be applied to an unconverted value.
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        xs = [e[1] for e in categories[cat]["pc1"]]
        ys = [e[1] for e in categories[cat]["pc2"]]
        if args.dimensions == 3:
            zs = [e[1] for e in categories[cat]["pc3"]]
            ax.scatter(xs=xs, ys=ys, zs=zs, zdir="z", c=colors[i],
                       s=args.point_size, label=cat, edgecolors="k")
        else:
            ax.scatter(xs, ys, c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")

        # Script to annotate PCoA sample points.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0], xy=(x[1], y[1]), xytext=(-10, -15),
                    textcoords="offset points", ha="center", va="center",
                )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    # NOTE(review): this runs after the labels and legend were created, so it
    # likely affects only text created afterwards (e.g. the title) - confirm
    # whether it should be moved before the figure is built.
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none",
                    bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
def main():
    """
    Run t-SNE on a precomputed distance matrix and plot the first two axes.

    Reads the distance matrix and the mapping file named on the command
    line, colors every sample by its --group_by category, embeds the samples
    with t-SNE and saves the scatter plot to args.out_fp (or shows it when no
    output path was given). Exits with an error message if either input file
    cannot be read.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))
    dm_data_sids = dm_data.index
    # Take every remaining column positionally. Selecting them with
    # `dm_data[range(n)]` looks columns up by integer label, but the columns
    # are labelled with sample IDs.
    dm_data = pairwise_distances(dm_data.values, metric="precomputed")

    # Mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))
    group_idx = header.index(args.group_by)
    y = [map_data[sid][group_idx] for sid in dm_data_sids]

    # Get colors for all categories
    if not args.color_by:
        # Sorted so a category always receives the same color; plain set
        # iteration order can differ from run to run.
        categories = sorted(set(y))
        bcolors = itertools.cycle(Set1_9.hex_colors)
        cond_colors = {c: next(bcolors) for c in categories}
    else:
        cond_colors = util.color_mapping(map_data, header, args.group_by,
                                         args.color_by)

    # Prep input data for t-SNE
    X_tsne = TSNE(n_components=3, perplexity=args.perplexity,
                  metric="precomputed", method="exact", verbose=2,
                  random_state=0, angle=0.8)
    X_new = X_tsne.fit_transform(dm_data)
    print("KL divergence after optimization: {}\n".format(X_tsne.kl_divergence_))

    # Rescale every embedding axis to the [0, 1] interval.
    x_min, x_max = np.min(X_new, 0), np.max(X_new, 0)
    X_new = (X_new - x_min) / (x_max - x_min)

    # Plot t-SNE result
    fig = plt.figure(figsize=(14, 8))
    # Create the axes once instead of once per sample.
    ax = fig.add_subplot(111)
    for cond, sid, xy in zip(y, dm_data_sids, X_new):
        ax.scatter(x=xy[0], y=xy[1], s=args.point_size, c=cond_colors[cond],
                   alpha=0.9, edgecolors="k")
        if args.annotate:
            # Text is passed positionally: its keyword was renamed from `s`
            # to `text` across matplotlib releases.
            ax.annotate(sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                        textcoords="offset points", ha="center", va="center",
                        alpha=1, style="italic")
    if args.plot_title is not None:
        ax.set_title(args.plot_title, fontsize=16, weight="bold")

    # Empty scatter artists act as legend handles, one per category.
    handles = [plt.scatter([], [], c=cond_colors[cond], s=150, edgecolors="k")
               for cond in cond_colors]
    plt.legend(handles, ["{}".format(cond) for cond in cond_colors],
               loc="best", scatterpoints=3, frameon=True, framealpha=1,
               fontsize=14)
    ax.set_xlabel("t-SNE 1", fontsize=14)
    ax.set_ylabel("t-SNE 2", fontsize=14)
    plt.tight_layout()

    if args.ggplot2_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if args.out_fp:
        plt.savefig(args.out_fp, facecolor=fc, edgecolor="none", dpi=300,
                    pad_inches=0.1, bbox_inches="tight")
    else:
        plt.show()
def plot_doc(do_values_df, order, ci=False, title=None, save=None):
    """
    Plot DOC diagram with confidence intervals.

    :type do_values_df: Pandas Dataframe
    :param do_values_df: Dataframe containing dissimilarity and overlap values
                         with lowess predictions. The columns "Overlap",
                         "Dissimilarity" and "LOWESS" are read, plus
                         "LOWESS_min" and "LOWESS_max" when ci is set.
    :type order: int
    :param order: Order of the polynomial to fit when calculating the
                  residuals. Currently unused: the polynomial fit overlay is
                  disabled.
    :type ci: boolean
    :param ci: Set to plot the confidence intervals. Default is no plotting CI.
    :type title: str
    :param title: Text for plot title. No title is drawn when None.
    :type save: str
    :param save: File path to save the plot to. If None, the plot is shown
                 instead.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)

    ax.scatter(x=do_values_df["Overlap"], y=do_values_df["Dissimilarity"],
               s=25, c="#111111", label="Dissimilarity-Overlap")
    ax.plot(do_values_df["Overlap"], do_values_df["LOWESS"], "r-", lw=3,
            label="LOWESS Smoothing", alpha=0.75)

    # Plot LOWESS fit R^2
    Rsqr_ld = r2_score(y_true=do_values_df["Dissimilarity"],
                       y_pred=do_values_df["LOWESS"],
                       multioutput="uniform_average")
    ld_r2_text = r"LOWESS Fit $R^2$: {:.3f}".format(Rsqr_ld)
    ax.text(x=0.1, y=0.97, s=ld_r2_text, ha="center", va="center",
            fontsize=14, transform=ax.transAxes)

    # Plain truth test instead of `assert ci == False`: assert statements are
    # removed under `python -O`, which silently disabled the CI band.
    if ci:
        lowess = do_values_df["LOWESS"]
        # Band edges are the absolute distances to the min/max curves,
        # applied around the LOWESS line.
        ymin = lowess - (lowess - do_values_df["LOWESS_min"]).abs()
        ymax = lowess + (do_values_df["LOWESS_max"] - lowess).abs()
        ax.fill_between(do_values_df["Overlap"], ymin, ymax,
                        color="#ff0000", alpha=0.25)

    # NOTE: a polynomial fit overlay of degree `order` (np.polyfit/np.poly1d
    # curve plus its R^2 text at y=0.92) used to be drawn here and is
    # currently disabled.

    ax.legend(fontsize=14, scatterpoints=4)
    ax.set_xlabel("Overlap", fontsize=14)
    ax.set_ylabel("Dissimilarity", fontsize=14)
    ax.set_xlim([0, 1.0])
    ax.set_ylim([0, 1.0])
    if title is not None:
        ax.set_title(title, weight="bold", fontsize=15)
    ggplot2_style(ax)
    fig.tight_layout()

    if save is not None:
        fig.savefig(save, facecolor="white", edgecolor="none",
                    bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
def main():
    """
    Run LDA on a distance matrix or BIOM table, plot it, and optionally
    write one bubble plot per requested OTU.

    The LDA input is either the unifrac distance matrix
    (args.input_data_type == "unifrac_dm") or the relative abundances
    computed from args.biom_file. When --bubble names a file of OTU names,
    a bubble plot sized by each OTU's relative abundance is saved to
    args.output_dir for every listed OTU. Exits with an error message if an
    input file cannot be read.
    """
    args = handle_program_options()

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Parse and read mapping file and obtain group colors
    header, imap = util.parse_map_file(args.map_fp)
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)
    category_idx = header.index(args.group_by)

    rel_abd = None
    if args.input_data_type == "unifrac_dm":
        try:
            with open(args.unifrac_file):
                pass
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        lda_input = pd.read_csv(args.unifrac_file, sep="\t", index_col=0)
    else:
        # Load biom file and calculate relative abundance
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))
        lda_input = pd.DataFrame(rel_abd).T

    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)
    # Plot LDA
    plot_LDA(X_lda, y_lda, class_colors, exp_var, style=args.ggplot2_style,
             out_fp=args.out_fp)

    if not args.bubble:
        return

    # Get otus for LDA bubble plots. Read the whole file and split on any
    # line ending; the previous per-line loop kept only the names found on
    # the last line of the file.
    try:
        with open(args.bubble) as otu_fh:
            bubble_otus = [name.strip()
                           for name in otu_fh.read().splitlines()
                           if name.strip()]
    except IOError as ioe:
        err_msg = "\nError in OTU name list file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance (unless the LDA input
    # above already required it).
    if rel_abd is None:
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))

    cat_order = list(class_colors)
    single_axis = X_lda.shape[1] == 1

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuname in bubble_otus:
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in cat_order}
        for sid, data in zip(sampleids, X_lda):
            cat_name = imap[sid][category_idx]
            category = plot_data[cat_name]
            try:
                size = rel_abd[sid][otuname] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            if single_axis:
                # Only one discriminant: put each category on its own
                # horizontal line (y = 1, 2, ...) instead of indexing a
                # second column that does not exist.
                category["y"].append(float(cat_order.index(cat_name) + 1))
            else:
                category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=(12, 9))
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        plot_data[cat]["size"], label=cat,
                        color=class_colors[cat], alpha=0.85, marker="o",
                        edgecolor="k")
        mpl.rc("font", family="Arial")  # define font for figure text
        mpl.rc("xtick", labelsize=12)  # increase X axis ticksize
        mpl.rc("ytick", labelsize=12)  # increase Y axis ticksize
        if single_axis:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic")
        plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                   fontsize=12)
        if len(exp_var) > 1:
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=12)
        else:
            # No explained variance is available for a second discriminant.
            plt.ylabel("LD2", fontsize=12)

        lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=12)
        # Change the legend marker size manually. Iterate over the legend's
        # own handles; `legendHandles` was renamed `legend_handles` in newer
        # matplotlib.
        handles = getattr(lgnd, "legend_handles", None)
        if handles is None:
            handles = lgnd.legendHandles
        for handle in handles:
            handle.set_sizes([75])

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(os.path.join(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def main():
    """
    Create one LDA bubble plot per requested OTU.

    Samples are positioned by their first two linear discriminants, computed
    either from the distance matrix (when args.dist_matrix_file is given) or
    from the arcsine-sqrt transformed relative abundances of the BIOM table.
    Bubble sizes are the OTU's transformed relative abundance multiplied by
    args.scale_by. One figure per OTU ID listed in args.otu_ids_fp is written
    to args.output_dir. Exits with an error message if an input file cannot
    be read.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Get otus for LDA bubble plots. Read the IDs as plain text, one per
    # line: pandas rejects "\n" as a column separator in current releases and
    # would also turn all-numeric IDs into integers.
    try:
        with open(args.otu_ids_fp) as otu_fh:
            bubble_otus = {line.strip() for line in otu_fh if line.strip()}
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    abd_val = {abd for sid, v1 in rel_abd.items()
               for otuid, abd in v1.items() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
    else:
        lda_input = pd.DataFrame(rel_abd).T

    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")
    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    cat_order = list(class_colors)
    single_axis = X_lda.shape[1] == 1

    def _axis_label(component):
        """Return the label for the 1-based discriminant `component`."""
        try:
            return "LD{} (Percent Explained Variance: {:.3f}%)".format(
                component, exp_var[component - 1] * 100)
        except (IndexError, KeyError, TypeError, ValueError):
            # No usable explained variance for this axis.
            return "LD{}".format(component)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in cat_order}
        for sid, data in zip(sampleids, X_lda):
            cat_name = imap[sid][category_idx]
            category = plot_data[cat_name]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            if single_axis:
                # Only one discriminant: put each category on its own
                # horizontal line (y = 1, 2, ...) instead of indexing a
                # second column that does not exist.
                category["y"].append(float(cat_order.index(cat_name) + 1))
            else:
                category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat,
                        color=class_colors[cat], alpha=0.85, edgecolors="k")
        if single_axis:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)
        plt.xlabel(_axis_label(1), fontsize=13, labelpad=15)
        plt.ylabel(_axis_label(2), fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        # Change the legend marker size manually. Iterate over the legend's
        # own handles; `legendHandles` was renamed `legend_handles` in newer
        # matplotlib.
        handles = getattr(lgnd1, "legend_handles", None)
        if handles is None:
            handles = lgnd1.legendHandles
        for handle in handles:
            handle.set_sizes([80])
        # Add the legend manually to the current plot, because the second
        # plt.legend() call below would otherwise replace it.
        plt.gca().add_artist(lgnd1)

        # Empty scatter artists act as the handles of the bubble size legend.
        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
             for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True,
                   labelspacing=2, fontsize=13, loc=4, scatterpoints=1,
                   borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def plot_LDA(X_lda, y_lda, class_colors, exp_var, style, fig_size, label_pad,
             font_size, sids, dim=2, zangles=None, pt_size=250, out_fp=""):
    """
    Plot transformed LDA data.

    :type X_lda: numpy array
    :param X_lda: LDA-transformed samples, one row per sample and one column
                  per linear discriminant.
    :type y_lda: numpy array
    :param y_lda: Category of every row of X_lda.
    :type class_colors: dict
    :param class_colors: Maps each category to its marker color; its
                         iteration order fixes the plotting order.
    :param exp_var: Explained variance ratio per discriminant (fractions,
                    multiplied by 100 for the axis labels). Axes without an
                    entry are labelled with the bare discriminant name.
    :param style: If truthy (and dim is 2), apply the ggplot2 style and a
                  grey face color.
    :param fig_size: Figure size passed to plt.figure().
    :param label_pad: Padding between each axis and its label.
    :param font_size: Font size of the axis labels.
    :param sids: Sample IDs used to annotate points in 2D figures, or None
                 for no annotations.
    :type dim: int
    :param dim: 3 for a 3D figure, anything else for a 2D figure.
    :param zangles: (azimuth, elevation) for the 3D view; None keeps the
                    default matplotlib view.
    :param pt_size: Marker size.
    :type out_fp: str
    :param out_fp: Output file path. If empty, the figure is shown instead.
    """
    # dict.keys() is a view on Python 3 and has no .index(); use a list.
    cats = list(class_colors)
    fig = plt.figure(figsize=fig_size)

    def _variance_label(component):
        """Return the label for the 1-based discriminant `component`."""
        try:
            return "LD{} (Percent Explained Variance: {:.3f}%)".format(
                component, exp_var[component - 1] * 100)
        except (IndexError, KeyError, TypeError, ValueError):
            # Narrow replacement for the former bare `except:`; used when
            # no explained variance exists for this axis.
            return "LD{}".format(component)

    if dim == 3:
        # Validate explicitly: `assert` disappears under `python -O`.
        if X_lda.shape[1] < 3:
            sys.exit(
                "\nLinear Discriminant Analysis requires at least 4 groups of "
                "samples to create a 3D figure. Please update group information or "
                "use the default 2D view of the results.\n")
        if sids is not None:
            print("\nPoint annotations are available only for 2D figures.\n")
        ax = fig.add_subplot(111, projection="3d")
        if zangles is not None:
            ax.view_init(elev=zangles[1], azim=zangles[0])
        ax.set_zlabel(_variance_label(3), fontsize=font_size,
                      labelpad=label_pad)
        for target_name in cats:
            members = y_lda == target_name
            ax.scatter(xs=X_lda[:, 0][members],
                       ys=X_lda[:, 1][members],
                       zs=X_lda[:, 2][members],
                       label=target_name,
                       c=class_colors[target_name],
                       alpha=0.85,
                       s=pt_size,
                       edgecolors="k",
                       zdir="z")
    else:
        ax = fig.add_subplot(111)
        one_discriminant = X_lda.shape[1] == 1
        for i, target_name in enumerate(cats):
            members = y_lda == target_name
            cat_x = X_lda[:, 0][members]
            if one_discriminant:
                # No second axis: spread categories over y = 1, 2, ...
                cat_y = np.ones((cat_x.shape[0], 1)) + i
            else:
                cat_y = X_lda[:, 1][members]
            ax.scatter(x=cat_x,
                       y=cat_y,
                       label=target_name,
                       alpha=0.85,
                       s=pt_size,
                       color=class_colors[target_name],
                       edgecolors="k")

        # Annotate data points with sample IDs
        if sids is not None:
            for sample, point, group in zip(sids, X_lda, y_lda):
                if len(point) < 2:
                    # Same synthetic y position as used for the markers.
                    point = (point[0], cats.index(group) + 1)
                # The text goes first positionally because its keyword name
                # changed (`s` -> `text`) between matplotlib releases.
                ax.annotate(sample,
                            xy=point[:2],
                            xytext=(0, -15),
                            ha="center",
                            va="center",
                            textcoords="offset points")
        if one_discriminant:
            plt.ylim((0.5, 2.5))

    ax.set_xlabel(_variance_label(1), fontsize=font_size, labelpad=label_pad)
    ax.set_ylabel(_variance_label(2), fontsize=font_size, labelpad=label_pad)

    leg = plt.legend(loc="best",
                     scatterpoints=3,
                     frameon=True,
                     framealpha=1,
                     fontsize=15)
    leg.get_frame().set_edgecolor('k')

    if dim == 2 and style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        plt.savefig(out_fp,
                    facecolor=fc,
                    edgecolor="none",
                    dpi=300,
                    bbox_inches="tight",
                    pad_inches=0.1)
    else:
        plt.show()
def main():
    """
    Plot a 2D or 3D PCoA figure from a principal coordinates file.

    Reads the command line options, groups the samples of the coordinates
    file by the mapping-file categories named in --group_by, scatters one
    series per category and then saves the figure to args.out_fp (or shows
    it when no output path was given). Exits with an error message if the
    coordinates or mapping file cannot be opened.
    """
    args = handle_program_options()

    # Verify both inputs are readable before doing any work.
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    map_header, imap = util.parse_map_file(args.map_fp)
    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict([(condition, {
        "pc1": [],
        "pc2": [],
        "pc3": []
    }) for condition in data_gather.keys()])

    if not args.colors:
        bcolors = itertools.cycle(Set3_12.hex_colors)
        # Builtin next() works on Python 2 and 3; .next() is Python 2 only.
        colors = [next(bcolors) for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header, args.group_by,
                                    args.colors)
        # Indexed by position below, so a list is needed (dict views are
        # not indexable on Python 3).
        # NOTE(review): assumes color_mapping() returns colors in the same
        # order as `categories` - confirm.
        colors = list(colors.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Copy before appending so args.pc_order itself is left untouched.
    pco = list(args.pc_order)
    if args.dimensions == 3:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        cat = None
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        if cat is None:
            # Skip samples that match no category; they used to inherit the
            # previous sample's category or raise NameError.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))
        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"

    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # Cast like the X/Y labels below; the float format spec cannot be
        # applied to an unconverted value.
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        pc1_vals = [e[1] for e in categories[cat]["pc1"]]
        pc2_vals = [e[1] for e in categories[cat]["pc2"]]
        if args.dimensions == 3:
            pc3_vals = [e[1] for e in categories[cat]["pc3"]]
            ax.scatter(xs=pc1_vals,
                       ys=pc2_vals,
                       zs=pc3_vals,
                       zdir="z",
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")
        else:
            ax.scatter(pc1_vals,
                       pc2_vals,
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")

        # Script to annotate PCoA sample points.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0],
                    xy=(x[1], y[1]),
                    xytext=(-10, -15),
                    textcoords="offset points",
                    ha="center",
                    va="center",
                )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    # NOTE(review): applied after the labels and legend exist, so it likely
    # only influences text created later (e.g. the title) - confirm whether
    # it belongs before the figure is built.
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp,
                    facecolor="white",
                    edgecolor="none",
                    bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()