Exemple #1
0
def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Draw a PCoA bubble chart for one OTU and write it to disk.

    Every key of `cat_data` is plotted as one scatter series using that
    entry's "pc1", "pc2" and "size" values, coloured with `colors[key]`.
    The legend pairs one coloured rectangle per series with the labels
    given in `names`. Axis labels report `unifrac["varexp"][0]` and
    `unifrac["varexp"][1]`; axis limits are `xr` and `yr` stretched by 1.5
    and rounded to one decimal. The figure is saved in `outDir` as
    "<otu_name with whitespace replaced by underscores>.<save_as>" and
    then closed.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # One scatter series plus one legend swatch per category.
    swatches = []
    for group in cat_data:
        coords = cat_data[group]
        group_color = colors[group]
        plt.scatter(coords["pc1"], coords["pc2"], coords["size"],
                    color=group_color, alpha=0.85, marker="o",
                    edgecolor="black")
        swatches.append(plt.Rectangle((0, 0), 1, 1, fc=group_color))
    ax.legend(swatches, names, loc="best")

    # Title and axis annotations.
    plt.title(otu_name.replace("_", " "), style="italic")
    pc1_var = float(unifrac["varexp"][0])
    pc2_var = float(unifrac["varexp"][1])
    plt.ylabel("PC2 - Percent variation explained {:.2f}%".format(pc2_var))
    plt.xlabel("PC1 - Percent variation explained {:.2f}%".format(pc1_var))
    plt.xlim(round(xr[0]*1.5, 1), round(xr[1]*1.5, 1))
    plt.ylim(round(yr[0]*1.5, 1), round(yr[1]*1.5, 1))

    # The ggplot2 look uses a light grey figure background.
    face = "none"
    if plot_style:
        gu.ggplot2_style(ax)
        face = "0.8"

    target = os.path.join(outDir, "_".join(otu_name.split())) + "." + save_as
    fig.savefig(target, facecolor=face, edgecolor="none", format=save_as,
                bbox_inches="tight", pad_inches=0.2)
    plt.close(fig)
def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Plot PCoA principal coordinates scaled by the relative abundances of
    otu_name.

    :param cat_data: Mapping of category -> dict with "pc1", "pc2" and "size"
                     sequences; one scatter series is drawn per category.
    :param otu_name: OTU name; underscores become spaces in the title and
                     whitespace becomes underscores in the output file name.
    :param unifrac: Mapping whose "varexp" entry holds the explained variance
                    of PC1 (index 0) and PC2 (index 1).
    :param names: Unused here; kept so existing callers keep working.
    :param colors: Mapping of category -> colour.
    :param xr: (min, max) of the x data; stretched by 1.5 for the axis limits.
    :param yr: (min, max) of the y data; stretched by 1.5 for the axis limits.
    :param outDir: Directory the figure is written to.
    :param save_as: Output format, also used as the file extension.
    :param plot_style: If truthy, apply the ggplot2 style to the axes.
    """
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111)

    for cat in cat_data:
        plt.scatter(cat_data[cat]["pc1"], cat_data[cat]["pc2"], cat_data[cat]["size"],
                    color=colors[cat], alpha=0.85, marker="o", edgecolor="black",
                    label=cat)
    lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=13)
    # Give every legend marker the same size instead of the bubble sizes.
    # Iterate over the handles the legend actually holds: indexing by
    # len(colors) raises IndexError when `colors` has more entries than the
    # plotted categories. Newer matplotlib exposes the list as
    # `legend_handles`; older releases only have `legendHandles`.
    handles = getattr(lgnd, "legend_handles", None)
    if handles is None:
        handles = lgnd.legendHandles
    for handle in handles:
        handle.set_sizes([80])
    plt.title(" ".join(otu_name.split("_")), style="italic")
    plt.ylabel("PC2 (Percent Explained Variance {:.3f}%)".format(float(unifrac["varexp"][1])))
    plt.xlabel("PC1 (Percent Explained Variance {:.3f}%)".format(float(unifrac["varexp"][0])))
    plt.xlim(round(xr[0]*1.5, 1), round(xr[1]*1.5, 1))
    plt.ylim(round(yr[0]*1.5, 1), round(yr[1]*1.5, 1))
    if plot_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"
    fig.savefig(os.path.join(outDir, "_".join(otu_name.split())) + "." + save_as,
                facecolor=fc, edgecolor="none", format=save_as,
                bbox_inches="tight", pad_inches=0.2)
    plt.close(fig)
def plot_PCoA(cat_data, otu_name, unifrac, names, colors, xr, yr, outDir,
              save_as, plot_style):
    """
    Plot PCoA principal coordinates scaled by the relative abundances of
    otu_name.

    :param cat_data: Mapping of category -> dict with "pc1", "pc2" and "size"
                     sequences; one scatter series is drawn per category.
    :param otu_name: OTU name; underscores become spaces in the title and
                     whitespace becomes underscores in the output file name.
    :param unifrac: Mapping whose "varexp" entry holds the explained variance
                    of PC1 (index 0) and PC2 (index 1).
    :param names: Unused here; kept so existing callers keep working.
    :param colors: Mapping of category -> colour.
    :param xr: (min, max) of the x data; stretched by 1.5 for the axis limits.
    :param yr: (min, max) of the y data; stretched by 1.5 for the axis limits.
    :param outDir: Directory the figure is written to.
    :param save_as: Output format, also used as the file extension.
    :param plot_style: If truthy, apply the ggplot2 style to the axes.
    """
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111)

    for cat in cat_data:
        plt.scatter(cat_data[cat]["pc1"],
                    cat_data[cat]["pc2"],
                    cat_data[cat]["size"],
                    color=colors[cat],
                    alpha=0.85,
                    marker="o",
                    edgecolor="black",
                    label=cat)
    lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=13)
    # Normalise the legend marker size. Walk the legend's own handle list
    # rather than range(len(colors)), which overruns the list whenever
    # `colors` contains categories that were not plotted. `legend_handles`
    # is the current attribute name; `legendHandles` is the legacy one.
    handles = getattr(lgnd, "legend_handles", None)
    if handles is None:
        handles = lgnd.legendHandles
    for handle in handles:
        handle.set_sizes([80])
    plt.title(" ".join(otu_name.split("_")), style="italic")
    plt.ylabel("PC2 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][1])))
    plt.xlabel("PC1 (Percent Explained Variance {:.3f}%)".format(
        float(unifrac["varexp"][0])))
    plt.xlim(round(xr[0] * 1.5, 1), round(xr[1] * 1.5, 1))
    plt.ylim(round(yr[0] * 1.5, 1), round(yr[1] * 1.5, 1))
    if plot_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"
    fig.savefig(os.path.join(outDir, "_".join(otu_name.split())) + "." +
                save_as,
                facecolor=fc,
                edgecolor="none",
                format=save_as,
                bbox_inches="tight",
                pad_inches=0.2)
    plt.close(fig)
Exemple #4
0
def plot_LDA(X_lda, y_lda, class_colors, exp_var, style, out_fp=""):
    """
    Plot transformed LDA data.

    :param X_lda: 2-D array of LDA-transformed samples (rows) by discriminant
                  (columns). With a single column, each class is drawn on its
                  own horizontal line instead of along LD2.
    :param y_lda: Array of class labels aligned with the rows of X_lda.
    :param class_colors: Mapping of class label -> colour; its key order sets
                         the plotting order.
    :param exp_var: Explained variance ratio per discriminant (fractions,
                    multiplied by 100 for the axis labels).
    :param style: If truthy, apply the ggplot2 style to the axes.
    :param out_fp: Output file path; when empty the figure is shown instead.
    """
    single_axis = X_lda.shape[1] == 1
    fig = plt.figure(figsize=(15, 10))
    ax = fig.add_subplot(111)
    for i, target_name in enumerate(class_colors):
        in_class = y_lda == target_name
        cat_x = X_lda[:, 0][in_class]
        if single_axis:
            # No second discriminant: stack the classes at y = 1, 2, ...
            cat_y = np.ones((cat_x.shape[0], 1)) + i
        else:
            cat_y = X_lda[:, 1][in_class]
        plt.scatter(x=cat_x, y=cat_y, label=target_name,
                    color=class_colors[target_name],
                    alpha=0.85, s=250, edgecolors="k")
    mpl.rc("font", family="Arial")  # define font for figure text
    mpl.rc('xtick', labelsize=12)  # increase X axis ticksize
    mpl.rc('ytick', labelsize=12)  # increase Y axis ticksize
    if single_axis:
        plt.ylim((0.5, 2.5))
    plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100), fontsize=16)
    # With a single discriminant exp_var has no second entry; indexing it
    # unconditionally crashed the very case handled above.
    if len(exp_var) > 1:
        plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                   fontsize=16)
    else:
        plt.ylabel("LD2", fontsize=16)
    leg = plt.legend(loc="best", frameon=True, framealpha=1, fontsize=16)
    leg.get_frame().set_edgecolor('k')
    if style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        plt.savefig(out_fp, facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.1)
    else:
        plt.show()
Exemple #5
0
def plot_LDA(X_lda, y_lda, class_colors, exp_var, style, fig_size, label_pad,
             font_size, sids, dim=2, zangles=None, pt_size=250, out_fp=""):
    """
    Plot transformed LDA data as a 2D or 3D scatter plot.

    :param X_lda: 2-D array of LDA-transformed samples (rows) by discriminant
                  (columns).
    :param y_lda: Array of class labels aligned with the rows of X_lda.
    :param class_colors: Mapping of class label -> colour; its key order sets
                         the plotting order.
    :param exp_var: Explained variance ratio per discriminant. If an entry is
                    missing or unusable the axis is labelled "LD<n>" only.
    :param style: If truthy (and dim == 2), apply the ggplot2 style.
    :param fig_size: Figure size passed to plt.figure.
    :param label_pad: Padding between axis and axis label.
    :param font_size: Font size of the axis labels.
    :param sids: Sample IDs used to annotate points (2D only), or None.
    :param dim: 2 (default) or 3 for a 3D figure.
    :param zangles: For 3D figures, (azimuth, elevation) of the view.
    :param pt_size: Marker size.
    :param out_fp: Output file path; when empty the figure is shown instead.

    Exits the program via sys.exit when a 3D figure is requested but X_lda has
    fewer than three discriminants.
    """
    def _axis_label(component):
        """Return the axis label for the 1-based discriminant `component`."""
        name = "LD{}".format(component)
        try:
            pct = exp_var[component - 1] * 100
            return "{} (Percent Explained Variance: {:.3f}%)".format(name, pct)
        except (IndexError, KeyError, TypeError, ValueError):
            # Explained variance unavailable for this axis: plain label.
            return name

    # A list (not a dict view) so that .index() works on Python 3 as well.
    cats = list(class_colors)
    fig = plt.figure(figsize=fig_size)
    if dim == 3:
        # Explicit check instead of `assert`, which is stripped under -O.
        if X_lda.shape[1] < 3:
            sys.exit("\nLinear Discriminant Analysis requires at least 4 groups of "
                     "samples to create a 3D figure. Please update group information or "
                     "use the default 2D view of the results.\n")
        if sids is not None:
            print("\nPoint annotations are available only for 2D figures.\n")
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=zangles[1], azim=zangles[0])
        ax.set_zlabel(_axis_label(3), fontsize=font_size, labelpad=label_pad)
        for target_name in cats:
            in_class = y_lda == target_name
            ax.scatter(xs=X_lda[:, 0][in_class], ys=X_lda[:, 1][in_class],
                       zs=X_lda[:, 2][in_class], label=target_name,
                       c=class_colors[target_name], alpha=0.85, s=pt_size,
                       edgecolors="k", zdir="z")
    else:
        ax = fig.add_subplot(111)
        for i, target_name in enumerate(cats):
            in_class = y_lda == target_name
            cat_x = X_lda[:, 0][in_class]
            if X_lda.shape[1] == 1:
                # No second discriminant: stack the classes at y = 1, 2, ...
                cat_y = np.ones((cat_x.shape[0], 1)) + i
            else:
                cat_y = X_lda[:, 1][in_class]
            ax.scatter(x=cat_x, y=cat_y, label=target_name, alpha=0.85, s=pt_size,
                       color=class_colors[target_name], edgecolors="k")
        # Annotate data points with sample IDs
        if sids is not None:
            for sample, point, group in zip(sids, X_lda, y_lda):
                if len(point) >= 2:
                    xy = point[:2]
                else:
                    # Same y offset as used for the class when plotting above.
                    xy = (point[0], cats.index(group) + 1)
                # Text is passed positionally: its keyword name differs
                # between matplotlib releases (`s` vs `text`).
                ax.annotate(sample, xy=xy, xytext=(0, -15), ha="center",
                            va="center", textcoords="offset points")
    if X_lda.shape[1] == 1:
        plt.ylim((0.5, 2.5))
    ax.set_xlabel(_axis_label(1), fontsize=font_size, labelpad=label_pad)
    ax.set_ylabel(_axis_label(2), fontsize=font_size, labelpad=label_pad)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1, fontsize=15)
    leg.get_frame().set_edgecolor('k')
    if dim == 2 and style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        plt.savefig(out_fp, facecolor=fc, edgecolor="none", dpi=300, bbox_inches="tight",
                    pad_inches=0.1)
    else:
        plt.show()
Exemple #6
0
def main():
    """
    Command-line entry point: draw a 2D or 3D PCoA scatter plot.

    Reads the principal coordinates file and the metadata mapping file named
    in the parsed options, groups samples by the requested mapping
    categories, and plots the selected principal coordinates per group. The
    figure is saved to `args.out_fp` when given, otherwise shown on screen.
    Exits with an error message when either input file cannot be opened.
    """
    args = handle_program_options()

    # Fail early, with a readable message, when an input file is unreadable.
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict((condition, {"pc1": [], "pc2": [], "pc3": []})
                             for condition in data_gather)

    if not args.colors:
        # next(iterator) works on Python 2.6+ and 3; iterator.next() does not.
        bcolors = itertools.cycle(Set3_12.hex_colors)
        colors = [next(bcolors) for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header,
                                    args.group_by, args.colors)
        # Materialise the values: they are indexed by position below, which a
        # Python 3 dict view does not support.
        # NOTE(review): assumes the mapping's order matches `categories`.
        colors = list(colors.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    three_d = args.dimensions == 3
    # Work on a copy so the parsed options are not modified in place.
    pco = list(args.pc_order)
    if three_d:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if three_d:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        else:
            # Sample belongs to no group: skip it rather than filing it under
            # whichever group the previous sample matched.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if three_d:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"
    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if three_d:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # Label with the coordinate actually plotted, converted to float like
        # the x and y labels below.
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        xs = [e[1] for e in categories[cat]["pc1"]]
        ys = [e[1] for e in categories[cat]["pc2"]]
        if three_d:
            ax.scatter(xs=xs, ys=ys,
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        else:
            ax.scatter(xs, ys,
                       c=colors[i], s=args.point_size, label=cat, edgecolors="k")

        # Script to annotate PCoA sample points.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0], xy=(x[1], y[1]), xytext=(-10, -15),
                    textcoords="offset points", ha="center", va="center",
                    )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)), labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)), labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not three_d:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none", bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()
Exemple #7
0
def main():
    """
    Command-line entry point: embed a distance matrix with t-SNE and plot it.

    Reads a tab-separated distance matrix and a metadata mapping file, runs
    t-SNE on the precomputed distances, rescales the embedding to [0, 1] per
    axis, and plots the first two components coloured by the `group_by`
    category. The figure is saved to `args.out_fp` when given, otherwise
    shown on screen. Exits with an error message when an input file cannot
    be read.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        dm_data_sids = dm_data.index
        # Take all columns positionally. Selecting with range(n) looks the
        # integers up as column *labels*, which fails when the header holds
        # sample IDs.
        dm_data = pairwise_distances(dm_data.values, metric="precomputed")
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))

    # Mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))
    group_idx = header.index(args.group_by)
    y = [map_data[sid][group_idx] for sid in dm_data_sids]

    # Get colors for all categories
    if not args.color_by:
        # Sorted so the category -> colour assignment is the same on each run.
        categories = sorted(set(y))
        bcolors = itertools.cycle(Set1_9.hex_colors)
        # next(iterator) works on Python 2.6+ and 3; iterator.next() does not.
        cond_colors = {c: next(bcolors) for c in categories}
    else:
        cond_colors = util.color_mapping(map_data, header, args.group_by, args.color_by)

    # Prep input data for t-SNE
    X_tsne = TSNE(n_components=3, perplexity=args.perplexity, metric="precomputed",
                  method="exact", verbose=2, random_state=0, angle=0.8)
    X_new = X_tsne.fit_transform(dm_data)
    print("KL divergence after optimization: {}\n".format(X_tsne.kl_divergence_))
    # Min-max scale every component to the [0, 1] range.
    x_min, x_max = np.min(X_new, 0), np.max(X_new, 0)
    X_new = (X_new - x_min) / (x_max - x_min)

    # Plot t-SNE result
    fig = plt.figure(figsize=(14, 8))
    # Create the axes once; requesting the same subplot for every point is
    # wasteful and leaves `ax` undefined when there are no samples.
    ax = fig.add_subplot(111)
    for cond, sid, xy in zip(y, dm_data_sids, X_new):
        ax.scatter(x=xy[0], y=xy[1], s=args.point_size, c=cond_colors[cond],
                   alpha=0.9, edgecolors="k")
        if args.annotate:
            # Text is passed positionally: its keyword name differs between
            # matplotlib releases (`s` vs `text`).
            ax.annotate(sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                        textcoords="offset points", ha="center", va="center",
                        alpha=1, style="italic")
    if args.plot_title is not None:
        ax.set_title(args.plot_title, fontsize=16, weight="bold")
    # Empty scatter series serve as one legend handle per category.
    legend_handles = [plt.scatter([], [], c=cond_colors[cond], s=150, edgecolors="k")
                      for cond in cond_colors]
    plt.legend(legend_handles, ["{}".format(cond) for cond in cond_colors], loc="best",
               scatterpoints=3, frameon=True, framealpha=1, fontsize=14)
    ax.set_xlabel("t-SNE 1", fontsize=14)
    ax.set_ylabel("t-SNE 2", fontsize=14)
    plt.tight_layout()
    if args.ggplot2_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if args.out_fp:
        plt.savefig(args.out_fp, facecolor=fc, edgecolor="none", dpi=300, pad_inches=0.1,
                    bbox_inches="tight")
    else:
        plt.show()
Exemple #8
0
def plot_doc(do_values_df, order, ci=False, title=None, save=None):
    """
    Plot DOC diagram with confidence intervals.

    :type do_values_df: Pandas Dataframe
    :param do_values_df: Dataframe containing dissimilarity and overlap values with
                         lowess predictions. Columns read: "Overlap",
                         "Dissimilarity", "LOWESS" and, when ci is set,
                         "LOWESS_min" and "LOWESS_max".
    :type order: int
    :param order: Order of the polynomial to fit when calculating the residuals.
                  Currently unused (the polynomial fit is not plotted); kept
                  so existing callers keep working.
    :type ci: boolean
    :param ci: Set to plot the confidence intervals. Default is no plotting CI.
    :type title: str
    :param title: Text for plot title. No title is set when None.
    :type save: str
    :param save: File path to save the plot to. When None the plot is shown
                 instead.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    ax.scatter(x=do_values_df["Overlap"],
               y=do_values_df["Dissimilarity"],
               s=25,
               c="#111111",
               label="Dissimilarity-Overlap")
    ax.plot(do_values_df["Overlap"],
            do_values_df["LOWESS"],
            "r-",
            lw=3,
            label="LOWESS Smoothing",
            alpha=0.75)
    # Plot LOWESS fit R^2
    Rsqr_ld = r2_score(y_true=do_values_df["Dissimilarity"],
                       y_pred=do_values_df["LOWESS"],
                       multioutput="uniform_average")
    ld_r2_text = r"LOWESS Fit $R^2$: {:.3f}".format(Rsqr_ld)
    ax.text(x=0.1,
            y=0.97,
            s=ld_r2_text,
            ha="center",
            va="center",
            fontsize=14,
            transform=ax.transAxes)
    # Plain conditional rather than assert/except: asserts are stripped under
    # `python -O`, which silently disabled the confidence band.
    if ci:
        lowess = do_values_df["LOWESS"]
        # Band edges are the absolute distances to the min/max curves, laid
        # out symmetrically below and above the LOWESS line.
        ymin = lowess - (lowess - do_values_df["LOWESS_min"]).abs()
        ymax = lowess + (do_values_df["LOWESS_max"] - lowess).abs()
        ax.fill_between(do_values_df["Overlap"],
                        ymin,
                        ymax,
                        color="#ff0000",
                        alpha=0.25)

    ax.legend(fontsize=14, scatterpoints=4)
    ax.set_xlabel("Overlap", fontsize=14)
    ax.set_ylabel("Dissimilarity", fontsize=14)
    ax.set_xlim([0, 1.0])
    ax.set_ylim([0, 1.0])
    if title is not None:
        ax.set_title(title, weight="bold", fontsize=15)
    ggplot2_style(ax)
    fig.tight_layout()
    if save is not None:
        fig.savefig(save,
                    facecolor="white",
                    edgecolor="none",
                    bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()
Exemple #9
0
def main():
    """
    Command-line entry point: run LDA, plot it, and optionally draw one
    bubble plot per OTU.

    The LDA input is either a unifrac distance matrix or the relative
    abundances computed from a biom file, depending on
    `args.input_data_type`. When `args.bubble` names a file of OTU names, one
    bubble chart per listed OTU is written to `args.output_dir`, with bubble
    sizes given by the OTU's relative abundance times `args.scale_by`.
    Exits with an error message when an input file cannot be read.
    """
    args = handle_program_options()

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Parse and read mapping file and obtain group colors
    header, imap = util.parse_map_file(args.map_fp)
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)
    category_idx = header.index(args.group_by)

    # Build the LDA input table; only the data source differs per input type.
    rel_abd = None
    if args.input_data_type == "unifrac_dm":
        try:
            with open(args.unifrac_file):
                pass
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        lda_input = pd.read_csv(args.unifrac_file, sep="\t", index_col=0)
    else:
        # Load biom file and calculate relative abundance
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))
        lda_input = pd.DataFrame(rel_abd).T
    lda_input.insert(0, "Condition", [imap[sid][category_idx]
                                      for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")
    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)
    # Plot LDA
    plot_LDA(X_lda, y_lda, class_colors, exp_var, style=args.ggplot2_style,
             out_fp=args.out_fp)

    if args.bubble:
        # Get otus for LDA bubble plots. splitlines() handles "\r", "\n" and
        # "\r\n" separated files alike and keeps every name, not just those on
        # the last line.
        try:
            with open(args.bubble) as otu_file:
                bubble_otus = [name.strip()
                               for name in otu_file.read().splitlines()
                               if name.strip()]
        except IOError as ioe:
            err_msg = "\nError in OTU name list file (--bubble): {}\n"
            sys.exit(err_msg.format(ioe))

        # Load biom file and calculate relative abundance (unless the LDA
        # input above already required it).
        if rel_abd is None:
            try:
                rel_abd = get_relative_abundance(args.biom_file)
            except ValueError as ve:
                err_msg = "\nError with biom format file (-d): {}\n"
                sys.exit(err_msg.format(ve))

        # Calculate position and size of SampleIDs to plot for each OTU
        # NOTE(review): data[1] and exp_var[1] below need at least two
        # discriminants, although the ylim branch suggests one was expected
        # to work - confirm.
        for otuname in bubble_otus:
            plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                         for cat in class_colors}
            for sid, data in zip(sampleids, X_lda):
                category = plot_data[imap[sid][category_idx]]
                try:
                    size = rel_abd[sid][otuname] * args.scale_by
                except KeyError as ke:
                    print("{} not found in {} sample.".format(ke, sid))
                    continue
                category["x"].append(float(data[0]))
                category["y"].append(float(data[1]))
                category["size"].append(size)

            # Plot LDA bubble for each OTU
            fig = plt.figure(figsize=(12, 9))
            ax = fig.add_subplot(111)
            for cat in plot_data:
                plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                            plot_data[cat]["size"], label=cat,
                            color=class_colors[cat],
                            alpha=0.85, marker="o", edgecolor="k")
            mpl.rc("font", family="Arial")  # define font for figure text
            mpl.rc("xtick", labelsize=12)  # increase X axis ticksize
            mpl.rc("ytick", labelsize=12)  # increase Y axis ticksize
            if X_lda.shape[1] == 1:
                plt.ylim((0.5, 2.5))
            plt.title(" ".join(otuname.split("_")), style="italic")
            plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                       fontsize=12)
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=12)
            lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=12)
            # Change the legend marker size manually. Walk the legend's own
            # handle list (`legend_handles` on newer matplotlib,
            # `legendHandles` on older) rather than indexing by a count taken
            # from elsewhere.
            handles = getattr(lgnd, "legend_handles", None)
            if handles is None:
                handles = lgnd.legendHandles
            for handle in handles:
                handle.set_sizes([75])

            # Set style for LDA bubble plots
            if args.ggplot2_style:
                gu.ggplot2_style(ax)
                fc = "0.8"
            else:
                fc = "none"

            # Save LDA bubble plots to output directory
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
            fig.savefig(os.path.join(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                        facecolor=fc, edgecolor="none", dpi=300,
                        bbox_inches="tight", pad_inches=0.2)
            plt.close(fig)
def main():
    """
    Command-line entry point: draw one LDA bubble plot per requested OTU.

    Samples are positioned by their first two linear discriminants, computed
    either from a distance matrix (`args.dist_matrix_file`) or from the
    arcsine-square-root transformed relative abundances of the biom table.
    Bubble sizes are the transformed abundance of the OTU times
    `args.scale_by`; a second legend shows reference bubble sizes. One figure
    per OTU ID listed in `args.otu_ids_fp` is written to `args.output_dir`.
    Exits with an error message when an input file cannot be read.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)

    # Get otus for LDA bubble plots: one ID per line, blank lines skipped and
    # duplicates dropped. Read as plain text so the IDs stay strings;
    # read_csv(sep="\n") is rejected by current pandas and would turn numeric
    # IDs into integers.
    try:
        with open(args.otu_ids_fp) as otu_fh:
            bubble_otus = []
            seen = set()
            for line in otu_fh:
                otu_id = line.strip()
                if otu_id and otu_id not in seen:
                    seen.add(otu_id)
                    bubble_otus.append(otu_id)
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    abd_val = {abd for sample_abd in rel_abd.values()
               for abd in sample_abd.values() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
    else:
        lda_input = pd.DataFrame(rel_abd).T
    lda_input.insert(0, "Condition", [imap[sid][category_idx]
                                      for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")
    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Axis labels are the same for every OTU, so build them once. Fall back to
    # the bare axis name when the explained variance is unavailable.
    try:
        x_label = "LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100)
    except (IndexError, TypeError, ValueError):
        x_label = "LD1"
    try:
        y_label = "LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100)
    except (IndexError, TypeError, ValueError):
        y_label = "LD2"

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat, color=class_colors[cat],
                        alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)
        plt.xlabel(x_label, fontsize=13, labelpad=15)
        plt.ylabel(y_label, fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        # Change the legend marker size manually. Walk the legend's own handle
        # list (`legend_handles` on newer matplotlib, `legendHandles` on
        # older) rather than indexing by a count taken from elsewhere.
        handles = getattr(lgnd1, "legend_handles", None)
        if handles is None:
            handles = lgnd1.legendHandles
        for handle in handles:
            handle.set_sizes([80])
        # Add the legend manually to the current plot, because the second
        # plt.legend call below would otherwise replace it.
        plt.gca().add_artist(lgnd1)

        # Empty scatter series act as handles for the bubble-size legend.
        size_handles = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
                        for s1 in bubble_range]
        plt.legend(size_handles, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True, labelspacing=2,
                   fontsize=13, loc=4, scatterpoints=1, borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
Exemple #11
0
def plot_LDA(X_lda,
             y_lda,
             class_colors,
             exp_var,
             style,
             fig_size,
             label_pad,
             font_size,
             sids,
             dim=2,
             zangles=None,
             pt_size=250,
             out_fp=""):
    """
    Plot transformed LDA data as a 2D or 3D scatter plot, one series per class.

    :param X_lda: 2D array of transformed samples; rows are samples and
                  columns are linear discriminants (indexed as X_lda[:, k]).
    :param y_lda: Class label of each row of X_lda. It is compared elementwise
                  against the keys of class_colors, so it is assumed to be a
                  NumPy array -- TODO confirm against callers.
    :param class_colors: Mapping of class label to color. Its key order sets
                         the plotting order and, when only one discriminant
                         is available, the row on which each class is drawn.
    :param exp_var: Explained variance per discriminant; each value is
                    multiplied by 100 for the axis labels. When an entry is
                    missing or unusable the axis gets a plain "LDn" label.
    :param style: If truthy (and the plot is 2D), gu.ggplot2_style is applied.
    :param fig_size: Passed to plt.figure(figsize=...).
    :param label_pad: Padding for the axis labels.
    :param font_size: Font size for the axis labels.
    :param sids: Sample IDs aligned with the rows of X_lda, used to annotate
                 the points of 2D plots, or None for no annotations.
    :param dim: 3 for a 3D figure; any other value gives a 2D figure.
    :param zangles: Viewing angles for 3D figures; zangles[0] is used as the
                    azimuth and zangles[1] as the elevation. When None the
                    default view of the 3D axes is kept.
    :param pt_size: Marker size passed to scatter.
    :param out_fp: Output file path. If empty, the figure is shown instead.
    """
    # A real list: dict views have no .index() on Python 3
    cats = list(class_colors)
    fig = plt.figure(figsize=fig_size)

    def _ld_label(index):
        # Fall back to the bare axis name when exp_var has no usable entry
        try:
            return "LD{} (Percent Explained Variance: {:.3f}%)".format(
                index + 1, exp_var[index] * 100)
        except (IndexError, KeyError, TypeError, ValueError):
            return "LD{}".format(index + 1)

    if dim == 3:
        # Explicit check instead of assert, which is stripped under -O
        if X_lda.shape[1] < 3:
            sys.exit(
                "\nLinear Discriminant Analysis requires at least 4 groups of "
                "samples to create a 3D figure. Please update group information or "
                "use the default 2D view of the results.\n")
        if sids is not None:
            print("\nPoint annotations are available only for 2D figures.\n")
        ax = fig.add_subplot(111, projection="3d")
        if zangles is not None:
            ax.view_init(elev=zangles[1], azim=zangles[0])
        ax.set_zlabel(_ld_label(2), fontsize=font_size, labelpad=label_pad)
        for target_name in cats:
            members = y_lda == target_name
            ax.scatter(xs=X_lda[:, 0][members],
                       ys=X_lda[:, 1][members],
                       zs=X_lda[:, 2][members],
                       label=target_name,
                       c=class_colors[target_name],
                       alpha=0.85,
                       s=pt_size,
                       edgecolors="k",
                       zdir="z")
    else:
        ax = fig.add_subplot(111)
        single_ld = X_lda.shape[1] == 1
        for i, target_name in enumerate(cats):
            members = y_lda == target_name
            cat_x = X_lda[:, 0][members]
            if single_ld:
                # Only one discriminant: give each class its own row (1, 2, ...)
                cat_y = np.ones((cat_x.shape[0], 1)) + i
            else:
                cat_y = X_lda[:, 1][members]
            ax.scatter(x=cat_x,
                       y=cat_y,
                       label=target_name,
                       alpha=0.85,
                       s=pt_size,
                       color=class_colors[target_name],
                       edgecolors="k")
        # Annotate data points with sample IDs
        if sids is not None:
            for sample, point, group in zip(sids, X_lda, y_lda):
                if len(point) < 2:
                    # Match the per-class row used for the scatter above
                    point = (point[0], cats.index(group) + 1)
                # Text is passed positionally: the keyword was `s` in older
                # matplotlib releases and `text` in newer ones.
                ax.annotate(sample,
                            xy=point[:2],
                            xytext=(0, -15),
                            ha="center",
                            va="center",
                            textcoords="offset points")
    if X_lda.shape[1] == 1:
        # Rows sit at 1..len(cats); never shrink below the historical 2 rows
        ax.set_ylim((0.5, max(2, len(cats)) + 0.5))

    ax.set_xlabel(_ld_label(0), fontsize=font_size, labelpad=label_pad)
    ax.set_ylabel(_ld_label(1), fontsize=font_size, labelpad=label_pad)

    leg = ax.legend(loc="best",
                    scatterpoints=3,
                    frameon=True,
                    framealpha=1,
                    fontsize=15)
    leg.get_frame().set_edgecolor('k')
    if dim == 2 and style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if out_fp:
        fig.savefig(out_fp,
                    facecolor=fc,
                    edgecolor="none",
                    dpi=300,
                    bbox_inches="tight",
                    pad_inches=0.1)
    else:
        plt.show()
Exemple #12
0
def main():
    """
    Command-line entry point: draw a 2D or 3D PCoA scatter plot.

    Reads the principal coordinates file (args.coord_fp) and the metadata
    mapping file (args.map_fp), groups the samples by the mapping-file
    column(s) named in args.group_by, and plots the principal coordinates
    selected by args.pc_order with one colored series per group. The figure
    is saved to args.out_fp when given, otherwise it is shown.

    Exits through sys.exit with an explanatory message if either input file
    cannot be opened.
    """
    args = handle_program_options()

    # Fail early with a readable message if an input file cannot be opened
    input_checks = (
        (args.coord_fp,
         "\nError in input principal coordinates filepath (-i): {}\n"),
        (args.map_fp,
         "\nError in input metadata mapping filepath (-m): {}\n"),
    )
    for in_fp, err_msg in input_checks:
        try:
            with open(in_fp):
                pass
        except IOError as ioe:
            sys.exit(err_msg.format(ioe))

    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict(
        (condition, {"pc1": [], "pc2": [], "pc3": []})
        for condition in data_gather)

    if not args.colors:
        bcolors = itertools.cycle(Set3_12.hex_colors)
        # next() builtin instead of the Python 2-only .next() method
        colors = [next(bcolors) for _ in categories]
    else:
        # Materialize the values: dict views cannot be indexed on Python 3.
        # NOTE(review): assumes color_mapping yields colors in the same order
        # as `categories` -- confirm against util.color_mapping.
        colors = list(util.color_mapping(imap, map_header, args.group_by,
                                         args.colors).values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Work on a copy so the parsed command-line options are not mutated
    pco = list(args.pc_order)
    if args.dimensions == 3:
        pco.append(3)

    varexp = parsed_unifrac["varexp"]
    pc1v = varexp[pco[0] - 1]
    pc2v = varexp[pco[1] - 1]
    if args.dimensions == 3:
        pc3v = varexp[pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        else:
            # Sample belongs to no group: skip it rather than filing it
            # under whichever group the previous sample matched.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"
    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        # float() mirrors the PC1/PC2 labels below; the `.3f` format spec
        # rejects a string value.
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)),
                      labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        pc1_vals = [e[1] for e in categories[cat]["pc1"]]
        pc2_vals = [e[1] for e in categories[cat]["pc2"]]
        if args.dimensions == 3:
            ax.scatter(xs=pc1_vals,
                       ys=pc2_vals,
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z",
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")
        else:
            ax.scatter(pc1_vals,
                       pc2_vals,
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")

        # Annotate PCoA sample points with their sample IDs.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0],
                    xy=(x[1], y[1]),
                    xytext=(-10, -15),
                    textcoords="offset points",
                    ha="center",
                    va="center",
                )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and args.dimensions != 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp,
                    facecolor="white",
                    edgecolor="none",
                    bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()