def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Draw the RBS logo and spacer curve stored in a GMS2 model file.

    The model is read from ``pf_mod``. Its ``RBS_MAT`` matrix is converted
    from probabilities to information content (with ``NON_MAT`` order 0 as
    background) and drawn as a logo on the left panel; the motif's spacer
    values are drawn as a line on the right panel. The figure is saved to
    ``next_name(pd_figures)`` and then shown.

    :param pd_figures: directory handed to ``next_name`` to choose the file
    :param pf_mod: path of the GMS2 model file
    :param title: title of the logo panel
    """
    model_items = GMS2Mod.init_from_file(pf_mod).items
    motif = MotifModel(model_items["RBS_MAT"], model_items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(model_items["NON_MAT"])

    import matplotlib.pyplot as plt
    fig, axes = plt.subplots(1, 2)
    logo_ax = axes[0]
    spacer_ax = axes[1]

    import logomaker as lm
    information = lm.transform_matrix(
        motif.pwm_to_df(),
        from_type="probability",
        to_type="information",
        background=noncoding.pwm_to_array(0),
    )
    lm.Logo(information, ax=logo_ax)
    logo_ax.set_title(title)
    # Information content of a 4-letter alphabet is capped at 2 bits.
    logo_ax.set_ylim(0, 2)

    spacer = motif._spacer
    spacer_table = pd.DataFrame({
        "Distance from start": range(len(spacer)),
        "Probability": spacer,
    })
    sns.lineplot(
        spacer_table,
        "Distance from start",
        "Probability",
        ax=spacer_ax,
        figure_options=FigureOptions(ylim=[0, 0.4]),
    )

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))

    plt.show()
def plot_sensitivities_vs_num_candidates(sensitiviies_func, max_candidates,
                                         sen_a, sen_b):
    # type: (Dict[str, Callable], int, float, float) -> None
    """Plot each sensitivity function against the number of candidates.

    Every callable in ``sensitiviies_func`` is evaluated as
    ``func(i, sen_a, sen_b)`` for ``i`` in ``1..max_candidates``; the results
    are stacked into long form and drawn as one line per condition.

    :param sensitiviies_func: mapping from condition name to callable
    :param max_candidates: largest number of candidates to evaluate (inclusive)
    :param sen_a: forwarded unchanged as second argument of each callable
    :param sen_b: forwarded unchanged as third argument of each callable
    """
    rows = [
        {
            "Number of candidates": count,
            **{
                label: func(count, sen_a, sen_b)
                for label, func in sensitiviies_func.items()
            },
        }
        for count in range(1, max_candidates + 1)
    ]
    wide = pd.DataFrame(rows)

    # One output row per (number of candidates, condition) pair.
    condition_names = sorted(sensitiviies_func)
    long_form = stack_columns_as_rows(wide, condition_names, "Probability",
                                      condition_names, "Condition")

    sns.lineplot(long_form,
                 "Number of candidates",
                 "Probability",
                 hue="Condition")
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3' and 5'-3' match percentages against minimum support.

    For every row of ``df`` the value stored in column
    ``"by_support_<tag>"`` is turned into a data frame, labelled with the
    row's ``"Genome"`` and concatenated. Two line plots (one line per genome)
    are then written to files named by ``next_name(pd_work)``.

    :param df: one row per genome; needs columns ``"Genome"`` and
        ``"by_support_<tag>"``
    :param pd_work: directory handed to ``next_name`` for the output files
    :param fn_prefix: unused by this function; kept for interface compatibility
    :param tag: tool name used to pick the columns and to label the plots
    :raises ValueError: if no genome is left to plot
    """
    # Genomes left out of these plots.
    excluded_genomes = {"A. pernix", "Synechocystis"}
    support_column = "by_support_{}".format(tag)

    list_df = list()
    for index in df.index:
        genome = df.at[index, "Genome"]
        # Decide before building the frame so excluded genomes cost nothing.
        if genome in excluded_genomes:
            continue

        curr_df = pd.DataFrame(df.at[index, support_column])
        curr_df["Genome"] = genome
        list_df.append(curr_df)

    if not list_df:
        # pd.concat would raise ValueError as well, but without saying why.
        raise ValueError(
            "No genomes left to plot for column '{}'".format(support_column))

    df_acc = pd.concat(list_df)

    sns.lineplot(
        df_acc,
        "Min Support",
        "Percentage 3p match: Verified from {}".format(tag),
        hue="Genome",
        figure_options=FigureOptions(
            title="Percentage of verified genes predicted\nby {}".format(tag),
            ylabel="Percentage",
            save_fig=next_name(pd_work),
            ylim=[None, 100.5]))

    sns.lineplot(
        df_acc,
        "Min Support",
        "Percentage 5p-3p match: Verified from {}".format(tag),
        hue="Genome",
        figure_options=FigureOptions(
            title="Percentage of predicted {} genes\nwith correct 5' end".
            format(tag),
            ylabel="Percentage of 5p-3p match",
            save_fig=next_name(pd_work),
            ylim=[90, 100.5]))
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis and plot its summary.

    The analysis runs directly, or through PBS when the parallelization
    options have ``"use-pbs"`` set. The resulting table is written to
    ``summary.csv`` in the working directory and a fixed series of plots is
    saved into the ``summary_figures`` sub-directory.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param args: parsed command-line arguments; ``pf_genome_list`` is read here
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        partial_frames = pbs.run(
            data={"gil": gil},
            func=relative_entropy_analysis,
            func_kwargs={"env": env, "prl_options": prl_options},
        )
        df = pd.concat(partial_frames, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, gil, prl_options)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    def error_axis_options():
        # Fresh options (and a fresh output name) for every "Error" plot.
        return FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures))

    def plain_options():
        return FigureOptions(save_fig=next_name(pd_figures))

    sns.scatterplot(df, "Percent", "Error",
                    figure_options=error_axis_options())

    for x_column in ("RE", "RE Motif", "RE Spacer"):
        sns.lineplot(df, x_column, "Error", hue="Genome",
                     figure_options=error_axis_options())

    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome",
                    identity=True,
                    figure_options=plain_options())

    for x_column in ("Percent", "RE", "RE Motif", "RE Spacer"):
        sns.lmplot(df, x_column, "Error", hue="Genome",
                   figure_options=error_axis_options())

    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=plain_options())
def _add_query_percentages(df, x_column):
    # type: (pd.DataFrame, str) -> None
    """Add per-ancestor percentage and cumulative-percentage columns in place.

    For each ``"Ancestor"`` group, ``"Percentage-of-queries"`` is the row's
    ``"Number-of-queries"`` as a percentage of the group total, and
    ``"Cumulative-percentage-of-queries"`` is the running sum of those
    percentages with rows ordered by ``x_column``. The frame's index is reset
    first (the old index becomes a column, as ``reset_index`` does).

    :param df: binned data with ``"Ancestor"``, ``"Number-of-queries"`` and
        ``x_column`` columns; modified in place
    :param x_column: column that defines the accumulation order
    """
    # Float initial values: the columns receive float percentages below.
    df["Percentage-of-queries"] = 0.0
    df["Cumulative-percentage-of-queries"] = 0.0
    df.reset_index(inplace=True)

    for _, df_group in df.groupby("Ancestor"):
        # Sort a copy; sorting the groupby slice in place is not reliable.
        ordered = df_group.sort_values(x_column)
        counts = ordered["Number-of-queries"]
        percentages = 100 * counts / float(counts.sum())

        # Assignment aligns on index labels, so row order does not matter.
        df.loc[ordered.index, "Percentage-of-queries"] = percentages
        df.loc[ordered.index,
               "Cumulative-percentage-of-queries"] = percentages.cumsum()


def one_dim_Kimura_accuracy(env, df_all, num_steps=20):
    # type: (Environment, pd.DataFrame, int) -> None
    """Plot accuracy and query counts binned by Kimura distance.

    The data is binned with ``bin_data_one_d`` twice: by ``"Average-Kimura"``
    on all rows, and by ``"Std-Kimura"`` on rows with ``"Support"`` above 2.
    For each binning four line plots (one line per ancestor) are saved under
    successive ``next_name(env["pd-work"])`` names: accuracy, number of
    queries, percentage of queries and cumulative percentage of queries.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df_all: per-query data handed to ``bin_data_one_d``
    :param num_steps: number of bins handed to ``bin_data_one_d``
    """
    import matplotlib.pyplot as plt
    pd_work = env["pd-work"]

    # Binned by average Kimura distance.
    df = bin_data_one_d(env, df_all, "Average-Kimura", num_steps)
    for y_column in ("Accuracy", "Number-of-queries"):
        sns.lineplot(df,
                     "Average-Kimura",
                     y_column,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work)),
                     sns_kwargs={"palette": CM.get_map("ancestor")})

    _add_query_percentages(df, "Average-Kimura")

    # This plot alone gets its own wide axis, explicit labels and is shown.
    _, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(df,
                 "Average-Kimura",
                 "Percentage-of-queries",
                 hue="Ancestor",
                 figure_options=FigureOptions(save_fig=next_name(pd_work),
                                              ylabel="Percentage of queries",
                                              xlabel="Average Kimura"),
                 ax=ax,
                 show=True,
                 legend_loc="best",
                 sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.lineplot(df,
                 "Average-Kimura",
                 "Cumulative-percentage-of-queries",
                 hue="Ancestor",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"palette": CM.get_map("ancestor")})

    # Binned by the standard deviation of the Kimura distance.
    df = bin_data_one_d(env, df_all[df_all["Support"] > 2], "Std-Kimura",
                        num_steps)
    for y_column in ("Accuracy", "Number-of-queries"):
        sns.lineplot(df,
                     "Std-Kimura",
                     y_column,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work)),
                     sns_kwargs={"palette": CM.get_map("ancestor")})

    _add_query_percentages(df, "Std-Kimura")

    for y_column in ("Percentage-of-queries",
                     "Cumulative-percentage-of-queries"):
        sns.lineplot(df,
                     "Std-Kimura",
                     y_column,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work)),
                     sns_kwargs={"palette": CM.get_map("ancestor")})
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative steps A, A+B, A+B+C.

    Adds ``"Total SBSP"``, ``"Total GMS2"`` and ``"Total GMS2=SBSP"`` columns
    to ``df`` (in place), summarises the rows predicted up to each step with
    ``get_summary_per_gcfid`` and draws a 3x2 grid: left column for SBSP,
    right column for GMS2=SBSP; rows are sensitivity, coverage and counts.
    The figure is saved and shown. Afterwards the totals are recomputed for
    rows that also have ``"NCBI"`` set, two summary tables are printed, and
    the interpreter is terminated with ``sys.exit()``.

    :param env: environment; ``env['pd-work']`` is the output directory
    :param df: per-gene data; modified in place
    """
    pd_work = env['pd-work']
    tools = ("SBSP", "GMS2", "GMS2=SBSP")

    # Per genome: total number of predictions of each tool.
    for _, members in df.groupby("GCFID", as_index=False):
        rows = members.index
        for tool in tools:
            df.loc[rows, "Total " + tool] = df.loc[rows, tool].sum()

    # Summaries for the cumulative steps "A", "A+B" and "A+B+C".
    steps_so_far = []
    summaries = []
    for step in ("A", "B", "C"):
        steps_so_far.append(step)
        summary = get_summary_per_gcfid(df[df["Predicted-at-step"] <= step])
        summary["SBSP Step"] = "+".join(steps_so_far)
        summaries.append(summary)

    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # (y-axis label, y-limits) for the three rows of the grid.
    row_styles = (
        ("Sensitivity", (85, 105)),
        ("Percent of Genes", (0, None)),
        ("Number of Genes", (0, None)),
    )
    # Columns plotted in each grid column, top to bottom.
    grid_columns = (
        ("Sen(SBSP,NCBI)", "Cov(SBSP,NCBI)", "SBSP"),
        ("Sen(GMS2=SBSP,NCBI)", "Cov(GMS2=SBSP,NCBI)", "GMS2=SBSP"),
    )
    last_col = len(grid_columns) - 1
    last_row = len(row_styles) - 1

    for col, y_columns in enumerate(grid_columns):
        column_axes = axes[:, col]
        for row, y_column in enumerate(y_columns):
            ylabel, ylim = row_styles[row]
            extra = {}
            # Only the bottom-right panel is drawn with its legend; its
            # handles feed the figure-level legend below.
            if not (col == last_col and row == last_row):
                extra["legend"] = False
            sns.lineplot(df_per_gcfid_per_step,
                         "SBSP Step",
                         y_column,
                         hue="GCFID",
                         ax=column_axes[row],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=list(ylim)),
                         **extra)

        if col == last_col:
            column_axes[last_row].get_legend().remove()

        fig.align_ylabels(column_axes)

    flat_axes = axes.ravel()
    for axis in flat_axes:
        axis.set_xlabel("Steps")

    axes[0][0].set_title("SBSP")
    axes[0][1].set_title("GMS2=SBSP")

    fig.subplots_adjust(bottom=0.21)

    # Shared legend at the bottom, built from the bottom-right panel.
    handles, labels = flat_axes[-1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(pd_work))
    plt.show()

    # Totals again, this time counting only rows that also have NCBI set.
    for _, members in df.groupby("GCFID", as_index=False):
        has_ncbi = members["NCBI"]
        for tool in tools:
            df.loc[members.index,
                   "Total " + tool] = ((members[tool]) & (has_ncbi)).sum()

    df_all = get_summary_per_gcfid(df)

    for metric in ("Sen", "Cov2"):
        shown = ["GCFID", "NCBI"] + [
            "{}({},NCBI)".format(metric, tool) for tool in tools
        ]
        print(df_all[shown].to_string(index=False))

    # NOTE(review): this ends the whole program, not just this function.
    import sys
    sys.exit()
# Example #7
# 0
def viz_per_genome(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot BLASTp hit counts per genome, grouped by ancestor.

    Produces, in order: a categorical plot of the mean ``"BLAST"`` value per
    (genome, ancestor); two line plots of the cumulative percentage of
    queries with fewer than ``x`` hits (axes swapped in the second); and a
    2x2 grid with one such curve per ancestor, restricted to ``x <= 40``.
    Output names come from ``next_name(env["pd-work"])``.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df: one row per query with ``"Genome"``, ``"Ancestor"``, ``"BLAST"``
    """

    def apply_font_sizes(small, medium, bigger):
        # Global matplotlib text settings used by the plots that follow.
        matplotlib.rcParams.update({
            'font.family': 'serif',
            'text.usetex': True,
            'pgf.rcfonts': False,
            'font.size': small,  # default text
            'axes.titlesize': small,  # axes titles
            'axes.labelsize': medium,  # x and y labels
            'xtick.labelsize': small,  # tick labels
            'ytick.labelsize': small,  # tick labels
            'legend.fontsize': 12,  # legend
            'figure.titlesize': bigger,  # figure title
        })

    per_genome_mean = df.groupby(["Genome", "Ancestor"],
                                 as_index=False).mean()
    sns.catplot(per_genome_mean,
                "Ancestor",
                "BLAST",
                figure_options=FigureOptions(save_fig=next_name(
                    env["pd-work"]),
                                             xlabel="Clade",
                                             ylabel="Number of BLASTp Hits"),
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # Hit-count thresholds: 0, 5, then each 1.2 times the previous (40 total).
    thresholds = [0]
    while len(thresholds) < 40:
        previous = thresholds[-1]
        thresholds.append(5 if previous == 0 else previous * 1.2)

    # Per genome: percentage of its queries with fewer hits than each threshold.
    records = []
    for _, genome_rows in df.groupby("Genome", as_index=False):
        first_label = genome_rows.index[0]
        genome = df.at[first_label, "Genome"]
        ancestor = df.at[first_label, "Ancestor"]
        total_queries = len(genome_rows)

        for limit in thresholds:
            below = len(genome_rows[genome_rows["BLAST"] < limit])
            records.append({
                "Genome": genome,
                "Ancestor": ancestor,
                "x": limit,
                "y": 100 * below / total_queries,
            })

    df_tmp = pd.DataFrame(records)

    apply_font_sizes(16, 22, 24)

    axis_labels = {
        "x": "Number of BLASTp hits",
        "y": "Cumulative percentage of queries (per genome)",
    }
    # Same data twice: hits on the horizontal axis, then on the vertical one.
    for horizontal, vertical in (("x", "y"), ("y", "x")):
        sns.lineplot(df_tmp,
                     horizontal,
                     vertical,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         xlabel=axis_labels[horizontal],
                         ylabel=axis_labels[vertical],
                         save_fig=next_name(env["pd-work"]),
                     ),
                     legend_loc="best",
                     legend_title="",
                     legend_ncol=2,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })

    apply_font_sizes(14, 18, 20)
    fig, axes = plt.subplots(2, 2, sharex="all", sharey="all")

    # One panel per ancestor; zip stops at whichever runs out first.
    for anc, panel in zip(sorted(set(df["Ancestor"])), axes.ravel()):
        of_ancestor = df_tmp[df_tmp["Ancestor"] == anc]
        sns.lineplot(of_ancestor[of_ancestor["x"] <= 40],
                     "x",
                     "y",
                     hue="Ancestor",
                     legend=None,
                     ax=panel,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })
        panel.set_title(anc)
        panel.set_xlabel("")
        panel.set_ylabel("")

    # Only the labels are read from these options; the name reserved by
    # save_fig is not the one the figure is written to below.
    figure_options = FigureOptions(
        xlabel="Number of BLASTp hits",
        ylabel="Cumulative percentage of\nqueries (per genome)",
        save_fig=next_name(env["pd-work"]),
    )

    # Invisible axis spanning the grid, used only to carry shared labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(which="both",
                    top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(figure_options.xlabel, labelpad=30)
    plt.ylabel(figure_options.ylabel, labelpad=30)

    fig.savefig(next_name(env["pd-work"]), bbox_inches="tight")
    plt.show()
# Example #8
# 0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and toolp RBS motifs for every genome in the list.

    For each genome the two motif logos are drawn side by side, saved into
    the ``figures`` sub-directory and shown, and one (non-verified) or two
    (verified) summary entries are collected. The collected entries are then
    written as CSV and plotted; which plots are made depends on
    ``args.verified``.

    :param env: environment; ``env["pd-work"]`` and ``env["pd-data"]`` are read
    :param args: parsed arguments; ``pf_genome_list`` and ``verified`` are read
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # Models produced by GMS2 and by toolp for this genome.
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all",
                                 figsize=(8, 4))

        # Left panel: GMS2 motif; right panel: toolp motif.
        panels = zip(axes, (df_gms2, df_toolp), ("GeneMarkS-2", "StartLink+"))
        for axis, pwm, panel_title in panels:
            info = lm.transform_matrix(pwm,
                                       from_type="probability",
                                       to_type="information")
            lm.Logo(info, color_scheme="classic", ax=axis)
            axis.set_ylim(0, 2)
            axis.set_title(panel_title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(
            os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if args.verified:
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(
                env, gi, group=group)
            # One entry per tool; comp[0] belongs to GMS2, comp[1] to the
            # GMS2 run that uses the toolp motif.
            per_tool = (("GMS2", rel_gms2), ("GMS2 with SL", rel_toolp))
            for position, (tool, rel) in enumerate(per_tool):
                list_run_info.append({
                    "Genome": fix_names(gi.name),
                    "Error": 100 - comp[position],
                    "Tool": tool,
                    "RE": rel,
                    "GC": gc
                })

            print(list_run_info[-2:])
        else:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })

    import sbsp_viz.sns as sns

    def scatter_gc_vs_accuracy(frame):
        sns.scatterplot(frame, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

    def scatter_relative_entropies(frame):
        sns.scatterplot(frame, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])))

    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        for y_column, ylabel in (("Error", "Error"),
                                 ("RE", "Relative entropy")):
            sns.lineplot(df, "Genome", y_column, hue="Tool",
                         figure_options=FigureOptions(
                             save_fig=next_name(env["pd-work"]),
                             xlabel="Genome",
                             ylabel=ylabel,
                         ))
        return

    # First pass: every genome.
    df = pd.DataFrame(list_run_info)
    scatter_gc_vs_accuracy(df)
    df.to_csv(next_name(env["pd-work"], ext="csv"))
    scatter_relative_entropies(df)
    print("Average Error: {}".format(df["Accuracy"].mean()))

    # Second pass: only genomes whose "Accuracy" value is below 2.
    df = pd.DataFrame(list_run_info)
    df = df[df["Accuracy"] < 2].copy()
    scatter_gc_vs_accuracy(df)
    scatter_relative_entropies(df)
    print("Average Error: {}".format(df["Accuracy"].mean()))

    df.to_csv(next_name(env["pd-work"], ext="csv"))
# Example #9
# 0
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative SBSP steps and export tables.

    The function proceeds in four stages:

    1. For every "GCFID" group, the sums of the "SBSP", "GMS2" and
       "GMS2=SBSP" columns are stored on each row of the group as
       "Total SBSP", "Total GMS2" and "Total GMS2=SBSP".
    2. ``get_summary_per_gcfid`` is applied to the rows whose
       "Predicted-at-step" value is <= "A", <= "B" and <= "C" in turn; the
       three results are tagged "A", "A+B" and "A+B+C" in a "SBSP Step"
       column and concatenated.
    3. A 3x2 grid of line plots (one line per GCFID, x-axis "SBSP Step") is
       drawn, saved under ``next_name(env["pd-work"])`` and shown. The left
       column is titled ``TOOL`` and the right column ``TOOLp``.
    4. The "Total ..." columns are overwritten with the per-GCFID sums of
       ``<column> & NCBI``, the summary is recomputed on the full frame, and
       "sensitivity.csv" / "coverage.csv" are written into ``env["pd-work"]``.

    Note that ``df`` is modified in place (the "Total ..." columns are
    added and later overwritten).

    :param env: environment mapping; only ``env["pd-work"]`` (the output
        directory) is read here
    :param df: frame with at least the columns "GCFID", "SBSP", "GMS2",
        "GMS2=SBSP", "NCBI" and "Predicted-at-step"
    """
    import matplotlib.pyplot as plt

    count_columns = ("SBSP", "GMS2", "GMS2=SBSP")

    # compute total number of predictions per tool, per genome
    for _, df_group in df.groupby("GCFID", as_index=False):
        rows = df_group.index
        for name in count_columns:
            df.loc[rows, "Total " + name] = df.loc[rows, name].sum()

    # gather analysis for steps A, A+B, and A+B+C
    list_df = []  # type: List[pd.DataFrame]
    steps_so_far = []
    for step in ["A", "B", "C"]:
        steps_so_far.append(step)
        df_summary_per_gcfid = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        df_summary_per_gcfid["SBSP Step"] = "+".join(steps_so_far)
        list_df.append(df_summary_per_gcfid)

    df_per_gcfid_per_step = pd.concat(list_df, sort=False)

    def plot_metric(y, axis, ylabel, ylim, keep_legend=False):
        # One line per genome for column `y` against the cumulative step.
        # `legend=False` is passed explicitly unless the legend is wanted, in
        # which case the wrapper's own default is left untouched.
        legend_kwargs = {} if keep_legend else {"legend": False}
        sns.lineplot(df_per_gcfid_per_step,
                     "SBSP Step",
                     y,
                     hue="GCFID",
                     ax=axis,
                     sns_kwargs={"palette": CM.get_map("verified")},
                     figure_options=FigureOptions(ylabel=ylabel, ylim=ylim),
                     **legend_kwargs)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # left column
    left = axes[:, 0]
    # raw string: the label carries a literal backslash before the percent
    # sign, which is an invalid escape sequence in a regular string literal
    plot_metric("Sen(SBSP,NCBI)", left[0], r"Error rate (\%)", [0, 20])
    plot_metric("Cov(SBSP,NCBI)", left[1], "Percentage\nof Genes", [0, None])
    plot_metric("SBSP", left[2], "Number\nof Genes", [0, None])
    fig.align_ylabels(left)

    # right column; the bottom-right plot keeps its legend so that its
    # handles can be reused for the figure-level legend below
    right = axes[:, 1]
    plot_metric("Sen(GMS2=SBSP,NCBI)", right[0], "Error", [0, None])
    plot_metric("Cov(GMS2=SBSP,NCBI)", right[1], "Percentage of Genes",
                [0, None])
    plot_metric("GMS2=SBSP", right[2], "Number of Genes", [0, None],
                keep_legend=True)
    right[2].get_legend().remove()
    fig.align_ylabels(right)

    for axis in axes.ravel():
        axis.set_xlabel("Steps")

    axes[0][0].set_title(TOOL)
    axes[0][1].set_title(TOOLp)

    # leave room at the bottom for the shared legend
    fig.subplots_adjust(bottom=0.21)

    handles, labels = axes[2][1].get_legend_handles_labels()
    # the first legend entry is relabelled (assumes it is the hue title
    # entry emitted by the plotting backend -- TODO confirm)
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()

    # recompute the totals, now counting only rows that are also set in NCBI
    for _, df_group in df.groupby("GCFID", as_index=False):
        rows = df_group.index
        for name in count_columns:
            df.loc[rows, "Total " + name] = (
                (df_group[name]) & (df_group["NCBI"])).sum()

    df_all = get_summary_per_gcfid(df)

    # map column names for tables
    columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    table_columns = ["Genome", "Verified", "SBSP", "GMS2", "GMS2=SBSP"]

    for prefix, filename in (("Sen", "sensitivity.csv"),
                             ("Cov2", "coverage.csv")):
        renames = {
            "GCFID": "Genome",
            "NCBI": "Verified",
            prefix + "(SBSP,NCBI)": "SBSP",
            prefix + "(GMS2,NCBI)": "GMS2",
            prefix + "(GMS2=SBSP,NCBI)": "GMS2=SBSP",
        }
        df_table = df_all[columns].rename(columns=renames)
        df_table[table_columns].to_csv(os_join(env["pd-work"], filename),
                                       index=False)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot accuracy against chunk size for SBSP, GMS2 and (if present) MGM.

    Reads the CSV at ``args.pf_data``, divides its "chunk-size" column by
    1000 and draws, on one axes, one line per genome for each tool:
    SBSP lines dashed, GMS2 lines with the default style (with legend), and
    MGM lines dotted when the "Tool" column contains "MGM". The final figure
    is saved under ``next_name(env["pd-work"])`` and shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read
    :param args: parsed arguments; ``pf_data`` (path to the CSV) and
        ``with_mgm`` (draw the MGM region markers) are read
    """
    import matplotlib.pyplot as plt

    y_column = "percentage-common-3prime-and-5prime-from-common-3prime"

    df = pd.read_csv(args.pf_data)
    df["chunk-size"] /= 1000

    fig, ax = plt.subplots()

    sns.lineplot(df[df["Tool"] == "SBSP"], "chunk-size", y_column,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified"), "linestyle": "dashed"},
                 ax=ax,
                 legend=False,
                 figure_options=FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74, 101],
                     save_fig=next_name(env["pd-work"])
                 ))

    # at this point every line on the axes belongs to SBSP
    for line in ax.lines:
        line.set_linestyle("--")

    sns.lineplot(df[df["Tool"] == "GMS2"], "chunk-size", y_column,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified")},
                 legend_loc="best",
                 legend_ncol=2,
                 ax=ax)

    if args.with_mgm:
        # axvline/axhline take their span limits as axes fractions in [0, 1],
        # not data values: the vertical marker simply spans the full height,
        # and the horizontal segment (x = 5..49 in data units) is drawn with
        # hlines, which works in data coordinates.
        ax.axvline(50, color="grey", linestyle="dashed")
        ax.hlines(74, 5, 49, colors="grey", linestyles="dashed")
        ax.annotate("MGM", (5, 72))

    # remember how many lines exist so that only the lines added by the MGM
    # plot (if any) are restyled below
    num_lines_before_mgm = len(ax.lines)

    if "MGM" in set(df["Tool"]):
        sns.lineplot(df[df["Tool"] == "MGM"], "chunk-size", y_column,
                     hue="Genome",
                     sns_kwargs={"palette": CM.get_map("verified"), "linestyle": "-."},
                     ax=ax,
                     legend=False)

    for line in ax.lines[num_lines_before_mgm:]:
        line.set_linestyle(":")

    fo = FigureOptions(
        xlabel="Chunk size (mb)",
        ylabel="Accuracy",
        ylim=[74, 101],
        save_fig=next_name(env["pd-work"])
    )
    FigureOptions.set_properties_for_axis(ax, fo)
    plt.savefig(fo.save_fig)
    plt.show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot agreement statistics under three dependence conditions.

    Builds a table with ``compute_data`` for the conditions "Random",
    "Independent" and "Fully dependent", then produces, in order:

    1. the plot made by ``plot_sensitivities_vs_num_candidates``;
    2. "Probability" and "1 - Probability" against "Number of candidates"
       for the rows where both sensitivities equal 0.9;
    3. a two-panel figure of "Probability" (left: both sensitivities 0.9;
       right: equal sensitivities at 25 candidates);
    4. "Probability" against "Number of candidates" for the "Independent"
       condition, one line per sensitivity in 0.1 ... 0.9;
    5. the same two-panel figure as (3) for "Agree given prediction".

    Figures are saved under names produced by ``next_name(".")``.

    ``sen_a`` and ``sen_b`` are only forwarded to
    ``plot_sensitivities_vs_num_candidates``; the remaining plots select
    rows with the fixed sensitivity value 0.9.

    NOTE(review): rows are selected with exact float comparisons
    (``== 0.9``, ``isin({0.1, ...})``); whether these match depends on how
    ``compute_data`` generates the sensitivity values -- confirm.

    :param max_candidates: forwarded to ``compute_data`` and
        ``plot_sensitivities_vs_num_candidates``
    :param sen_a: sensitivity of the first algorithm (see above)
    :param sen_b: sensitivity of the second algorithm (see above)
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)

    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row selections shared by several plots. They are boolean masks, so they
    # stay valid after the "1 - Probability" column is added below.
    both_at_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sensitivities = df["Sensitivity A"] == df["Sensitivity B"]

    def plot_vs_candidates(y, ylabel):
        # Single-axes plot of `y` against the number of candidates for the
        # rows where both sensitivities are 0.9.
        sns.lineplot(
            df[both_at_09],
            "Number of candidates",
            y,
            hue="Condition",
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            legend_loc="best",
            figure_options=FigureOptions(
                save_fig=next_name("."),
                ylabel=ylabel,
            ))

    def plot_two_panels(figure, panel_axes, y, right_title):
        # Left: `y` vs. number of candidates at sensitivity 0.9.
        # Right: `y` vs. sensitivity at 25 candidates (equal sensitivities).
        sns.lineplot(
            df[both_at_09],
            "Number of candidates",
            y,
            hue="Condition",
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            ax=panel_axes[0],
            legend=False,
            figure_options=FigureOptions(title="Sensitivity = 0.9", ))

        sns.lineplot(
            df[equal_sensitivities & (df["Number of candidates"] == 25)],
            "Sensitivity A",
            y,
            hue="Condition",
            ax=panel_axes[1],
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            figure_options=FigureOptions(
                ylim=[0, 1.05],
                xlim=[0, 1],
                xlabel="Sensitivity",
                title=right_title,
            ))

        save_figure(FigureOptions(save_fig=next_name(".")), figure)
        plt.show()

    plot_vs_candidates("Probability", r"$P(y=s|x_1=y, x_2=y)$")

    # error
    df["1 - Probability"] = 1 - df["Probability"]
    plot_vs_candidates("1 - Probability", r"$P(y\neq s|x_1=y, x_2=y)$")

    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    plot_two_panels(fig, axes, "Probability", "Number of candidates = 25")

    # rename() returns a new frame; renaming the filtered selection in place
    # can trigger pandas' SettingWithCopyWarning
    df_tmp = df[equal_sensitivities
                & (df["Condition"] == "Independent")
                & (df["Sensitivity A"].isin(
                    {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))].rename(
                        columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        df_tmp,
        "Number of candidates",
        "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(
            title="Independent algorithms",
            save_fig=next_name(".")),
    )

    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    plot_two_panels(fig, axes, "Agree given prediction",
                    "Number of targets = 25")