def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Draw the RBS logo and spacer curve stored in a GMS2 model file.

    Left panel: the ``RBS_MAT`` matrix converted from probabilities to
    information (using the model's ``NON_MAT`` background) and rendered with
    logomaker. Right panel: the motif model's spacer values plotted against
    their position. The figure is saved to the path returned by
    ``next_name(pd_figures)`` and then shown.

    :param pd_figures: directory handed to ``next_name`` for the output file
    :param pf_mod: path to the GMS2 model file
    :param title: title of the logo panel
    """
    import matplotlib.pyplot as plt
    import logomaker as lm

    model = GMS2Mod.init_from_file(pf_mod)
    motif = MotifModel(model.items["RBS_MAT"], model.items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(model.items["NON_MAT"])

    _, panels = plt.subplots(1, 2)
    logo_ax = panels[0]
    spacer_ax = panels[1]

    # Logo of the motif in information units relative to the background.
    information = lm.transform_matrix(
        motif.pwm_to_df(),
        from_type="probability",
        to_type="information",
        background=noncoding.pwm_to_array(0),
    )
    lm.Logo(information, ax=logo_ax)
    logo_ax.set_title(title)
    logo_ax.set_ylim(0, 2)

    # Spacer values indexed by their position in the stored vector.
    spacer_values = motif._spacer
    spacer_table = pd.DataFrame({
        "Distance from start": range(len(spacer_values)),
        "Probability": spacer_values,
    })
    sns.lineplot(
        spacer_table,
        "Distance from start",
        "Probability",
        ax=spacer_ax,
        figure_options=FigureOptions(ylim=[0, 0.4]),
    )

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()
def plot_sensitivities_vs_num_candidates(sensitiviies_func, max_candidates,
                                         sen_a, sen_b):
    # type: (Dict[str, Callable], int, float, float) -> None
    """Plot each sensitivity function against the number of candidates.

    Every callable in ``sensitiviies_func`` is evaluated as
    ``func(i, sen_a, sen_b)`` for ``i`` in ``1..max_candidates``. The results
    are stacked into long form (one row per condition) and drawn as one line
    per condition.

    :param sensitiviies_func: mapping from condition name to callable
    :param max_candidates: largest number of candidates to evaluate (inclusive)
    :param sen_a: forwarded unchanged as the second argument of each callable
    :param sen_b: forwarded unchanged as the third argument of each callable
    """
    rows = []
    for num_candidates in range(1, max_candidates + 1):
        row = {"Number of candidates": num_candidates}
        for name, func in sensitiviies_func.items():
            row[name] = func(num_candidates, sen_a, sen_b)
        rows.append(row)

    wide = pd.DataFrame(rows)
    conditions = sorted(sensitiviies_func)

    # One "Probability" value per (number of candidates, condition) pair.
    stacked = stack_columns_as_rows(wide, conditions, "Probability",
                                    conditions, "Condition")
    sns.lineplot(stacked, "Number of candidates", "Probability",
                 hue="Condition")
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3' and 5'-3' match percentages against minimum support, per genome.

    For each row of ``df`` the ``by_support_<tag>`` cell is expanded into its
    own data frame and labelled with the row's genome. Rows for "A. pernix"
    and "Synechocystis" are left out. Two line plots are saved to paths
    produced by ``next_name(pd_work)``.

    :param df: one row per genome; needs "Genome" and "by_support_<tag>"
    :param pd_work: directory handed to ``next_name`` for the output files
    :param fn_prefix: not used by this function
    :param tag: tool tag used in the column names and plot titles
    """
    skipped_genomes = {"A. pernix", "Synechocystis"}
    support_column = "by_support_{}".format(tag)

    frames = []
    for row_label in df.index:
        per_support = pd.DataFrame(df.at[row_label, support_column])
        genome = df.at[row_label, "Genome"]
        per_support["Genome"] = genome
        if genome not in skipped_genomes:
            frames.append(per_support)

    combined = pd.concat(frames)

    found_options = FigureOptions(
        title="Percentage of verified genes predicted\nby {}".format(tag),
        ylabel="Percentage",
        save_fig=next_name(pd_work),
        ylim=[None, 100.5])
    sns.lineplot(combined,
                 "Min Support",
                 "Percentage 3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=found_options)

    correct_options = FigureOptions(
        title="Percentage of predicted {} genes\nwith correct 5' end".format(
            tag),
        ylabel="Percentage of 5p-3p match",
        save_fig=next_name(pd_work),
        ylim=[90, 100.5])
    sns.lineplot(combined,
                 "Min Support",
                 "Percentage 5p-3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=correct_options)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis and save its summary table and plots.

    The analysis runs directly, or through PBS when the parallelization
    option "use-pbs" is set. The resulting table is written to
    ``<pd-work>/summary.csv`` and a fixed series of figures is saved under
    ``<pd-work>/summary_figures``.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param args: parsed arguments; ``pf_genome_list`` names the genome list
    """
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        scheduler = PBS(env,
                        prl_options,
                        splitter=split_genome_info_list,
                        merger=merge_identity)
        partial_frames = scheduler.run(
            data={"gil": genome_list},
            func=relative_entropy_analysis,
            func_kwargs={
                "env": env,
                "prl_options": prl_options
            })
        df = pd.concat(partial_frames, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, genome_list, prl_options)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    def error_axis_options():
        # Fresh options per figure: each call reserves the next file name.
        return FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures))

    sns.scatterplot(df, "Percent", "Error",
                    figure_options=error_axis_options())

    for x_column in ("RE", "RE Motif", "RE Spacer"):
        sns.lineplot(df, x_column, "Error", hue="Genome",
                     figure_options=error_axis_options())

    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome", identity=True,
                    figure_options=FigureOptions(
                        save_fig=next_name(pd_figures)))

    for x_column in ("Percent", "RE", "RE Motif", "RE Spacer"):
        sns.lmplot(df, x_column, "Error", hue="Genome",
                   figure_options=error_axis_options())

    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def _add_query_percentages(df, sort_column):
    # type: (pd.DataFrame, str) -> None
    """Add per-ancestor query percentages (plain and cumulative) to ``df``.

    ``df`` is modified in place: its index is reset (the old index becomes a
    column) and two float columns are written. Within each "Ancestor" group,
    "Percentage-of-queries" is the row's share of the group's
    "Number-of-queries", and "Cumulative-percentage-of-queries" is the
    running sum of that share with rows ordered by ``sort_column``.
    """
    # Float initial values: the columns receive floats below, and an int
    # column would have to be upcast on assignment.
    df["Percentage-of-queries"] = 0.0
    df["Cumulative-percentage-of-queries"] = 0.0
    df.reset_index(inplace=True)

    for _, df_group in df.groupby("Ancestor", as_index=False):
        # Sort a copy instead of the groupby slice itself.
        ordered = df_group.sort_values(sort_column)
        counts = ordered["Number-of-queries"]
        percentages = 100 * counts / float(counts.sum())

        df.loc[ordered.index, "Percentage-of-queries"] = percentages
        # skipna=False keeps a missing value propagating through the sum.
        df.loc[ordered.index, "Cumulative-percentage-of-queries"] = \
            percentages.cumsum(skipna=False)


def one_dim_Kimura_accuracy(env, df_all, num_steps=20):
    # type: (Environment, pd.DataFrame, int) -> None
    """Plot accuracy and query counts against binned Kimura statistics.

    The data is binned twice with ``bin_data_one_d``: on "Average-Kimura"
    using all rows, and on "Std-Kimura" using only rows with "Support" > 2.
    For each binning four line plots (one line per ancestor) are saved to
    paths from ``next_name(env["pd-work"])``: accuracy, number of queries,
    percentage of queries and cumulative percentage of queries.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df_all: per-query data with the Kimura and "Support" columns
    :param num_steps: number of bins passed to ``bin_data_one_d``
    """
    import matplotlib.pyplot as plt

    pd_work = env["pd-work"]

    def plot_by_ancestor(data, x, y):
        sns.lineplot(data, x, y, hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work)),
                     sns_kwargs={"palette": CM.get_map("ancestor")})

    # --- average Kimura distance ---
    df = bin_data_one_d(env, df_all, "Average-Kimura", num_steps)
    plot_by_ancestor(df, "Average-Kimura", "Accuracy")
    plot_by_ancestor(df, "Average-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Average-Kimura")

    # This figure gets its own axes, explicit labels and is shown directly.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(df, "Average-Kimura", "Percentage-of-queries",
                 hue="Ancestor",
                 figure_options=FigureOptions(
                     save_fig=next_name(pd_work),
                     ylabel="Percentage of queries",
                     xlabel="Average Kimura"),
                 ax=ax,
                 show=True,
                 legend_loc="best",
                 sns_kwargs={"palette": CM.get_map("ancestor")})
    plot_by_ancestor(df, "Average-Kimura",
                     "Cumulative-percentage-of-queries")

    # --- standard deviation of Kimura distance (rows with Support > 2) ---
    df = bin_data_one_d(env, df_all[df_all["Support"] > 2], "Std-Kimura",
                        num_steps)
    plot_by_ancestor(df, "Std-Kimura", "Accuracy")
    plot_by_ancestor(df, "Std-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Std-Kimura")

    plot_by_ancestor(df, "Std-Kimura", "Percentage-of-queries")
    plot_by_ancestor(df, "Std-Kimura", "Cumulative-percentage-of-queries")
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for steps A, A+B, A+B+C and print tables.

    A 3x2 grid is drawn (left column: SBSP, right column: GMS2=SBSP) with
    rows "Sen(...)", "Cov(...)" and the raw count, one line per GCFID. The
    figure is saved to ``next_name(env["pd-work"])`` and shown. Afterwards
    the "Total ..." columns are recomputed against NCBI, two summary tables
    are printed, and the interpreter is terminated with ``sys.exit()``.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df: per-query data; "Total ..." columns are written in place
    """
    import sys

    import matplotlib.pyplot as plt

    pd_work = env['pd-work']

    # Per-genome totals for each prediction column.
    total_columns = (("SBSP", "Total SBSP"), ("GMS2", "Total GMS2"),
                     ("GMS2=SBSP", "Total GMS2=SBSP"))
    for _, members in df.groupby("GCFID", as_index=False):
        rows = members.index
        for source, target in total_columns:
            df.loc[rows, target] = df.loc[rows, source].sum()

    # Summaries for the cumulative steps "A", "A+B" and "A+B+C".
    steps = ["A", "B", "C"]
    step_summaries = []
    for position, step in enumerate(steps):
        summary = get_summary_per_gcfid(df[df["Predicted-at-step"] <= step])
        summary["SBSP Step"] = "+".join(steps[:position + 1])
        step_summaries.append(summary)
    df_per_gcfid_per_step = pd.concat(step_summaries, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    row_styles = (("Sensitivity", [85, 105]), ("Percent of Genes", [0, None]),
                  ("Number of Genes", [0, None]))
    columns_by_tool = (
        ("Sen(SBSP,NCBI)", "Cov(SBSP,NCBI)", "SBSP"),
        ("Sen(GMS2=SBSP,NCBI)", "Cov(GMS2=SBSP,NCBI)", "GMS2=SBSP"),
    )
    last_column = len(columns_by_tool) - 1
    last_row = len(row_styles) - 1

    for col_idx, y_columns in enumerate(columns_by_tool):
        column_axes = axes[:, col_idx]
        for row_idx, y_column in enumerate(y_columns):
            ylabel, ylim = row_styles[row_idx]
            # Only the bottom-right panel keeps its legend at draw time; its
            # handles feed the shared figure legend below.
            is_legend_panel = col_idx == last_column and row_idx == last_row
            legend_kwargs = {} if is_legend_panel else {"legend": False}
            sns.lineplot(df_per_gcfid_per_step, "SBSP Step", y_column,
                         hue="GCFID",
                         ax=column_axes[row_idx],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=list(ylim)),
                         **legend_kwargs)
        if col_idx == last_column:
            column_axes[last_row].get_legend().remove()
        fig.align_ylabels(column_axes)

    flat_axes = axes.ravel()
    for panel in flat_axes:
        panel.set_xlabel("Steps")
    axes[0][0].set_title("SBSP")
    axes[0][1].set_title("GMS2=SBSP")
    fig.subplots_adjust(bottom=0.21)

    # Shared legend below the grid, taken from the bottom-right panel.
    handles, labels = flat_axes[-1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(pd_work))
    plt.show()

    # Totals restricted to queries that also have an NCBI entry.
    for _, members in df.groupby("GCFID", as_index=False):
        rows = members.index
        in_ncbi = members["NCBI"]
        for source, target in total_columns:
            df.loc[rows, target] = ((members[source]) & (in_ncbi)).sum()

    df_all = get_summary_per_gcfid(df)
    sensitivity_columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)"
    ]
    coverage_columns = [
        "GCFID", "NCBI", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    print(df_all[sensitivity_columns].to_string(index=False))
    print(df_all[coverage_columns].to_string(index=False))

    # NOTE: ends the whole program once the tables are printed.
    sys.exit()
def viz_per_genome(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot BLASTp hit counts per clade and cumulative hit curves per genome.

    Figures produced, in order:
      1. category plot of the per-(genome, ancestor) mean "BLAST" value;
      2. percentage of a genome's queries with fewer than x hits, against x;
      3. the same data with the axes swapped;
      4. a 2x2 grid with one panel per ancestor, restricted to x <= 40.

    Thresholds x are 0, 5, then multiplied by 1.2 each step (40 values).
    Matplotlib rcParams are updated globally (serif font, usetex, sizes).

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df: per-query data with "Genome", "Ancestor" and "BLAST" columns
    """

    def font_settings(small, medium, bigger):
        # rcParams for LaTeX-rendered serif text at the given sizes.
        return {
            'font.family': 'serif',
            'text.usetex': True,
            'pgf.rcfonts': False,
            'font.size': small,  # default text size
            'axes.titlesize': small,  # axes title
            'axes.labelsize': medium,  # x and y labels
            'xtick.labelsize': small,  # tick labels
            'ytick.labelsize': small,  # tick labels
            'legend.fontsize': 12,  # legend
            'figure.titlesize': bigger,  # figure title
        }

    def ancestor_sns_kwargs():
        return {"ci": "sd", "palette": CM.get_map("ancestor")}

    # 1) mean number of hits per genome, grouped by clade
    df_mean = df.groupby(["Genome", "Ancestor"], as_index=False).mean()
    sns.catplot(df_mean, "Ancestor", "BLAST",
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"]),
                    xlabel="Clade",
                    ylabel="Number of BLASTp Hits"),
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # Hit-count thresholds: 0, 5, 6.0, 7.2, ... (40 values in total).
    thresholds = []
    threshold = 0
    for _ in range(40):
        thresholds.append(threshold)
        if threshold == 0:
            threshold = 5
        else:
            threshold *= 1.2

    # Per genome: percentage of queries with fewer hits than each threshold.
    curve_points = []
    for _, genome_rows in df.groupby("Genome", as_index=False):
        first_label = genome_rows.index[0]
        genome = df.at[first_label, "Genome"]
        ancestor = df.at[first_label, "Ancestor"]
        num_queries = len(genome_rows)

        for limit in thresholds:
            below = genome_rows[genome_rows["BLAST"] < limit]
            curve_points.append({
                "Genome": genome,
                "Ancestor": ancestor,
                "x": limit,
                "y": 100 * len(below) / num_queries
            })

    df_tmp = pd.DataFrame(curve_points)

    matplotlib.rcParams.update(font_settings(16, 22, 24))

    # 2) cumulative percentage against number of hits
    sns.lineplot(df_tmp, "x", "y", hue="Ancestor",
                 figure_options=FigureOptions(
                     xlabel="Number of BLASTp hits",
                     ylabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs=ancestor_sns_kwargs())

    # 3) same data, axes swapped
    sns.lineplot(df_tmp, "y", "x", hue="Ancestor",
                 figure_options=FigureOptions(
                     ylabel="Number of BLASTp hits",
                     xlabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs=ancestor_sns_kwargs())

    matplotlib.rcParams.update(font_settings(14, 18, 20))

    # 4) one panel per ancestor; zip stops at four panels
    fig, axes = plt.subplots(2, 2, sharex="all", sharey="all")
    for anc, ax in zip(sorted(set(df["Ancestor"])), axes.ravel()):
        df_anc = df_tmp[df_tmp["Ancestor"] == anc]
        sns.lineplot(df_anc[df_anc["x"] <= 40], "x", "y", hue="Ancestor",
                     legend=None,
                     ax=ax,
                     sns_kwargs=ancestor_sns_kwargs())
        ax.set_title(anc)
        ax.set_xlabel("")
        ax.set_ylabel("")

    figure_options = FigureOptions(
        xlabel="Number of BLASTp hits",
        ylabel="Cumulative percentage of\nqueries (per genome)",
        save_fig=next_name(env["pd-work"]),
    )

    # Invisible axes spanning the whole figure carry the shared axis labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(figure_options.xlabel, labelpad=30)
    plt.ylabel(figure_options.ylabel, labelpad=30)

    fig.savefig(next_name(env["pd-work"]), bbox_inches="tight")
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and StartLink+ RBS motifs for every genome in a list.

    For each genome a two-panel logo figure (GeneMarkS-2 / StartLink+) is
    saved under ``<pd-work>/figures``, and the relative entropy of both
    motifs against the GMS2 non-coding model plus the genome GC percentage
    are recorded. Depending on ``args.verified`` the collected records are
    written to CSV and plotted as line plots (verified) or scatter plots
    (otherwise, once for all genomes and once for rows with "Accuracy" < 2).

    :param env: environment; uses ``env["pd-work"]`` and ``env["pd-data"]``
    :param args: parsed arguments; uses ``pf_genome_list`` and ``verified``
    """
    genome_list = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    records = []
    for gi in tqdm(genome_list, total=len(genome_list)):
        # Models produced by GMS2 and by the StartLink+ variant.
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        pwm_gms2 = mm_gms2.pwm_to_df()
        pwm_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all",
                                 figsize=(8, 4))
        panels = ((axes[0], pwm_gms2, "GeneMarkS-2"),
                  (axes[1], pwm_toolp, "StartLink+"))
        for ax, pwm, panel_title in panels:
            info_matrix = lm.transform_matrix(pwm,
                                              from_type="probability",
                                              to_type="information")
            lm.Logo(info_matrix, color_scheme="classic", ax=ax)
            ax.set_ylim(0, 2)
            ax.set_title(panel_title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc_percent = 100 * compute_gc_from_file(
            os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if args.verified:
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(
                env, gi, group=group)
            records.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc_percent
            })
            records.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc_percent
            })
        else:
            records.append({
                "GC": gc_percent,
                "Accuracy": 100 -
                compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })

        print(records[-2:])

    import sbsp_viz.sns as sns

    pd_work = env["pd-work"]

    if args.verified:
        df = pd.DataFrame(records)
        df.to_csv(next_name(pd_work, ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work),
                         xlabel="Genome",
                         ylabel="Error"))
        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))
        return

    # All genomes.
    df = pd.DataFrame(records)
    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(pd_work),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))
    df.to_csv(next_name(pd_work, ext="csv"))
    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                    figure_options=FigureOptions(
                        save_fig=next_name(pd_work)))
    print("Average Error: {}".format(df["Accuracy"].mean()))

    # Only genomes whose "Accuracy" value is below 2.
    df = pd.DataFrame(records)
    df = df[df["Accuracy"] < 2].copy()
    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(pd_work),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))
    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                    figure_options=FigureOptions(
                        save_fig=next_name(pd_work)))
    print("Average Error: {}".format(df["Accuracy"].mean()))
    df.to_csv(next_name(pd_work, ext="csv"))
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for steps A, A+B, A+B+C and export tables.

    A 3x2 grid is drawn (left column titled ``TOOL``, right column titled
    ``TOOLp``) with rows "Sen(...)", "Cov(...)" and the raw count, one line
    per GCFID. The figure is saved to ``next_name(env["pd-work"])`` and
    shown. Afterwards the "Total ..." columns are recomputed against NCBI
    and two tables are written to ``sensitivity.csv`` and ``coverage.csv``
    in ``env["pd-work"]``.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param df: per-query data; "Total ..." columns are written in place
    """
    import matplotlib.pyplot as plt

    total_columns = (("SBSP", "Total SBSP"), ("GMS2", "Total GMS2"),
                     ("GMS2=SBSP", "Total GMS2=SBSP"))

    # Total number of predictions per tool, per genome.
    for _, df_group in df.groupby("GCFID", as_index=False):
        rows = df_group.index
        for source, target in total_columns:
            df.loc[rows, target] = df.loc[rows, source].sum()

    # Collect summaries for the cumulative steps A, A+B and A+B+C.
    list_df = list()  # type: List[pd.DataFrame]
    tag = None
    for step in ["A", "B", "C"]:
        tag = step if tag is None else tag + "+" + step

        df_summary_per_gcfid = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        df_summary_per_gcfid["SBSP Step"] = tag
        list_df.append(df_summary_per_gcfid)

    df_per_gcfid_per_step = pd.concat(list_df, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # (column to plot, y-axis label, y-limits) for each panel, per column.
    # The label with the LaTeX-escaped percent sign is a raw string: "\%" is
    # not a valid Python escape and warns on recent interpreters.
    # NOTE(review): the top row plots "Sen(...)" columns under an "Error"
    # label - confirm what get_summary_per_gcfid stores in them.
    panel_specs = (
        (
            ("Sen(SBSP,NCBI)", r"Error rate (\%)", [0, 20]),
            ("Cov(SBSP,NCBI)", "Percentage\nof Genes", [0, None]),
            ("SBSP", "Number\nof Genes", [0, None]),
        ),
        (
            ("Sen(GMS2=SBSP,NCBI)", "Error", [0, None]),
            ("Cov(GMS2=SBSP,NCBI)", "Percentage of Genes", [0, None]),
            ("GMS2=SBSP", "Number of Genes", [0, None]),
        ),
    )

    for col_idx, specs in enumerate(panel_specs):
        ax = axes[:, col_idx]
        for row_idx, (y_column, ylabel, ylim) in enumerate(specs):
            # Only the bottom-right panel is drawn with a legend; its handles
            # are reused for the figure-level legend below.
            is_legend_panel = (col_idx == 1 and row_idx == 2)
            legend_kwargs = {} if is_legend_panel else {"legend": False}
            sns.lineplot(df_per_gcfid_per_step, "SBSP Step", y_column,
                         hue="GCFID",
                         ax=ax[row_idx],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=ylim),
                         **legend_kwargs)
        if col_idx == 1:
            ax[2].get_legend().remove()
        fig.align_ylabels(ax)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title(TOOL)
    axes[0][1].set_title(TOOLp)

    fig.subplots_adjust(bottom=0.21)

    # Shared legend below the grid, built from the bottom-right panel.
    handles, labels = axes.ravel()[-1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()

    # Totals restricted to queries that also have an NCBI entry.
    for _, df_group in df.groupby("GCFID", as_index=False):
        rows = df_group.index
        for source, target in total_columns:
            df.loc[rows, target] = ((df_group[source]) &
                                    (df_group["NCBI"])).sum()

    df_all = get_summary_per_gcfid(df)

    columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    table_columns = ["Genome", "Verified", "SBSP", "GMS2", "GMS2=SBSP"]

    # Sensitivity table: the "Sen(...)" columns under short tool names.
    df_sen = df_all.copy()[columns].rename(columns={
        "GCFID": "Genome",
        "NCBI": "Verified",
        "Sen(SBSP,NCBI)": "SBSP",
        "Sen(GMS2,NCBI)": "GMS2",
        "Sen(GMS2=SBSP,NCBI)": "GMS2=SBSP",
    }, inplace=False)
    df_sen[table_columns].to_csv(os_join(env["pd-work"], "sensitivity.csv"),
                                 index=False)

    # Coverage table: the "Cov2(...)" columns under the same short names.
    df_cov = df_all[columns].rename(columns={
        "GCFID": "Genome",
        "NCBI": "Verified",
        "Cov2(SBSP,NCBI)": "SBSP",
        "Cov2(GMS2,NCBI)": "GMS2",
        "Cov2(GMS2=SBSP,NCBI)": "GMS2=SBSP",
    }, inplace=False)
    df_cov[table_columns].to_csv(os_join(env["pd-work"], "coverage.csv"),
                                 index=False)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot 5'-end agreement against chunk size for SBSP, GMS2 and MGM.

    Reads the CSV at ``args.pf_data``, divides "chunk-size" by 1000 and draws
    one line per genome for each tool on a shared axis: SBSP lines are
    restyled to "--", GMS2 lines keep the default style, and (when the data
    contains MGM rows) the last five lines are restyled to ":". With
    ``args.with_mgm`` grey guide lines and an "MGM" annotation are added.
    The figure is saved to ``next_name(env["pd-work"])`` and shown.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param args: parsed arguments; uses ``pf_data`` and ``with_mgm``
    """
    import matplotlib.pyplot as plt

    y_column = "percentage-common-3prime-and-5prime-from-common-3prime"

    df = pd.read_csv(args.pf_data)
    df["chunk-size"] /= 1000

    fig, ax = plt.subplots()

    # SBSP: drawn first, then every line so far is switched to dashes.
    sbsp_rows = df[df["Tool"] == "SBSP"]
    sns.lineplot(sbsp_rows, "chunk-size", y_column, hue="Genome",
                 sns_kwargs={
                     "palette": CM.get_map("verified"),
                     "linestyle": "dashed"
                 },
                 ax=ax,
                 legend=False,
                 figure_options=FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74, 101],
                     save_fig=next_name(env["pd-work"])))
    for line in ax.lines:
        line.set_linestyle("--")

    # GMS2: supplies the legend.
    gms2_rows = df[df["Tool"] == "GMS2"]
    sns.lineplot(gms2_rows, "chunk-size", y_column, hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified")},
                 legend_loc="best",
                 legend_ncol=2,
                 ax=ax)

    if args.with_mgm:
        # NOTE(review): axvline/axhline take their span arguments as axes
        # fractions, not data coordinates - confirm these values are intended.
        y_max = ax.get_ylim()[1]
        ax.axvline(50, 0, y_max, color="grey", linestyle="dashed")
        ax.axhline(74, 5, 49, color="grey", linestyle="dashed")
        ax.annotate("MGM", (5, 72))

    if "MGM" in set(df["Tool"]):
        mgm_rows = df[df["Tool"] == "MGM"]
        sns.lineplot(mgm_rows, "chunk-size", y_column, hue="Genome",
                     sns_kwargs={
                         "palette": CM.get_map("verified"),
                         "linestyle": "-."
                     },
                     ax=ax,
                     legend=False)
        # The five most recently added lines are taken to be the MGM ones.
        for line in ax.lines[len(ax.lines) - 5:]:
            line.set_linestyle(":")

    fo = FigureOptions(xlabel="Chunk size (mb)",
                       ylabel="Accuracy",
                       ylim=[74, 101],
                       save_fig=next_name(env["pd-work"]))
    FigureOptions.set_properties_for_axis(ax, fo)
    plt.savefig(fo.save_fig)
    plt.show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot agreement probabilities under three dependence assumptions.

    Data for the conditions "Random", "Independent" and "Fully dependent" is
    produced by ``compute_data``. Figures, in order:
      1. sensitivities against number of candidates (via
         ``plot_sensitivities_vs_num_candidates``);
      2. "Probability" against number of candidates at sensitivity 0.9;
      3. "1 - Probability" for the same subset;
      4. two panels for "Probability": sensitivity 0.9, and 25 candidates
         with equal sensitivities;
      5. "Probability" for the "Independent" condition, one line per
         sensitivity in 0.1 .. 0.9;
      6. the two-panel figure again for "Agree given prediction".
    Figures with a file name are saved in the current directory via
    ``next_name(".")``.

    :param max_candidates: largest number of candidates to evaluate
    :param sen_a: forwarded to ``plot_sensitivities_vs_num_candidates``
    :param sen_b: forwarded to ``plot_sensitivities_vs_num_candidates``
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)
    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row selections reused by several figures. They are boolean masks, so
    # columns added to df later are still picked up when they are applied.
    both_at_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sensitivity = df["Sensitivity A"] == df["Sensitivity B"]
    equal_at_25 = equal_sensitivity & (df["Number of candidates"] == 25)

    def plot_vs_candidates(y_column, ylabel):
        sns.lineplot(
            df[both_at_09], "Number of candidates", y_column,
            hue="Condition",
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            legend_loc="best",
            figure_options=FigureOptions(
                save_fig=next_name("."),
                ylabel=ylabel,
            ))

    def plot_two_panels(y_column, right_title):
        # Left: fixed sensitivity 0.9; right: fixed number of candidates.
        fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
        sns.lineplot(
            df[both_at_09], "Number of candidates", y_column,
            hue="Condition",
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            ax=axes[0],
            legend=False,
            figure_options=FigureOptions(title="Sensitivity = 0.9", ))
        sns.lineplot(
            df[equal_at_25], "Sensitivity A", y_column,
            hue="Condition",
            ax=axes[1],
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            figure_options=FigureOptions(
                ylim=[0, 1.05],
                xlim=[0, 1],
                xlabel="Sensitivity",
                title=right_title,
            ))
        save_figure(FigureOptions(save_fig=next_name(".")), fig)
        plt.show()

    plot_vs_candidates("Probability", r"$P(y=s|x_1=y, x_2=y)$")

    # Complementary (error) probability.
    df["1 - Probability"] = 1 - df["Probability"]
    plot_vs_candidates("1 - Probability", r"$P(y\neq s|x_1=y, x_2=y)$")

    plot_two_panels("Probability", "Number of candidates = 25")

    # Independent condition only, one line per sensitivity value. rename()
    # returns a new frame, so the filtered slice is never modified in place.
    independent_rows = df[equal_sensitivity &
                          (df["Condition"] == "Independent") &
                          (df["Sensitivity A"].isin(
                              {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))]
    df_tmp = independent_rows.rename(
        columns={"Sensitivity A": "Sensitivity"})
    sns.lineplot(
        df_tmp, "Number of candidates", "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(
            title="Independent algorithms",
            save_fig=next_name(".")),
    )

    plot_two_panels("Agree given prediction", "Number of targets = 25")