def print_csvs(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    """Write one accuracy table per tool to CSV, then run the follow-up analyses.

    The "Genome" column of ``df`` is rewritten in place with ``short_name``
    applied to every entry, so the caller's DataFrame is modified.

    :param env: environment mapping; only ``env["pd-work"]`` is read here.
    :param df: DataFrame holding the per-tool columns selected below.
    :param kwargs: optional ``fn_prefix`` (defaults to ""), forwarded to the
        ``analyze_by_*`` functions.
    """
    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)
    pd_work = env["pd-work"]

    df["Genome"] = df["Genome"].apply(short_name)
    print(df.columns)

    # Leading columns differ per table (note the order for GMS2=SBSP); the
    # four metric columns follow the same naming pattern for every tool.
    tables = [
        ("NCBI", ["Genome", "NCBI", "Verified"]),
        ("GMS2", ["Genome", "GMS2", "Verified"]),
        ("SBSP", ["Genome", "SBSP", "Verified"]),
        ("GMS2=SBSP", ["Genome", "Verified", "GMS2=SBSP"]),
    ]
    metric_templates = [
        "Number 3p match: Verified from {}",
        "Percentage 3p match: Verified from {}",
        "Number 5p-3p match: Verified from {}",
        "Percentage 5p-3p match: Verified from {}",
    ]
    for tool, leading_columns in tables:
        selected = leading_columns + [
            template.format(tool) for template in metric_templates
        ]
        df_to_pf_csv(df[selected], next_name(pd_work, ext="csv"))

    # by support first, then by step group; each for SBSP and GMS2=SBSP
    for analyze in (analyze_by_support, analyze_by_step_group):
        for tag in ("SBSP", "GMS2=SBSP"):
            analyze(df, pd_work, fn_prefix, tag)
def analyze_kimura_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Derive per-row Kimura statistics and produce the Kimura plots.

    Rows whose "Kimura-to-query" entry is the string "[]" are dropped; the
    remaining entries are parsed with ``ast.literal_eval`` and summarised as
    "Average-Kimura", "Std-Kimura", "Min-Kimura" and "Max-Kimura". The work
    is done on a copy, so the caller's DataFrame is left untouched.

    :param env: environment mapping; ``env["pd-work"]`` is read here and
        ``env`` is forwarded to the downstream plotting functions.
    :param df: DataFrame with at least "Kimura-to-query", "Genome GC",
        "Ancestor" and "GCFID" columns.
    """
    pd_work = env["pd-work"]

    df = df[df["Kimura-to-query"] != "[]"].copy()
    df["Kimura-to-query"] = df["Kimura-to-query"].apply(ast.literal_eval)
    df["Average-Kimura"] = df["Kimura-to-query"].apply(np.mean)
    df["Std-Kimura"] = df["Kimura-to-query"].apply(np.std)

    sns.lmplot(df,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # "Kimura-to-query" holds Python lists at this point; restrict the
    # aggregation to numeric columns so the mean does not raise on pandas
    # versions that no longer drop non-numeric columns silently.
    df_mean = df.groupby(["Ancestor", "GCFID"],
                         as_index=False).mean(numeric_only=True)
    sns.lmplot(df_mean,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": True,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # Min/max kimura
    df["Min-Kimura"] = df["Kimura-to-query"].apply(min)
    df["Max-Kimura"] = df["Kimura-to-query"].apply(max)

    contour_kimura_per_ancestor(env, df)
    one_dim_Kimura_accuracy(env, df)
    kimura_dist_plot(env, df)
    heat_map_Kimura_accuracy(env,
                             df,
                             "Min-Kimura",
                             "Max-Kimura",
                             balance=True,
                             xlabel="Minimum Kimura",
                             ylabel="Maximum Kimura")
    heat_map_Kimura_accuracy(env,
                             df,
                             "Average-Kimura",
                             "Std-Kimura",
                             balance=False)
def plot_fix_min_move_max(df):
    # type: (pd.DataFrame) -> None
    """Plot Sensitivity and Coverage against "Max" for rows with Min == 0.1.

    One panel is drawn per genome on a two-row grid. The figure is saved to
    a name obtained from ``next_name(my_env["pd-work"])`` and then shown;
    ``my_env`` is a free variable that must exist in the enclosing scope.

    :param df: DataFrame with "Genome", "Min", "Max", "Sensitivity" and
        "Coverage" columns. Genome names must contain at least two
        whitespace-separated words (the second is used in the panel title).
    """
    genomes = sorted(set(df["Genome"]))
    num_cols = math.ceil(len(genomes) / 2)

    df = df[df["Min"] == 0.1]
    df = df.sort_values("Max", axis=0)

    fig, axes = plt.subplots(2, num_cols, sharex="all", sharey="all")
    axes = axes.ravel()

    for i, name in enumerate(genomes):
        ax = axes[i]
        df_tmp = df[df["Genome"] == name]

        ax.plot(df_tmp["Max"], df_tmp["Sensitivity"], label="Sensitivity")
        ax.plot(df_tmp["Max"], df_tmp["Coverage"], label="Coverage")
        ax.set_title(r"\textit{{{}. {}}}".format(name[0],
                                                name.split()[1]),
                     style="italic")
        ax.set_ylim([49, 101])

        # axes are raveled row-major: label the left column and the bottom
        # row based on the actual number of columns, not a fixed 2x2 grid.
        if i % num_cols == 0:
            ax.set_ylabel("Percentage")
        if i >= num_cols:
            ax.set_xlabel("Maximum Kimura")

    plt.subplots_adjust(bottom=0.17)
    fig.legend(loc="lower center", labels=["Accuracy", "Coverage"], ncol=2)
    fig.suptitle("StartLink Performance for Kimura thresholds [0.1, x]")
    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Draw the RBS motif logo and spacer distribution from a GMS2 model file.

    The left panel shows the RBS motif as an information logo computed
    against the background taken from the model's "NON_MAT" entry; the right
    panel shows the spacer probabilities. The figure is saved under a name
    from ``next_name(pd_figures)`` and then shown.

    :param pd_figures: directory passed to ``next_name`` for the output file.
    :param pf_mod: path of the GMS2 model file to load.
    :param title: title for the logo panel.
    """
    import logomaker as lm
    import matplotlib.pyplot as plt

    model = GMS2Mod.init_from_file(pf_mod)
    motif = MotifModel(model.items["RBS_MAT"], model.items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(model.items["NON_MAT"])

    fig, (logo_ax, spacer_ax) = plt.subplots(1, 2)

    # left panel: information-content logo of the motif
    pwm = motif.pwm_to_df()
    background = noncoding.pwm_to_array(0)
    information = lm.transform_matrix(pwm,
                                      from_type="probability",
                                      to_type="information",
                                      background=background)
    lm.Logo(information, ax=logo_ax)
    logo_ax.set_title(title)
    logo_ax.set_ylim(0, 2)

    # right panel: spacer probability per distance from start
    spacer = motif._spacer
    df_spacer = pd.DataFrame({
        "Distance from start": range(len(spacer)),
        "Probability": spacer
    })
    sns.lineplot(df_spacer,
                 "Distance from start",
                 "Probability",
                 ax=spacer_ax,
                 figure_options=FigureOptions(ylim=[0, 0.4]))

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()
def kimura_dist_plot(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Overlay the density of "Average-Kimura" for every ancestor on one axis.

    Ancestors are processed in sorted order so that the drawing order and
    the legend are the same on every run. The figure is saved through
    ``save_figure`` under a name from ``next_name(env["pd-work"])`` and shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read.
    :param df: DataFrame with "Ancestor" and "Average-Kimura" columns.
    """
    import seaborn
    import matplotlib.pyplot as plt

    # sorted (not bare set order): iteration order of a set of strings can
    # change between interpreter runs, which would reshuffle the legend.
    ancestors = sorted(set(df["Ancestor"]))
    palette = CM.get_map("ancestor")

    fig, ax = plt.subplots()  # type: plt.Figure, plt.Axes
    for anc in ancestors:
        df_group = df[df["Ancestor"] == anc]
        seaborn.distplot(df_group["Average-Kimura"],
                         ax=ax,
                         color=palette[anc],
                         hist=False,
                         label=anc)

    ax.legend(ancestors)
    ax.set_ylabel("PDF")
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def analyze_gms2_components_on_verified_set(env, gil):
    # type: (Environment, GenomeInfoList) -> None
    """Run the per-genome component analysis and plot error per component.

    The per-genome results are concatenated, the "Genome" column is replaced
    by ``fix_names`` applied row-wise, the table is printed as CSV, and a
    bar plot of "Error" per genome (hue: "Component") is saved under a name
    from ``next_name(env["pd-work"])``.

    :param env: environment mapping; ``env["pd-work"]`` is read here and
        ``env`` is forwarded to the per-genome analysis.
    :param gil: iterable of genome infos to analyse.
    """
    # run different components
    list_df = [
        analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil
    ]

    df = pd.concat(list_df, ignore_index=True, sort=False)
    df["Genome"] = df.apply(fix_names, axis=1)
    print(df.to_csv())

    # Pass a concrete list rather than a one-shot ``reversed`` iterator: an
    # iterator would be empty if the plotting code walks it a second time.
    components = [
        "GMS2", "MGM2*", "Start Context", "RBS", "Start Codons", "Promoter",
        "MGM"
    ]
    hue_order = components[::-1]

    _, ax = plt.subplots(figsize=(12, 4))
    sns.barplot(df,
                "Genome",
                "Error",
                hue="Component",
                ax=ax,
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"])),
                sns_kwargs={
                    "hue_order": hue_order,
                    "palette": CM.get_map("gms2_components")
                })
def sbsp_geom_density(df, x, y, pd_work, title=""):
    """Save and print a filled density plot of ``x`` split by ``y``.

    :param df: data passed to ``ggplot``.
    :param x: column mapped to the x aesthetic (also used as the x label).
    :param y: column mapped to both the color and fill aesthetics.
    :param pd_work: directory passed to ``next_name`` for the output file.
    :param title: plot title.
    """
    plot = ggplot(df, aes(x, color=y, fill=y))

    # components are added left to right, in the same order as a single
    # chained ``+`` expression would apply them
    components = [
        xlab(x),
        ylab("Fraction"),
        geom_density(position="fill", alpha=0.5),
        theme(subplots_adjust={"top": 0.9}),
        theme(legend_position=(.8, 0.95), legend_direction='horizontal'),
        ggtitle(title),
    ]
    for component in components:
        plot = plot + component

    plot.save(next_name(pd_work))
    print(plot)
def analyze_by_step_group(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages per step group for one tool.

    Each row of ``df`` carries, in column ``by_step_group_<tag>``, data from
    which a per-genome table is built. Rows for the genomes "A. pernix" and
    "Synechocystis" are left out. Two point plots are saved (names from
    ``next_name(pd_work)``) and the combined table is printed.

    :param df: DataFrame with "Genome" and ``by_step_group_<tag>`` columns.
    :param pd_work: directory passed to ``next_name`` for the output files.
    :param fn_prefix: accepted for signature parity; not used here.
    :param tag: tool tag used in the column names (e.g. "SBSP").
    """
    source_column = "by_step_group_{}".format(tag)
    skipped_genomes = {"A. pernix", "Synechocystis"}

    frames = []
    for row_label in df.index:
        frame = pd.DataFrame(df.at[row_label, source_column])
        genome = df.at[row_label, "Genome"]
        frame["Genome"] = genome
        if genome not in skipped_genomes:
            frames.append(frame)

    df_acc = pd.concat(frames)

    # (match kind, lower y limit) for the two plots, in output order
    for match_kind, y_low in (("3p", None), ("5p-3p", 90)):
        options = FigureOptions(
            title="Percentage {} match versus minimum support".format(
                match_kind),
            ylabel="Percentage of {} match".format(match_kind),
            save_fig=next_name(pd_work),
            ylim=[y_low, 100.5])
        sns.catplot(
            df_acc,
            "Step Group",
            "Percentage {} match: Verified from {}".format(match_kind, tag),
            hue="Genome",
            kind="point",
            figure_options=options,
        )

    print(df_acc.to_string())
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages against minimum support for one tool.

    Each row of ``df`` carries, in column ``by_support_<tag>``, data from
    which a per-genome table is built. Rows for the genomes "A. pernix" and
    "Synechocystis" are left out. Two line plots are saved under names from
    ``next_name(pd_work)``.

    :param df: DataFrame with "Genome" and ``by_support_<tag>`` columns.
    :param pd_work: directory passed to ``next_name`` for the output files.
    :param fn_prefix: accepted for signature parity; not used here.
    :param tag: tool tag used in the column names (e.g. "SBSP").
    """
    source_column = "by_support_{}".format(tag)
    skipped_genomes = {"A. pernix", "Synechocystis"}

    frames = []
    for row_label in df.index:
        frame = pd.DataFrame(df.at[row_label, source_column])
        genome = df.at[row_label, "Genome"]
        frame["Genome"] = genome
        if genome not in skipped_genomes:
            frames.append(frame)

    df_acc = pd.concat(frames)

    # 3p match: share of verified genes predicted by the tool
    options_3p = FigureOptions(
        title="Percentage of verified genes predicted\nby {}".format(tag),
        ylabel="Percentage",
        save_fig=next_name(pd_work),
        ylim=[None, 100.5])
    sns.lineplot(df_acc,
                 "Min Support",
                 "Percentage 3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=options_3p)

    # 5p-3p match: share of predicted genes with the correct 5' end
    options_5p3p = FigureOptions(
        title="Percentage of predicted {} genes\nwith correct 5' end".format(
            tag),
        ylabel="Percentage of 5p-3p match",
        save_fig=next_name(pd_work),
        ylim=[90, 100.5])
    sns.lineplot(df_acc,
                 "Min Support",
                 "Percentage 5p-3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=options_5p3p)
def plot_move_consecutive_blocks(df):
    # type: (pd.DataFrame) -> None
    """Plot Sensitivity and Coverage for consecutive (Min, Max) Kimura blocks.

    Only rows whose ("Min", "Max") pair is two adjacent values in the sorted
    set of all Min/Max values are kept. One panel is drawn per genome on a
    two-row grid, with the block midpoint on the x axis. The figure is saved
    to a name from ``next_name(my_env["pd-work"])`` and shown; ``my_env`` is
    a free variable that must exist in the enclosing scope.

    :param df: DataFrame with "Genome", "Min", "Max", "Sensitivity" and
        "Coverage" columns. Genome names must contain at least two
        whitespace-separated words (the second is used in the panel title).
    """
    genomes = sorted(set(df["Genome"]))
    num_cols = math.ceil(len(genomes) / 2)

    fig, axes = plt.subplots(2, num_cols, sharex="all", sharey="all")
    axes = axes.ravel()

    # The consecutive-block selection does not depend on the genome, so it
    # is computed once here instead of once per panel.
    all_kimura_values = sorted(set(df["Max"]).union(set(df["Min"])))
    df_blocks = pd.concat(
        [
            df[(df["Min"] == low) & (df["Max"] == high)]
            for low, high in zip(all_kimura_values, all_kimura_values[1:])
        ],
        sort=False)
    df_blocks["Average"] = (df_blocks["Max"] + df_blocks["Min"]) / 2.0

    for i, name in enumerate(genomes):
        ax = axes[i]
        df_tmp = df_blocks[df_blocks["Genome"] == name]

        ax.plot(df_tmp["Average"], df_tmp["Sensitivity"], label="Sensitivity")
        ax.plot(df_tmp["Average"], df_tmp["Coverage"], label="Coverage")
        ax.set_title(r"\textit{{{}. {}}}".format(name[0],
                                                name.split()[1]),
                     style="italic")
        ax.set_ylim([49, 101])

        # axes are raveled row-major: label the left column and the bottom
        # row based on the actual number of columns, not a fixed 2x2 grid.
        if i % num_cols == 0:
            ax.set_ylabel("Percentage")
        if i >= num_cols:
            ax.set_xlabel("Average Kimura")

    plt.subplots_adjust(bottom=0.17)
    fig.suptitle("StartLink Performance for small blocks of Kimura")
    fig.legend(loc="lower center", labels=["Accuracy", "Coverage"], ncol=2)
    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
def contour_kimura_per_ancestor(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one Min-Kimura vs Max-Kimura density panel per ancestor.

    Panels are laid out on a two-row grid with shared axes, in sorted
    ancestor order. A frameless full-figure axis carries the common x/y
    labels. The figure is saved through ``save_figure`` under a name from
    ``next_name(env["pd-work"])`` and shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read.
    :param df: DataFrame with "Ancestor", "Min-Kimura" and "Max-Kimura".
    """
    import matplotlib.pyplot as plt
    import seaborn

    ancestors = sorted(set(df["Ancestor"]))
    num_cols = math.ceil(len(ancestors) / 2)
    fig, axes = plt.subplots(2,
                             num_cols,
                             sharex=True,
                             sharey=True,
                             figsize=(6, 6))

    for ancestor, panel in zip(ancestors, axes.ravel()):
        rows = df[df["Ancestor"] == ancestor]
        min_kimura = rows["Min-Kimura"].values
        max_kimura = rows["Max-Kimura"].values
        seaborn.kdeplot(min_kimura, max_kimura, ax=panel)
        panel.set_title(ancestor)

    # invisible axis spanning the whole figure, used only for shared labels
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(which="both",
                    top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def visualize(mgm_mm, title="", **kwargs):
    # type: (MGMMotifModelV2, str, Dict[str, Any]) -> None
    """Render one row of panels per shift plus a summary row, then save and show.

    Each shift row holds a logo panel followed by four panels filled either
    from the model or, when ``raw_motif_data`` is given, from
    ``raw_motif_data[shift]``. The last row holds the MSA text (only when
    ``msa_t`` is given), the prior and the spacer panels. The figure is
    saved under a name from ``next_name(".")``.

    :param mgm_mm: motif model to draw.
    :param title: text inserted into the figure title.
    :param kwargs: optional ``msa_t`` and ``raw_motif_data`` (both default
        to None).
    """
    msa_t = get_value(kwargs, "msa_t", None)
    raw_motif_data = get_value(kwargs, "raw_motif_data", None)

    num_shifts = len(mgm_mm._shift_prior.keys())
    grid = (num_shifts + 1, 5)
    plt.figure(figsize=(14, 4 * num_shifts))

    # one row per shift: logo in column 0, four panels in columns 1-4
    for shift in range(num_shifts):
        logo_panel = plt.subplot2grid(grid, (shift, 0))
        motif_panels = [
            plt.subplot2grid(grid, (shift, column)) for column in range(1, 5)
        ]

        MGMMotifModelVisualizerV2._viz_logo(mgm_mm, logo_panel, shift)
        if raw_motif_data is not None:
            MGMMotifModelVisualizerV2._viz_motif_pwm_from_raw_data(
                raw_motif_data[shift], motif_panels, mgm_mm.motif_width())
        else:
            MGMMotifModelVisualizerV2._viz_motif_pwm(mgm_mm, motif_panels,
                                                     shift)

    # last row: MSA, shift prior, spacers
    summary_row = num_shifts
    msa_panel = plt.subplot2grid(grid, (summary_row, 0))
    prior_panel = plt.subplot2grid(grid, (summary_row, 1))
    spacer_panel = plt.subplot2grid(grid, (summary_row, 2))

    MGMMotifModelVisualizerV2._viz_spacer(mgm_mm, spacer_panel)
    MGMMotifModelVisualizerV2._viz_prior(mgm_mm, prior_panel)
    if msa_t is not None:
        MGMMotifModelVisualizerV2._viz_msa(msa_t, msa_panel)

    plt.suptitle("Gc range: {}".format(title))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name("."))
    plt.show()
def visualize(mgm_mm, title="", **kwargs):
    # type: (MGMMotifModel, str, Dict[str, Any]) -> None
    """Render the motif panels, prior, spacer and (optionally) logo and MSA.

    The top two rows hold four panels filled either from the model or, when
    ``raw_motif_data`` is given, from that data. The third row holds the
    prior and spacer panels. The logo and MSA panels in the last row are
    only drawn when ``msa_t`` is given. The figure is saved under a name
    from ``next_name(".")`` and shown.

    :param mgm_mm: motif model to draw.
    :param title: text inserted into the figure title.
    :param kwargs: optional ``msa_t`` and ``raw_motif_data`` (both default
        to None).
    """
    msa_t = get_value(kwargs, "msa_t", None)
    raw_motif_data = get_value(kwargs, "raw_motif_data", None)

    plt.figure(figsize=(10, 12))
    grid = (4, 2)

    # letters: rows 0-1, created in row-major order
    motif_panels = [
        plt.subplot2grid(grid, cell)
        for cell in ((0, 0), (0, 1), (1, 0), (1, 1))
    ]
    logo_panel = plt.subplot2grid(grid, (3, 0))
    prior_panel = plt.subplot2grid(grid, (2, 0))
    spacer_panel = plt.subplot2grid(grid, (2, 1))
    msa_panel = plt.subplot2grid(grid, (3, 1))

    if raw_motif_data is not None:
        MGMMotifModelVisualizer._viz_motif_pwm_from_raw_data(
            raw_motif_data, motif_panels, mgm_mm.motif_width())
    else:
        MGMMotifModelVisualizer._viz_motif_pwm(mgm_mm, motif_panels)

    MGMMotifModelVisualizer._viz_spacer(mgm_mm, spacer_panel)
    MGMMotifModelVisualizer._viz_prior(mgm_mm, prior_panel)

    if msa_t is not None:
        MGMMotifModelVisualizer._viz_logo(mgm_mm, logo_panel)
        MGMMotifModelVisualizer._viz_msa(msa_t, msa_panel)

    plt.suptitle("Gc range: {}".format(title))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name("."))
    plt.show()
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Box-plot the per-genome summary separately for steps A, B and C.

    For each step, the rows whose "Predicted-at-step" equals that step are
    summarised with ``get_summary_per_gcfid`` and tagged in a new
    "SBSP Step" column; the three summaries are stacked and plotted per
    "Ancestor". The plot is saved under a name from
    ``next_name(env['pd-work'])``.

    :param env: environment mapping; only ``env['pd-work']`` is read.
    :param df: DataFrame with a "Predicted-at-step" column plus whatever
        ``get_summary_per_gcfid`` needs.
    """
    pd_work = env['pd-work']

    def summarize_step(step):
        # summary restricted to predictions made exactly at this step
        summary = get_summary_per_gcfid(df[df["Predicted-at-step"] == step])
        summary["SBSP Step"] = step
        return summary

    df_per_gcfid_per_step = pd.concat(
        [summarize_step(step) for step in ("A", "B", "C")], sort=False)

    options = FigureOptions(save_fig=next_name(pd_work),
                            xlabel="Clade",
                            ylabel="Err(NCBI,GMS2=SBSP)")
    sns.catplot(df_per_gcfid_per_step,
                "Ancestor",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="SBSP Step",
                kind="box",
                legend_loc="best",
                figure_options=options)
def plot_for_5prime(df, bins=20):
    # type: (pd.DataFrame, int) -> None
    """Plot, per genome, the score histogram of each region as a line.

    One panel is drawn per genome on a two-row grid with shared axes. For
    every region the "score" values are binned with ``numpy.histogram`` and
    the counts are rescaled to percentages of that group. A dashed vertical
    line marks score 0.5. The figure is saved to a name from
    ``next_name(my_env["pd-work"])`` and shown; ``my_env`` is a free variable
    that must exist in the enclosing scope.

    :param df: DataFrame with "Genome", "region" and "score" columns.
    :param bins: forwarded to ``numpy.histogram``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn

    genomes = sorted(set(df["Genome"]))
    regions = sorted(set(df["region"]))
    num_genome = len(genomes)

    fig, axes = plt.subplots(2,
                             math.ceil(num_genome / 2),
                             sharey="all",
                             sharex="all")
    axes = axes.ravel()

    for i, g in enumerate(genomes):
        ax = axes[i]

        for x in regions:
            df_group = df[(df["Genome"] == g) & (df["region"] == x)]

            hist_values, bin_edges = np.histogram(df_group["score"],
                                                  bins=bins)
            hist_values = 100 * hist_values / hist_values.sum()
            # plot each bin at its right edge
            bin_edges = bin_edges[1:]

            # x/y are passed by keyword: this works on every seaborn release
            # that has lineplot, whereas positional x/y is rejected by
            # newer releases. Only the first panel contributes legend labels.
            seaborn.lineplot(x=bin_edges,
                             y=hist_values,
                             markers=False,
                             ax=ax,
                             label=x if i == 0 else None,
                             legend=False)

        ax.set_title(r"\textit{{{}}}".format(str(g)))
        ax.set_xlabel("")
        ax.set_ylim([0, None])
        # axvline's ymin/ymax are axes fractions (0..1), so the default
        # already spans the full panel height; no data-space limit needed.
        ax.axvline(0.5, color="grey", linestyle="dashed")

    plt.subplots_adjust(bottom=0.17)
    fig.legend(loc="lower center", ncol=len(regions))

    # invisible axis spanning the whole figure, used only for shared labels
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("Score", labelpad=25)
    plt.ylabel("Percentage per group", labelpad=30)

    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
def _set_usetex_font_sizes(small, medium, bigger):
    # type: (int, int, int) -> None
    """Update matplotlib rcParams for serif/usetex output with the given sizes."""
    matplotlib.rcParams.update({
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
        'font.size': small,  # controls default text sizes
        'axes.titlesize': small,  # fontsize of the axes title
        'axes.labelsize': medium,  # fontsize of the x and y labels
        'xtick.labelsize': small,  # fontsize of the tick labels
        'ytick.labelsize': small,  # fontsize of the tick labels
        'legend.fontsize': 12,  # legend fontsize
        'figure.titlesize': bigger,  # fontsize of the figure title
    })


def viz_per_genome(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot the number of BLASTp hits per genome and its cumulative distribution.

    Produces, in order: a categorical plot of the per-genome mean "BLAST"
    value by ancestor; two line plots of the cumulative percentage of
    queries with fewer than x hits (axes swapped in the second); and a 2x2
    grid with one cumulative panel per ancestor restricted to x <= 40.
    Output names come from ``next_name(env["pd-work"])``. matplotlib
    rcParams are modified globally as a side effect.

    :param env: environment mapping; only ``env["pd-work"]`` is read.
    :param df: DataFrame with "Genome", "Ancestor" and "BLAST" columns.
    """
    # Restrict the mean to numeric columns so that any other non-numeric
    # column in ``df`` cannot make the aggregation raise.
    df_mean = df.groupby(["Genome", "Ancestor"],
                         as_index=False).mean(numeric_only=True)

    sns.catplot(df_mean,
                "Ancestor",
                "BLAST",
                figure_options=FigureOptions(save_fig=next_name(
                    env["pd-work"]),
                                             xlabel="Clade",
                                             ylabel="Number of BLASTp Hits"),
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # Per genome: percentage of queries with fewer than x hits, for
    # x = 0, 5, 6, 7.2, ... (40 thresholds, growing by a factor of 1.2).
    list_entries = list()
    for _, df_genome in df.groupby("Genome", as_index=False):
        first = df_genome.index[0]
        genome = df.at[first, "Genome"]
        ancestor = df.at[first, "Ancestor"]
        total_queries = len(df_genome)

        curr = 0
        for _ in range(40):
            list_entries.append({
                "Genome": genome,
                "Ancestor": ancestor,
                "x": curr,
                "y": 100 * len(df_genome[df_genome["BLAST"] < curr]) /
                     total_queries
            })
            curr = 5 if curr == 0 else curr * 1.2

    df_tmp = pd.DataFrame(list_entries)

    _set_usetex_font_sizes(small=16, medium=22, bigger=24)

    sns.lineplot(df_tmp,
                 "x",
                 "y",
                 hue="Ancestor",
                 figure_options=FigureOptions(
                     xlabel="Number of BLASTp hits",
                     ylabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs={
                     "ci": "sd",
                     "palette": CM.get_map("ancestor")
                 })

    sns.lineplot(df_tmp,
                 "y",
                 "x",
                 hue="Ancestor",
                 figure_options=FigureOptions(
                     ylabel="Number of BLASTp hits",
                     xlabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs={
                     "ci": "sd",
                     "palette": CM.get_map("ancestor")
                 })

    _set_usetex_font_sizes(small=14, medium=18, bigger=20)

    fig, axes = plt.subplots(2, 2, sharex="all", sharey="all")
    ancestors = sorted(set(df["Ancestor"]))

    for anc, ax in zip(ancestors, axes.ravel()):
        df_anc = df_tmp[df_tmp["Ancestor"] == anc]
        sns.lineplot(df_anc[df_anc["x"] <= 40],
                     "x",
                     "y",
                     hue="Ancestor",
                     legend=None,
                     ax=ax,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })
        ax.set_title(anc)
        ax.set_xlabel("")
        ax.set_ylabel("")

    # NOTE(review): the name requested for ``save_fig`` here is never used
    # for saving (the figure is saved under a second ``next_name`` call
    # below); both calls are kept so output file naming stays unchanged.
    figure_options = FigureOptions(
        xlabel="Number of BLASTp hits",
        ylabel="Cumulative percentage of\nqueries (per genome)",
        save_fig=next_name(env["pd-work"]),
    )

    # invisible axis spanning the whole figure, used only for shared labels
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(figure_options.xlabel, labelpad=30)
    plt.ylabel(figure_options.ylabel, labelpad=30)

    fig.savefig(next_name(env["pd-work"]), bbox_inches="tight")
    plt.show()
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for steps A, A+B, A+B+C and write summary tables.

    The function:

    1. adds "Total SBSP", "Total GMS2" and "Total GMS2=SBSP" columns to
       ``df`` (per-GCFID sums; ``df`` is modified in place),
    2. summarises the rows with "Predicted-at-step" <= step for each of
       "A", "B", "C" and draws a 3x2 grid of line plots (left column: SBSP
       columns, right column: GMS2=SBSP columns),
    3. recomputes the "Total ..." columns restricted to rows where "NCBI"
       is also set, and writes "sensitivity.csv" and "coverage.csv" into
       ``env["pd-work"]``.

    :param env: environment mapping; only ``env["pd-work"]`` is read.
    :param df: DataFrame with "GCFID", "Predicted-at-step", "SBSP", "GMS2",
        "GMS2=SBSP" and "NCBI" columns plus whatever
        ``get_summary_per_gcfid`` needs.
    """
    import matplotlib.pyplot as plt

    tools = ["SBSP", "GMS2", "GMS2=SBSP"]

    # compute total number of predictions per tool, per genome
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index,
                   "Total {}".format(tool)] = df.loc[df_group.index,
                                                     tool].sum()

    # loop over steps A, A+B, and A+B+C and collect stats
    list_df = list()  # type: List[pd.DataFrame]
    tag = None
    for step in ["A", "B", "C"]:
        tag = step if tag is None else tag + "+" + step

        df_summary_per_gcfid = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        df_summary_per_gcfid["SBSP Step"] = tag
        list_df.append(df_summary_per_gcfid)

    df_per_gcfid_per_step = pd.concat(list_df, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    def plot_column(column, axis, figure_options, **extra):
        # one line per GCFID across the cumulative steps
        sns.lineplot(df_per_gcfid_per_step,
                     "SBSP Step",
                     column,
                     hue="GCFID",
                     ax=axis,
                     sns_kwargs={"palette": CM.get_map("verified")},
                     figure_options=figure_options,
                     **extra)

    # left column: SBSP
    left = axes[:, 0]
    plot_column("Sen(SBSP,NCBI)",
                left[0],
                # "\\%" yields the same backslash-percent text as before,
                # without relying on an invalid escape sequence
                FigureOptions(ylabel="Error rate (\\%)", ylim=[0, 20]),
                legend=False)
    plot_column("Cov(SBSP,NCBI)",
                left[1],
                FigureOptions(ylabel="Percentage\nof Genes", ylim=[0, None]),
                legend=False)
    plot_column("SBSP",
                left[2],
                FigureOptions(ylabel="Number\nof Genes", ylim=[0, None]),
                legend=False)
    fig.align_ylabels(left)

    # right column: GMS2=SBSP
    right = axes[:, 1]
    plot_column("Sen(GMS2=SBSP,NCBI)",
                right[0],
                FigureOptions(ylabel="Error", ylim=[0, None]),
                legend=False)
    plot_column("Cov(GMS2=SBSP,NCBI)",
                right[1],
                FigureOptions(ylabel="Percentage of Genes", ylim=[0, None]),
                legend=False)
    # the last panel is drawn with its legend so handles exist for the
    # figure-level legend; the per-axis legend itself is removed
    plot_column("GMS2=SBSP", right[2],
                FigureOptions(ylabel="Number of Genes", ylim=[0, None]))
    right[2].get_legend().remove()
    fig.align_ylabels(right)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title(TOOL)
    axes[0][1].set_title(TOOLp)

    fig.subplots_adjust(bottom=0.21)

    # handles are taken from the bottom-right panel (the last one in
    # row-major order), the only one plotted with legend entries
    handles, labels = axes[2][1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)

    plt.savefig(next_name(env["pd-work"]))
    plt.show()

    # totals restricted to rows that are also set in NCBI
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index, "Total {}".format(tool)] = (
                (df_group[tool]) & (df_group["NCBI"])).sum()

    df_all = get_summary_per_gcfid(df)

    # map column names for tables
    columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    output_columns = ["Genome", "Verified"] + tools

    for prefix, filename in (("Sen", "sensitivity.csv"),
                             ("Cov2", "coverage.csv")):
        renames = {"GCFID": "Genome", "NCBI": "Verified"}
        for tool in tools:
            renames["{}({},NCBI)".format(prefix, tool)] = tool

        df_table = df_all[columns].rename(columns=renames, inplace=False)
        df_table[output_columns].to_csv(os_join(env["pd-work"], filename),
                                        index=False)
def _add_query_percentages(df, sort_column):
    # type: (pd.DataFrame, str) -> None
    """Add per-ancestor (cumulative) percentage-of-queries columns to ``df`` in place.

    ``df`` gets its index reset (the old index becomes a column), and two
    columns are filled per "Ancestor" group: "Percentage-of-queries"
    (share of the group's "Number-of-queries") and
    "Cumulative-percentage-of-queries" (running sum in ascending
    ``sort_column`` order).
    """
    # float initial values: the columns are filled with floats below
    df["Percentage-of-queries"] = 0.0
    df["Cumulative-percentage-of-queries"] = 0.0
    df.reset_index(inplace=True)

    for _, df_group in df.groupby("Ancestor", as_index=False):
        # row labels of this group, ordered by the binned variable
        index = df_group.sort_values(sort_column).index
        total = df_group["Number-of-queries"].sum()

        percentages = 100 * df.loc[index, "Number-of-queries"] / float(total)
        df.loc[index, "Percentage-of-queries"] = percentages
        df.loc[index, "Cumulative-percentage-of-queries"] = \
            percentages.cumsum()


def one_dim_Kimura_accuracy(env, df_all, num_steps=20):
    # type: (Environment, pd.DataFrame, int) -> None
    """Plot accuracy and query counts binned by average and by std Kimura.

    The data is binned with ``bin_data_one_d`` twice: on "Average-Kimura"
    using all rows, and on "Std-Kimura" using only rows with "Support" > 2.
    For each binning four line plots (hue: "Ancestor") are saved under names
    from ``next_name(env["pd-work"])``: accuracy, number of queries,
    percentage of queries, and cumulative percentage of queries.

    :param env: environment mapping; ``env["pd-work"]`` is read here and
        ``env`` is forwarded to ``bin_data_one_d``.
    :param df_all: DataFrame with "Average-Kimura", "Std-Kimura" and
        "Support" columns plus whatever ``bin_data_one_d`` needs.
    :param num_steps: number of bins, forwarded to ``bin_data_one_d``.
    """
    import matplotlib.pyplot as plt

    pd_work = env["pd-work"]

    def plot_by_ancestor(data, x, y):
        # plain line plot with default figure options and ancestor palette
        sns.lineplot(data,
                     x,
                     y,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work), ),
                     sns_kwargs={"palette": CM.get_map("ancestor")})

    # average Kimura
    df = bin_data_one_d(env, df_all, "Average-Kimura", num_steps)
    plot_by_ancestor(df, "Average-Kimura", "Accuracy")
    plot_by_ancestor(df, "Average-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Average-Kimura")

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(df,
                 "Average-Kimura",
                 "Percentage-of-queries",
                 hue="Ancestor",
                 figure_options=FigureOptions(save_fig=next_name(pd_work),
                                              ylabel="Percentage of queries",
                                              xlabel="Average Kimura"),
                 ax=ax,
                 show=True,
                 legend_loc="best",
                 sns_kwargs={"palette": CM.get_map("ancestor")})
    plot_by_ancestor(df, "Average-Kimura", "Cumulative-percentage-of-queries")

    # standard dev
    df = bin_data_one_d(env, df_all[df_all["Support"] > 2], "Std-Kimura",
                        num_steps)
    plot_by_ancestor(df, "Std-Kimura", "Accuracy")
    plot_by_ancestor(df, "Std-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Std-Kimura")

    plot_by_ancestor(df, "Std-Kimura", "Percentage-of-queries")
    plot_by_ancestor(df, "Std-Kimura", "Cumulative-percentage-of-queries")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis, write its summary and plot it.

    The analysis runs directly, or through ``PBS`` when the parallelization
    option "use-pbs" is set. The resulting table is written to
    "summary.csv" in ``env["pd-work"]`` and ten plots are saved into the
    "summary_figures" sub-directory.

    :param env: environment mapping; ``env["pd-work"]`` is read here.
    :param args: parsed command-line arguments; ``args.pf_genome_list`` is
        read and ``vars(args)`` feeds the parallelization options.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, gil, prl_options)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    def error_options():
        # options shared by every "... vs Error" plot
        return FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures))

    def plain_options():
        return FigureOptions(save_fig=next_name(pd_figures))

    sns.scatterplot(df, "Percent", "Error", figure_options=error_options())

    for x_column in ("RE", "RE Motif", "RE Spacer"):
        sns.lineplot(df,
                     x_column,
                     "Error",
                     hue="Genome",
                     figure_options=error_options())

    sns.scatterplot(df,
                    "RE Motif",
                    "RE Spacer",
                    hue="Genome",
                    identity=True,
                    figure_options=plain_options())

    for x_column in ("Percent", "RE", "RE Motif", "RE Spacer"):
        sns.lmplot(df,
                   x_column,
                   "Error",
                   hue="Genome",
                   figure_options=error_options())

    sns.lmplot(df,
               "Percent",
               "RE",
               hue="Genome",
               figure_options=plain_options())
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Draw a multi-panel summary figure for the motif stored in ``col``.

    Panels: a box/line plot of per-position probability for each of (up to)
    four letters, a sequence logo built from the collected alignment, a bar
    plot of the alignment shifts, the averaged position distribution per
    shift, and a text rendering of the reduced alignment. The figure is
    saved to the next free file name in ``env["pd-work"]`` and then shown.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param df: table whose ``col`` cells are dicts mapping a letter to a
        list of per-position values.
    :param col: name of the motif column. The position-distribution column
        name is derived by replacing ``"_MAT"`` with ``"_POS_DISTR"``.
    :param title: text inserted into the figure's super-title.
    :raises ValueError: if a collected probability lies outside ``[0, 1]``.
    """
    from matplotlib.font_manager import FontProperties

    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    # Third-axis index of each letter in ``array`` follows sorted order.
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    # Number of positions, starting at a row's shift, that are collected for
    # that row. NOTE(review): presumably the un-extended motif width - confirm.
    motif_width = 6

    plt.figure(figsize=(10, 12))
    shape = (4, 2)
    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))
    letter_axes = [ax1, ax2, ax3, ax4]

    # --- per-letter probability by position ---
    ylim = [-0.1, 1.1]
    for letter, ax in zip(letters, letter_axes):
        letter_idx = letter_to_idx[letter]
        positions = list()
        probabilities = list()

        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                # Keep only positions inside this row's shifted window.
                if w_pos < shift or w_pos >= shift + motif_width:
                    continue

                probability = array[index, w_pos, letter_idx]
                if probability < 0 or probability > 1:
                    raise ValueError("Something's up")

                positions.append(w_pos)
                probabilities.append(probability)

        ax.set_title(f"{letter}")

        df_letter = pd.DataFrame({
            "Position": positions,
            "Probability": probabilities
        })
        df_letter.sort_values("Position", inplace=True)
        df_mean = df_letter.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: recent seaborn releases no longer
        # accept them positionally, older ones accept both forms.
        seaborn.boxplot(x="Position", y="Probability", data=df_letter, ax=ax,
                        color="red", fliersize=0)
        seaborn.lineplot(x=df_mean["Position"], y=df_mean["Probability"],
                         ax=ax, color="blue")
        ax.set_ylim(ylim)

    # --- sequence logo ---
    msa_t = collect["msa_t"]
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')
    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')
    lm.Logo(info_mat, ax=ax_logo, color_scheme="classic")
    ax_logo.set_ylim([0, 2])

    # --- percentage of rows per shift ---
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # Shifts 0..3 that never occur are still drawn, as zero-height bars.
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / total] for x in counter
                  ] + [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0], y=normalized[:, 1], ax=ax_counts,
                    color="blue")
    ax_counts.set_ylim([0, 100])
    ax_counts.set_ylabel("Probability")
    ax_counts.set_xlabel("Shift in consensus")

    # --- position distribution, averaged per shift ---
    col_pos = col.replace("_MAT", "_POS_DISTR")
    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)

    for s in sorted(shift_to_pos_dist.keys()):
        values = dict()
        for distribution in shift_to_pos_dist[s]:
            try:
                for i in distribution.keys():
                    if i not in values:
                        values[i] = list()
                    values[i].append(distribution[i])
            except Exception:
                # Deliberate best effort: entries that cannot be iterated
                # like a dict are skipped.
                continue

        for i in values:
            values[i] = np.mean(values[i])

        # Normalize the averaged distribution so that it sums to one.
        total = sum(values.values())
        for i in values:
            values[i] /= total

        x = sorted(values.keys())
        y = [values[a] for a in x]
        seaborn.lineplot(x=x, y=y, label=s, ax=ax_pos_dist)

    ax_pos_dist.legend()

    # --- text rendering of the reduced alignment ---
    fp = FontProperties()
    fp.set_family("monospace")

    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax_text.text(0, 0, msa_text,
                 horizontalalignment='left',
                 verticalalignment='center',
                 fontproperties=fp)
    ax_text.set_xlim([-0.2, 0.4])
    ax_text.set_ylim([-0.4, 0.4])
    ax_text.get_xaxis().set_visible(False)
    ax_text.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def heat_map_Kimura_accuracy(env, df_all, x, y, num_steps=20, balance=False,
                             xlabel=None, ylabel=None):
    # type: (Environment, pd.DataFrame, str, str, int, bool, Optional[str], Optional[str]) -> None
    """Plot one accuracy heat map per ancestor over a 2-D grid of ``x``/``y``.

    For every ``"Ancestor"`` group the ``x`` and ``y`` ranges are cut into
    ``num_steps`` equal bins. A cell's value is the number of rows with
    ``"GMS2=SBSP=NCBI"`` divided by the number of rows with both
    ``"GMS2=SBSP"`` and ``"NCBI"``; cells with fewer than 10 such rows get a
    zero denominator and therefore no finite value. All panels share one
    colour bar. The figure is saved to the next free name in
    ``env["pd-work"]`` and then shown.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param df_all: table with the columns named above plus ``x`` and ``y``.
    :param x: column binned along the horizontal axis.
    :param y: column binned along the vertical axis.
    :param num_steps: number of bins per axis.
    :param balance: if true, both axes use the same (combined) value range.
    :param xlabel: shared x-axis label; defaults to ``x``.
    :param ylabel: shared y-axis label; defaults to ``y``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn

    ancestors = sorted(set(df_all["Ancestor"]))

    fig, axes = plt.subplots(2,
                             math.ceil(len(ancestors) / 2),
                             sharex=True,
                             sharey=True)
    # Dedicated axes on the right-hand side for the shared colour bar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    # Roughly five ticks per axis; never a zero step for small grids.
    ticks = list(range(0, num_steps, max(1, int(num_steps / 5))))

    groups = df_all.groupby("Ancestor", as_index=False)
    for ax, (ancestor, df) in zip(axes.ravel(), groups):
        # The small epsilon keeps the maximum value inside the last bin.
        min_x = min(df[x])
        max_x = max(df[x]) + 0.000000001
        min_y = min(df[y])
        max_y = max(df[y]) + 0.000000001

        if balance:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)

        ss_x = (max_x - min_x) / float(num_steps)
        ss_y = (max_y - min_y) / float(num_steps)

        gms2_eq_sbsp_and_ncbi = np.zeros([num_steps, num_steps], dtype=float)
        gms2_eq_sbsp_eq_ncbi = np.zeros([num_steps, num_steps], dtype=float)

        for index in df.index:
            x_pos = int((df.at[index, x] - min_x) / ss_x)
            y_pos = int((df.at[index, y] - min_y) / ss_y)

            if df.at[index, "GMS2=SBSP"] and df.at[index, "NCBI"]:
                gms2_eq_sbsp_and_ncbi[x_pos][y_pos] += 1
            if df.at[index, "GMS2=SBSP=NCBI"]:
                gms2_eq_sbsp_eq_ncbi[x_pos][y_pos] += 1

        # Sparsely populated cells are blanked via a zero denominator.
        gms2_eq_sbsp_and_ncbi[gms2_eq_sbsp_and_ncbi < 10] = 0
        accuracy = np.divide(gms2_eq_sbsp_eq_ncbi, gms2_eq_sbsp_and_ncbi)

        l_x = np.arange(min_x, max_x, ss_x)
        l_y = np.arange(min_y, max_y, ss_y)
        xticklabels = [round(l_x[i], 2) for i in ticks]
        yticklabels = [round(l_y[i], 2) for i in ticks]

        # Counts are indexed [x][y]; transpose so that x runs horizontally.
        g = seaborn.heatmap(accuracy.transpose(),
                            vmin=0,
                            vmax=1,
                            xticklabels=xticklabels,
                            yticklabels=yticklabels,
                            ax=ax,
                            cbar=False)

        g.invert_yaxis()
        g.set_xticks(ticks)
        g.set_yticks(ticks)
        g.set_xticklabels(xticklabels, rotation=0)
        g.set_title(ancestor)

        # The last panel's mesh feeds the shared colour bar (same vmin/vmax).
        mappable = ax.collections[0]

    # Invisible full-figure axes that carry the shared axis labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(x if xlabel is None else xlabel, labelpad=20)
    plt.ylabel(y if ylabel is None else ylabel, labelpad=30)

    plt.colorbar(mappable, cax=cbar_ax)
    fig.tight_layout(rect=[0, 0, .9, 1])
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot agreement probabilities of two predictors under three models.

    The table comes from ``compute_data`` for the "Random", "Independent"
    and "Fully dependent" conditions. Several line plots of it are drawn and
    saved to the next free file names in the current directory (``"."``).

    :param max_candidates: forwarded to ``compute_data`` and to
        ``plot_sensitivities_vs_num_candidates``.
    :param sen_a: forwarded to ``plot_sensitivities_vs_num_candidates``.
    :param sen_b: forwarded to ``plot_sensitivities_vs_num_candidates``.
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)
    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row selections reused by several figures. They stay valid after the
    # extra column is added below because the index does not change.
    # NOTE(review): these rely on exact float equality with the values that
    # compute_data stores - confirm they are generated as exact literals.
    both_sen_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sen = df["Sensitivity A"] == df["Sensitivity B"]
    equal_sen_25 = equal_sen & (df["Number of candidates"] == 25)

    def condition_palette():
        return {"palette": CM.get_map("independence-conditions")}

    sns.lineplot(
        df[both_sen_09],
        "Number of candidates",
        "Probability",
        hue="Condition",
        sns_kwargs=condition_palette(),
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y=s|x_1=y, x_2=y)$",
        ))

    # error
    df["1 - Probability"] = 1 - df["Probability"]
    sns.lineplot(
        df[both_sen_09],
        "Number of candidates",
        "1 - Probability",
        hue="Condition",
        sns_kwargs=condition_palette(),
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y\neq s|x_1=y, x_2=y)$",
        ))

    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    sns.lineplot(df[both_sen_09],
                 "Number of candidates",
                 "Probability",
                 hue="Condition",
                 sns_kwargs=condition_palette(),
                 ax=axes[0],
                 legend=False,
                 figure_options=FigureOptions(title="Sensitivity = 0.9", ))

    sns.lineplot(df[equal_sen_25],
                 "Sensitivity A",
                 "Probability",
                 hue="Condition",
                 ax=axes[1],
                 sns_kwargs=condition_palette(),
                 figure_options=FigureOptions(
                     ylim=[0, 1.05],
                     xlim=[0, 1],
                     xlabel="Sensitivity",
                     title="Number of candidates = 25",
                 ))

    save_figure(FigureOptions(save_fig=next_name(".")), fig)
    plt.show()

    # rename() returns a new frame; renaming the filtered slice in place
    # would trigger pandas' SettingWithCopyWarning.
    df_tmp = df[equal_sen & (df["Condition"] == "Independent") &
                (df["Sensitivity A"].isin(
                    {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))].rename(
                        columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        df_tmp,
        "Number of candidates",
        "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(title="Independent algorithms",
                                     save_fig=next_name(".")),
    )

    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    sns.lineplot(df[both_sen_09],
                 "Number of candidates",
                 "Agree given prediction",
                 hue="Condition",
                 sns_kwargs=condition_palette(),
                 ax=axes[0],
                 legend=False,
                 figure_options=FigureOptions(title="Sensitivity = 0.9", ))

    sns.lineplot(df[equal_sen_25],
                 "Sensitivity A",
                 "Agree given prediction",
                 hue="Condition",
                 ax=axes[1],
                 sns_kwargs=condition_palette(),
                 figure_options=FigureOptions(
                     ylim=[0, 1.05],
                     xlim=[0, 1],
                     xlabel="Sensitivity",
                     title="Number of targets = 25",
                 ))

    save_figure(FigureOptions(save_fig=next_name(".")), fig)
    plt.show()
def viz_summary_per_gcfid(env, df, title=None):
    # type: (Environment, pd.DataFrame, Optional[str]) -> None
    """Save a fixed series of per-genome summary plots, coloured by ancestor.

    Every figure is written to the next free file name in
    ``env["pd-work"]`` and carries ``title``.

    :param env: environment; only ``env['pd-work']`` is read here.
    :param df: summary table; the columns used are the ones named in the
        plotting calls below.
    :param title: title applied to every figure.
    """
    pd_work = env['pd-work']

    error_column = "(GMS2=SBSP)!=NCBI % GMS2=SBSP"
    error_label = "1 - Sen(NCBI, GMS2=SBSP)"

    def options(**extra):
        # New options object and new output file name for each figure.
        return FigureOptions(save_fig=next_name(pd_work), title=title, **extra)

    def ancestor_kwargs(**extra):
        merged = {"palette": CM.get_map("ancestor")}
        merged.update(extra)
        return merged

    def scatter_with_lowess():
        return ancestor_kwargs(scatter=True, lowess=True, scatter_kws={"s": 5})

    # Box plots per clade.
    sns.catplot(df, "Ancestor", "GMS2=SBSP % SBSP", kind="box",
                figure_options=options(ylim=[None, 100]),
                sns_kwargs=ancestor_kwargs())

    sns.catplot(df, "Ancestor", error_column, kind="box",
                figure_options=options(ylim=[0, 20],
                                       ylabel=error_label,
                                       xlabel="Clade"),
                sns_kwargs=ancestor_kwargs())

    # per GC
    sns.scatterplot(df, "Genome GC", error_column, hue="Ancestor",
                    figure_options=options(ylim=[0, None]),
                    legend_loc="best",
                    sns_kwargs=ancestor_kwargs())

    # per GC
    sns.lmplot(df, "Genome GC", error_column, hue="Ancestor",
               figure_options=options(ylim=[0, None], ylabel=error_label),
               sns_kwargs=ancestor_kwargs(scatter=False, lowess=True))

    sns.lmplot(df, "Genome GC", error_column, hue="Ancestor",
               figure_options=options(ylim=[0, None], ylabel=error_label),
               legend_loc="best",
               sns_kwargs=ancestor_kwargs(scatter=True,
                                          lowess=True,
                                          scatter_kws={"s": 5},
                                          aspect=1.5))

    gc_panels = (
        ("GMS2=SBSP", [0, None]),
        ("GMS2=SBSP % SBSP", [50, 100]),
        ("GMS2=SBSP % GMS2", [50, 100]),
    )
    for y_column, y_limits in gc_panels:
        sns.lmplot(df, "Genome GC", y_column, hue="Ancestor",
                   figure_options=options(ylim=y_limits),
                   sns_kwargs=scatter_with_lowess())

    for x_column in ("NCBI", "GMS2=SBSP"):
        sns.scatterplot(df, x_column, error_column, hue="Ancestor",
                        figure_options=options(ylim=[0, None]),
                        sns_kwargs=ancestor_kwargs())

    # per GC
    sns.scatterplot(df, "Genome GC", "(GMS2=SBSP)!=Prodigal % GMS2=SBSP",
                    hue="Ancestor",
                    figure_options=options(ylim=[0, None]),
                    sns_kwargs=ancestor_kwargs())
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative steps, then print totals.

    ``df`` is modified in place: the columns ``"Total SBSP"``,
    ``"Total GMS2"`` and ``"Total GMS2=SBSP"`` are written per ``"GCFID"``
    group (twice, with different definitions). A 3x2 grid of line plots is
    saved to the next free name in ``env['pd-work']`` and shown, two
    summary tables are printed, and finally the interpreter exits.

    :param env: environment; only ``env['pd-work']`` is read here.
    :param df: per-prediction table; the columns used are the ones
        referenced below.
    """
    import matplotlib.pyplot as plt
    import sys

    pd_work = env['pd-work']
    tools = ("SBSP", "GMS2", "GMS2=SBSP")

    # Per-genome totals of each tool's column.
    for _, group in df.groupby("GCFID", as_index=False):
        rows = group.index
        for tool in tools:
            df.loc[rows, "Total " + tool] = df.loc[rows, tool].sum()

    # Summaries for the cumulative step sets A, A+B and A+B+C.
    steps = ["A", "B", "C"]
    summaries = list()
    for count, step in enumerate(steps, start=1):
        summary = get_summary_per_gcfid(df[df["Predicted-at-step"] <= step])
        summary["SBSP Step"] = "+".join(steps[:count])
        summaries.append(summary)

    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    def draw(column, axis, ylabel, ylim, **extra):
        sns.lineplot(df_per_gcfid_per_step,
                     "SBSP Step",
                     column,
                     hue="GCFID",
                     ax=axis,
                     sns_kwargs={"palette": CM.get_map("verified")},
                     figure_options=FigureOptions(ylabel=ylabel, ylim=ylim),
                     **extra)

    # Left column: SBSP.
    left = axes[:, 0]
    draw("Sen(SBSP,NCBI)", left[0], "Sensitivity", [85, 105], legend=False)
    draw("Cov(SBSP,NCBI)", left[1], "Percent of Genes", [0, None],
         legend=False)
    draw("SBSP", left[2], "Number of Genes", [0, None], legend=False)
    fig.align_ylabels(left)

    # Right column: GMS2=SBSP. The bottom panel is drawn with its legend so
    # that handles exist for the figure-level legend; the per-axes legend
    # itself is removed again right away.
    right = axes[:, 1]
    draw("Sen(GMS2=SBSP,NCBI)", right[0], "Sensitivity", [85, 105],
         legend=False)
    draw("Cov(GMS2=SBSP,NCBI)", right[1], "Percent of Genes", [0, None],
         legend=False)
    draw("GMS2=SBSP", right[2], "Number of Genes", [0, None])
    right[2].get_legend().remove()
    fig.align_ylabels(right)

    for axis in axes.ravel():
        axis.set_xlabel("Steps")

    axes[0][0].set_title("SBSP")
    axes[0][1].set_title("GMS2=SBSP")

    fig.subplots_adjust(bottom=0.21)

    # Handles come from the bottom-right panel (the last axes of the grid).
    handles, labels = axes.ravel()[-1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)

    plt.savefig(next_name(pd_work))
    plt.show()

    # Totals restricted to rows that also have an NCBI value.
    for _, group in df.groupby("GCFID", as_index=False):
        rows = group.index
        for tool in tools:
            df.loc[rows, "Total " + tool] = (
                (group[tool]) & (group["NCBI"])).sum()

    df_all = get_summary_per_gcfid(df)

    sensitivity_columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)"
    ]
    coverage_columns = [
        "GCFID", "NCBI", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    print(df_all[sensitivity_columns].to_string(index=False))
    print(df_all[coverage_columns].to_string(index=False))

    # NOTE(review): this terminates the whole process from inside a plotting
    # helper; it looks like a leftover - confirm before relying on it.
    sys.exit()
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot, per tool tag, the ``"M:<tag>"`` column against ``"GC"``.

    One panel per tag returned by ``get_tags_for_5prime`` is drawn with
    ``loess_with_stde`` on a two-row grid with shared axes. The first panel
    also draws into a dedicated colour-bar axes. The figure is saved to the
    next free file name in ``env["pd-work"]`` and shown.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param df: table with a ``"GC"`` column and one ``"M:<tag>"`` column
        per tag.
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)

    # At least one column so that an empty tag list still yields a figure.
    fig, ax = plt.subplots(2,
                           max(1, math.ceil(num_tags / 2)),
                           sharey="all",
                           sharex="all")
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    cbar_enable = {
        "cbar_ax": cbar_ax,
        "cbar": True,
    }

    # Every tag gets a panel. Pairing the tags with a fixed-length list
    # here would silently drop all tags beyond that list's length.
    for counter, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        # Only the first panel draws the shared colour bar.
        cbar_kws = cbar_enable if counter == 0 else dict()
        loess_with_stde(df, "GC", f"M:{tag}", a, tag.replace("=", ","),
                        **kws, **cbar_kws)

        a.set_title(
            tag.replace("=", ",").replace("NCBI",
                                          "PGAP").replace("GMS2",
                                                          "GeneMarkS-2"))
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Invisible full-figure axes that carry the shared axis labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])
    save_figure(figure_options, fig)
    plt.show()
def visualize_matrix_column(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Embed the matrices of column ``col`` with UMAP and plot the embedding.

    Rows where ``col`` is NA are dropped. Each remaining matrix is flattened
    to one feature vector, a UMAP model is fitted, and three figures are
    saved to ``env["pd-work"]`` and shown: points coloured by ``"GC"``,
    labelled by ``"GENOME_TYPE"``, and labelled by ``"Type"``.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param df: table containing ``col``, ``"GC"``, ``"GENOME_TYPE"`` and
        ``"Type"``.
    :param col: name of the matrix column; also used as figure title.
    """

    def finish_figure():
        # Shared tail of every figure: title, layout, save, show.
        plt.title(col)
        plt.tight_layout()
        save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
        plt.show()

    # Only rows that actually have a value in ``col`` are usable.
    has_value = ~df[col].isna()
    df = df[has_value]

    # Kept from the original; not used further down.
    monospace = FontProperties()
    monospace.set_family("monospace")

    # One flat feature vector per row: the last two axes are merged.
    stacked = create_numpy_for_column(df, col)
    features = stacked.reshape(
        (stacked.shape[0], stacked.shape[1] * stacked.shape[2]))

    gc_content = df["GC"]
    genome_type = df["GENOME_TYPE"]

    # Fixed seed so that the embedding is reproducible.
    reducer = umap.UMAP(random_state=0).fit(features)
    print(reducer.embedding_.shape)

    # Coloured by GC, with a matching colour bar.
    umap.plot.points(reducer, values=gc_content, cmap="viridis")
    plt.colorbar(create_mappable_for_colorbar(gc_content, "viridis"))
    finish_figure()

    # Labelled by genome type.
    umap.plot.points(reducer,
                     labels=genome_type.values,
                     color_key_cmap="Paired")
    finish_figure()

    # Labelled by the "Type" column.
    umap.plot.points(reducer, labels=df["Type"])
    finish_figure()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot per-genome accuracy against chunk size for each tool.

    Reads the CSV at ``args.pf_data``, divides ``"chunk-size"`` by 1000 and
    draws one line per genome for the tools "SBSP" (dashed) and "GMS2" on a
    shared axes. If ``args.with_mgm`` is set, the MGM region is marked; if
    the data contains an "MGM" tool, its lines are added (dotted). The
    final figure is saved to the next free name in ``env["pd-work"]`` and
    shown.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param args: parsed command-line arguments; ``pf_data`` and
        ``with_mgm`` are read.
    """
    import matplotlib.pyplot as plt

    accuracy_col = "percentage-common-3prime-and-5prime-from-common-3prime"

    df = pd.read_csv(args.pf_data)
    df["chunk-size"] /= 1000

    fig, ax = plt.subplots()

    sns.lineplot(df[df["Tool"] == "SBSP"], "chunk-size", accuracy_col,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified"),
                             "linestyle": "dashed"},
                 ax=ax,
                 legend=False,
                 figure_options=FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74, 101],
                     save_fig=next_name(env["pd-work"])
                 ))

    # Force the dashed style on everything drawn so far (the SBSP lines).
    for line in ax.lines:
        line.set_linestyle("--")

    sns.lineplot(df[df["Tool"] == "GMS2"], "chunk-size", accuracy_col,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified")},
                 legend_loc="best",
                 legend_ncol=2,
                 ax=ax)

    if args.with_mgm:
        # axvline/axhline take their extent as axes fractions (0..1), not
        # data values. The vertical marker spans the full height; the
        # horizontal segment is given in data coordinates via hlines.
        ax.axvline(50, color="grey", linestyle="dashed")
        ax.hlines(74, 5, 49, colors="grey", linestyles="dashed")
        ax.annotate("MGM", (5, 72))

    # NOTE(review): treated as independent of ``args.with_mgm`` - confirm.
    if "MGM" in set(df["Tool"]):
        num_lines_before = len(ax.lines)
        sns.lineplot(df[df["Tool"] == "MGM"], "chunk-size", accuracy_col,
                     hue="Genome",
                     sns_kwargs={"palette": CM.get_map("verified"),
                                 "linestyle": "-."},
                     ax=ax,
                     legend=False)

        # Restyle exactly the lines added by the MGM plot, however many
        # genomes it contains.
        for line in ax.lines[num_lines_before:]:
            line.set_linestyle(":")

    fo = FigureOptions(
        xlabel="Chunk size (mb)",
        ylabel="Accuracy",
        ylim=[74, 101],
        save_fig=next_name(env["pd-work"])
    )
    FigureOptions.set_properties_for_axis(ax, fo)
    plt.savefig(fo.save_fig)
    plt.show()
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Analyze and plot the ``"Upstream-distance"`` lists of ``df``.

    Rows whose ``"Upstream-distance"`` is the string ``"[]"`` are dropped
    and the remaining strings are parsed into Python objects. Per row, the
    most frequent distance and two consistency scores (``"PC(x,0)"`` and
    ``"PC(x,3)"``) are computed; rows with ``"Support"`` <= 10 are then
    dropped. A series of figures is produced; those with a file name are
    saved in the ``upstream_distances`` sub-directory of
    ``env["pd-work"]``.

    :param env: environment; only ``env["pd-work"]`` is read here.
    :param df: per-query table; the columns used are the ones referenced
        below. The caller's frame is not modified (a copy is taken first).
    """
    import seaborn
    import matplotlib.pyplot as plt

    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # Consistency plots use every remaining row in the distance window ...
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]

    # ... everything afterwards additionally requires GMS2=SBSP.
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)", "Ancestor"]],
        ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    # Distances close to zero, per clade: first as percentages ...
    df_near = df[(df["Most frequent upstream"] < 10) &
                 (df["Most frequent upstream"] > -10)]

    (df_near.groupby("Ancestor")["Most frequent upstream"].value_counts(
        normalize=True).mul(100).rename(
            'Percentage (by clade)').reset_index().pipe(
                (seaborn.catplot, 'data'),
                x="Most frequent upstream",
                y='Percentage (by clade)',
                hue="Ancestor",
                kind='point',
                scale=0.5,
                legend=False,
                palette=CM.get_map("ancestor"),
                aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)
    plt.show()

    # ... then as absolute counts.
    (df_near.groupby("Ancestor")["Most frequent upstream"].value_counts(
    ).rename('number').reset_index().pipe((seaborn.catplot, 'data'),
                                          x="Most frequent upstream",
                                          y='number',
                                          hue="Ancestor",
                                          kind='point',
                                          scale=0.5,
                                          legend=False,
                                          palette=CM.get_map("ancestor"),
                                          aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)
    plt.show()

    # Histogram (left axis) and density (right axis) per clade.
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
    ax1.set_xlabel('x var')
    ax1.set_ylabel('Counts')
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    for consistency_column in ("PC(x,0)", "PC(x,3)"):
        sns.lmplot(df,
                   "Most frequent upstream",
                   consistency_column,
                   hue="Ancestor",
                   sns_kwargs={
                       "scatter": False,
                       "lowess": True,
                       "palette": CM.get_map("ancestor")
                   },
                   figure_options=FigureOptions(save_fig=next_name(pd_work),
                                                xlim=[-7, None],
                                                ylim=[0, 1]))

    # Half-open distance ranges [low, high).
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]

    # Summary per genome, then averaged per ancestor.
    list_collect = list()
    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()
    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # Percentage of rows with both GMS2=SBSP and NCBI set whose
            # "(GMS2=SBSP)!=NCBI" flag is set as well.
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    ancestors = list(set(df_tmp["Ancestor"]))
    # squeeze=False: always a 2-D array of axes, also for a single ancestor.
    fig, axes = plt.subplots(len(ancestors), 1, sharex="all", squeeze=False)
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        # x/y are passed by keyword: recent seaborn releases no longer
        # accept them positionally, older ones accept both forms.
        seaborn.lineplot(x="range_avg",
                         y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot(x="range_avg",
                         y="GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)

    ax.set_xlabel("Range Average")
    plt.xticks(range_avgs, range_label)
    plt.show()

    # All clades on one pair of axes: error rate (left), count (right).
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot(x="range_avg",
                     y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot(x="range_avg",
                     y="GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Draw one information-content logo per GC bin on a shared grid.

    Loads the object stored at ``args.pf_data``, keeps the rows whose
    ``GENOME_TYPE`` is listed in ``args.group`` and asks ``get_models_by_gc``
    for one model per GC value (20, 22, ..., 68).  Each non-``None`` model is
    indexed as ``model[0]`` (probability matrix) and ``model[1]``
    (background) and rendered with logomaker.  The figure is written to the
    next free file name under ``env["pd-work"]`` and then shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read here.
    :param args: parsed arguments providing ``pf_data``, ``group`` and
        ``motif_type``.
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]

    min_gc = 20
    max_gc = 70

    # Promoter motifs are only built from genomes with GC >= 40.
    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    gc_values = np.arange(min_gc, max_gc, 2)
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    # Near-square grid: floor(sqrt(n)) rows and enough columns to hold all n.
    num_plots = len(models)
    num_rows = int(math.sqrt(num_plots))
    # int(...) because math.ceil returns a float on Python 2.
    num_cols = int(math.ceil(num_plots / float(num_rows)))

    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column, so axes[row][col] is always valid.
    fig, axes = plt.subplots(num_rows, num_cols, sharex="all", sharey="all",
                             figsize=(12, 10), squeeze=False)

    for model_index, model in enumerate(models):
        # A missing model still occupies its grid cell, which stays empty.
        if model is None:
            continue

        # Cells are filled row by row, one cell per model index.
        row, col = divmod(model_index, num_cols)
        ax = axes[row][col]

        info_matrix = lm.transform_matrix(
            model[0],
            to_type="information",
            from_type="probability",
            background=model[1],
        )
        lm.Logo(info_matrix, ax=ax)

        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and toolp RBS motifs per genome, then plot and export.

    For every genome in the list read from ``args.pf_genome_list`` this:

    * obtains the two models from ``compare_gms2_and_toolp_motifs_for_gi``,
    * saves a side-by-side logo figure under ``<pd-work>/figures``,
    * computes ``relative_entropy`` of each motif against the GMS2
      non-coding model, and the genome GC from its ``sequence.fasta``,
    * records one row (``args.verified`` false) or two rows, one per tool
      (``args.verified`` true), in the run summary.

    After the loop the summary is written to CSV and plotted through
    ``sbsp_viz.sns``; which plots are produced depends on ``args.verified``.

    :param env: environment mapping; ``env["pd-work"]`` and
        ``env["pd-data"]`` are read.
    :param args: parsed arguments providing ``pf_genome_list`` and
        ``verified``.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_work = env["pd-work"]
    pd_figures = os_join(pd_work, "figures")
    mkdir_p(pd_figures)

    list_run_info = []

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # Second dash-separated token of GENOME_TYPE, upper-cased.
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # Both panels use the same probability -> information transform
        # (no explicit background is passed for either motif).
        panels = (
            (axes[0], df_gms2, "GeneMarkS-2"),
            (axes[1], df_toolp, "StartLink+"),
        )
        for ax, df_pwm, title in panels:
            info_mat = lm.transform_matrix(df_pwm, from_type="probability",
                                           to_type="information")
            lm.Logo(info_mat, color_scheme="classic", ax=ax)
            ax.set_ylim(0, 2)
            ax.set_title(title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this, one figure per genome accumulates for the whole run.
        plt.close(fig)

        # Both motifs are scored against the GMS2 non-coding model.
        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            # NOTE(review): this value is stored under "Accuracy" but is later
            # plotted as "Percentage of different 5' ends" and printed as
            # "Average Error" -- confirm which meaning is intended.
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            genome_label = fix_names(gi.name)

            # One row per tool: comp[0] pairs with GMS2, comp[1] with GMS2 + SL.
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

        # Progress/debug output: the most recently collected entries.
        print(list_run_info[-2:])

    import sbsp_viz.sns as sns

    # NOTE: next_name is called in exactly the original order below, so the
    # sequence of generated output file names is unchanged.
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(pd_work, ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="Genome",
            ylabel="Relative entropy",
        ))
    else:
        # First pass: all genomes.
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy", figure_options=FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="GC",
            ylabel="Percentage of different 5' ends",
            ylim=[0, 10],
        ))
        df.to_csv(next_name(pd_work, ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(pd_work)
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Second pass: same plots restricted to rows with "Accuracy" < 2.
        # The frame is rebuilt from the raw records before filtering.
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy", figure_options=FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="GC",
            ylabel="Percentage of different 5' ends",
            ylim=[0, 10],
        ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(pd_work)
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(pd_work, ext="csv"))