def kimura_dist_plot(env, df):
    """Overlay one "Average-Kimura" density curve per ancestor on a single axes.

    A curve is drawn for every distinct value in ``df["Ancestor"]`` and
    coloured through ``CM.get_map("ancestor")``. The figure is saved under a
    fresh name generated from ``env["pd-work"]`` and then shown.

    :param env: mapping providing the "pd-work" entry used to name the output
    :param df: data frame with "Ancestor" and "Average-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    # Sorted so the drawing and legend order is the same on every run;
    # iterating a bare set of strings is not stable between interpreter
    # sessions.
    ancestors = sorted(set(df["Ancestor"]))
    palette = CM.get_map("ancestor")

    _, ax = plt.subplots()

    for anc in ancestors:
        df_group = df[df["Ancestor"] == anc]
        seaborn.distplot(df_group["Average-Kimura"], ax=ax, color=palette[anc],
                         hist=False, label=anc)

    ax.legend(ancestors)
    ax.set_ylabel("PDF")

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))

    plt.show()
def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn categorical plot, apply figure options, then save and show it.

    :param df: data passed to seaborn as ``data``
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colour grouping
    :param kind: seaborn ``catplot`` kind; for "point" the line width is set to 1
    :param figure_options: options applied to the first axes and used when saving
    :param kwargs: recognised keys are ``sns_kwargs`` (extra seaborn arguments),
        ``legend`` (default "full"; a falsy value suppresses the legend),
        ``legend_loc`` and ``legend_title``
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())

    g = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False,
                    aspect=1.5, **sns_kwargs)

    if kind == "point":
        # set lw for all lines of g axes
        plt.setp(g.ax.lines, linewidth=1)

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # The title is kept for explicit locations as well.
            plt.legend(loc=legend_loc, title=title)

    save_figure(figure_options)
    plt.show()
def scatterplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn scatter plot on the requested axes, then save and show it.

    :param df: data passed to seaborn as ``data``
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colour grouping
    :param figure_options: options applied to the axes and used when saving
    :param kwargs: recognised keys are ``sns_kwargs`` (extra seaborn arguments),
        ``ax`` (axes to draw on; a new figure is created when absent),
        ``identity`` (add a dashed red identity line), ``legend``,
        ``legend_loc`` and ``legend_title``
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    identity = get_value(kwargs, "identity", False)

    if ax is None:
        _, ax = plt.subplots()

    # Draw on `ax` explicitly so a caller-supplied axes is honoured even when
    # it is not matplotlib's current axes.
    sns.scatterplot(x=x, y=y, hue=hue, data=df, linewidth=0, ax=ax, **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            ax.legend(loc=legend_loc, title=title)

    # Save the figure that owns `ax`, which may differ from the current figure.
    save_figure(figure_options, ax.get_figure())
    plt.show()
def tsplot(df, x, y, hue=None, figure_options=None, **kwargs):
    """Draw ``df[y]`` against ``df[x]`` with seaborn's ``tsplot``, then save and show.

    NOTE(review): ``tsplot`` was deprecated in seaborn 0.9 and is missing from
    later releases — confirm the seaborn version this project pins.

    :param df: data frame holding the two columns
    :param x: column whose values are passed as the time argument
    :param y: column whose values are passed as the data argument
    :param hue: when not None a legend is placed to the right of the axes;
        the value itself is not forwarded to seaborn
    :param figure_options: options applied to the axes and used when saving
    :param kwargs: ``sns_kwargs`` holds extra arguments forwarded to seaborn
    """
    _, ax = plt.subplots()
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())

    sns.tsplot(df[y].values, df[x].values, **sns_kwargs)

    if hue is not None:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    # Apply the requested labels/limits to the axes created above, as every
    # other wrapper here does; they were previously ignored.
    FigureOptions.set_properties_for_axis(ax, figure_options)

    save_figure(figure_options)
    plt.show()
def distplot(df, x, figure_options=None, **kwargs):
    """Draw the distribution of ``df[x]`` on a new figure, then save and show it.

    :param df: data frame holding the column
    :param x: name of the column to plot
    :param figure_options: options applied to the axes and used when saving
    :param kwargs: ``sns_kwargs`` holds extra arguments forwarded to seaborn's
        ``distplot``; ``kde`` defaults to True and ``bins`` to 50 unless they
        are given there
    """
    plt.subplots()

    # Copy before filling in defaults so the caller's dict is left untouched.
    plot_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    plot_kwargs.setdefault("kde", True)
    # Defaulting here (rather than passing bins=50 directly) lets a caller
    # override the bin count without a duplicate-keyword error.
    plot_kwargs.setdefault("bins", 50)

    g = sns.distplot(df[x], **plot_kwargs)

    FigureOptions.set_properties_for_axis(g.axes, figure_options)
    save_figure(figure_options)
    plt.show()
def barplot(df, x, y, hue, figure_options=None, **kwargs):
    """Draw a seaborn bar plot, apply figure options, then save and show it.

    :param df: data passed to seaborn as ``data``
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: column name used for colour grouping, or None for no grouping
        (a legend is placed to the right of the axes only when it is not None)
    :param figure_options: options applied to the returned axes and used when saving
    :param kwargs: ``sns_kwargs`` holds extra seaborn arguments; ``ax`` is the
        axes forwarded to seaborn (None when absent)
    """
    seaborn_options = get_value(kwargs, "sns_kwargs", dict())
    target_ax = get_value(kwargs, "ax", None)

    bar_axes = sns.barplot(
        data=df,
        x=x,
        y=y,
        hue=hue,
        ax=target_ax,
        **seaborn_options
    )

    grouped = hue is not None
    if grouped:
        plt.legend(bbox_to_anchor=(1.05, 0.5), loc='center left')

    FigureOptions.set_properties_for_axis(bar_axes, figure_options)

    # Layout is tightened before saving so the saved file matches what is shown.
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()
def kdeplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a kernel density estimate on a new figure, then save and show it.

    :param df: data frame holding the column(s)
    :param x: name of the first column
    :param y: name of the second column, or None to pass only ``df[x]``
    :param hue: when not None a legend is placed to the right of the axes;
        the value itself is not forwarded to seaborn
    :param figure_options: options applied to the new axes and used when saving
    :param kwargs: ``sns_kwargs`` holds extra arguments forwarded to seaborn
    """
    seaborn_options = get_value(kwargs, "sns_kwargs", dict())

    _, axes = plt.subplots()

    if y is None:
        second_variable = None
    else:
        second_variable = df[y]

    sns.kdeplot(df[x], second_variable, legend=False, **seaborn_options)

    if hue is not None:
        plt.legend(bbox_to_anchor=(1.05, 0.5), loc='center left')

    FigureOptions.set_properties_for_axis(axes, figure_options)
    save_figure(figure_options)
    plt.show()
def contour_kimura_per_ancestor(env, df):
    """Draw one Min-Kimura vs Max-Kimura density panel per ancestor.

    Panels are laid out in a two-row grid with shared axes. A frameless
    full-figure axes with hidden ticks carries the common axis labels. The
    figure is saved under a fresh name generated from ``env["pd-work"]`` and
    then shown.

    :param env: mapping providing the "pd-work" entry used to name the output
    :param df: data frame with "Ancestor", "Min-Kimura" and "Max-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = sorted(set(df["Ancestor"]))
    num_columns = math.ceil(len(ancestors) / 2)

    fig, axes = plt.subplots(2, num_columns, sharex=True, sharey=True,
                             figsize=(6, 6))

    for panel, anc in zip(axes.ravel(), ancestors):
        rows = df[df["Ancestor"] == anc]
        min_kimura = rows["Min-Kimura"].values
        max_kimura = rows["Max-Kimura"].values

        seaborn.kdeplot(min_kimura, max_kimura, ax=panel)
        panel.set_title(anc)

    # Frameless overlay axes: only used to hold the shared x/y labels.
    fig.add_subplot(111, frameon=False)
    hidden = dict(top=False, bottom=False, left=False, right=False,
                  labelbottom=False, labeltop=False, labelleft=False,
                  labelright=False)
    plt.tick_params(which="both", **hidden)

    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()

    output = FigureOptions(save_fig=next_name(env["pd-work"]))
    save_figure(output)

    plt.show()
def lmplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> Any
    """Draw a seaborn ``lmplot``, apply figure options, save, show and return the grid.

    :param df: data passed to seaborn as ``data``
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colour grouping
    :param figure_options: options applied to the first axes and used when saving
    :param kwargs: recognised keys are ``sns_kwargs`` (extra seaborn arguments;
        ``aspect`` defaults to 2 when not given there), ``legend`` (default
        "full"; a falsy value suppresses the legend), ``legend_loc`` and
        ``legend_title``
    :return: the object returned by ``sns.lmplot``
    """
    # Copy before filling in defaults so the caller's dict is left untouched.
    plot_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    plot_kwargs.setdefault("aspect", 2)

    g = sns.lmplot(x=x, y=y, hue=hue, data=df, legend=False, **plot_kwargs)

    main_ax = g.axes[0][0]
    FigureOptions.set_properties_for_axis(main_ax, figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            main_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # The title is kept for explicit locations as well.
            main_ax.legend(loc=legend_loc, title=title)

    # The figure is saved before the right margin is adjusted for display.
    save_figure(figure_options, fig=g.fig)
    plt.subplots_adjust(right=1)
    plt.show()
    return g
def lineplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn line plot on the requested axes and optionally save/show it.

    :param df: data passed to seaborn as ``data``
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colour grouping
    :param figure_options: options applied to the axes and used when saving
    :param kwargs: recognised keys are ``sns_kwargs`` (extra seaborn arguments),
        ``ax`` (axes to draw on; a new figure is created when absent),
        ``show`` (save and show the figure; defaults to True only when no
        ``ax`` was given), ``legend`` (forwarded to seaborn, default "full"),
        ``legend_loc``, ``legend_ncol`` (default 1), ``legend_title`` and
        ``identity`` (add a dashed red identity line)
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    show = get_value(kwargs, "show", ax is None)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    legend_ncol = get_value(kwargs, "legend_ncol", 1)
    identity = get_value(kwargs, "identity", False)

    if not ax:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()

    sns.lineplot(x=x, y=y, hue=hue, data=df, ax=ax, legend=legend, **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)

    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # Use ax.legend rather than plt.legend: with a caller-supplied axes
        # the pyplot "current axes" may be a different panel.
        if not legend_loc:
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title,
                      ncol=legend_ncol)
        else:
            ax.legend(loc=legend_loc, ncol=legend_ncol, title=title)

        if title is not None and len(title) == 0:
            # An empty title requests a legend without its first entry.
            # NOTE(review): this rebuild passes neither loc nor
            # bbox_to_anchor, so the placement chosen above is replaced by
            # matplotlib's default — confirm that is intended.
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[1:], labels=labels[1:], ncol=legend_ncol)

    if show:
        save_figure(figure_options, fig)
        plt.show()
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one ``loess_with_stde`` panel per 5' tag in a two-row grid.

    For every tag returned by ``get_tags_for_5prime(df)`` the column
    ``"M:<tag>"`` is plotted against "GC" on its own axes. Only the first
    panel is given the shared colour-bar axes. The figure is saved under a
    fresh name generated from ``env["pd-work"]`` and then shown.

    :param env: mapping providing the "pd-work" entry used to name the output
    :param df: data frame with a "GC" column and one "M:<tag>" column per tag
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)

    fig, ax = plt.subplots(2, math.ceil(num_tags / 2), sharey="all", sharex="all")
    # Dedicated axes on the right edge of the figure for the colour bar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    cbar_enable = {
        "cbar_ax": cbar_ax,
        "cbar": True,
    }

    # Pair tags with axes directly. Zipping with a fixed list of four
    # (unused) colours used to drop every tag after the fourth.
    for counter, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        label = tag.replace("=", ",")
        cbar_kws = cbar_enable if counter == 0 else dict()

        loess_with_stde(df, "GC", f"M:{tag}", a, label, **kws, **cbar_kws)

        a.set_title(label.replace("NCBI", "PGAP").replace("GMS2", "GeneMarkS-2"))
        # Per-panel labels are blanked; shared labels are added below.
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Frameless overlay axes: only used to hold the shared x/y labels.
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False, bottom=False, left=False, right=False, which="both",
                    labelbottom=False, labeltop=False, labelleft=False, labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    # Leave the right-hand strip of the figure free for the colour bar.
    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])

    save_figure(figure_options, fig)
    plt.show()
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot how upstream-gene distance relates to consistency and to NCBI disagreement.

    Figures produced through the ``sns`` wrappers and the explicit
    ``save_figure`` calls are written under ``<env["pd-work"]>/upstream_distances``;
    several later figures are only shown.

    Steps, in order:
      1. Parse the "Upstream-distance" column (string literals) into Python
         objects and derive "Most frequent upstream" plus the "PC(x,0)" and
         "PC(x,3)" columns via ``compute_consistency``.
      2. Plot PC against the most frequent upstream distance, per flexibility
         and per ancestor.
      3. Plot per-ancestor distributions of the most frequent upstream distance.
      4. For fixed distance ranges, plot the "(GMS2=SBSP)!=NCBI % GMS2=SBSP"
         percentage and the "GMS2=SBSP" counts per ancestor.

    :param env: mapping providing the "pd-work" entry (base output directory)
    :param df: data frame with at least the columns "Upstream-distance",
        "Support", "Ancestor", "GMS2=SBSP", "NCBI" and "(GMS2=SBSP)!=NCBI"
    """
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists (rows whose stored literal is exactly "[]")
    df = df[df["Upstream-distance"] != "[]"].copy()
    # The column holds string literals; turn them into Python objects.
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    # NOTE(review): the "Support" > 10 condition repeats the filter applied
    # just above; df_tmp is taken BEFORE the "GMS2=SBSP" restriction below.
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]

    # NCBI consistency as a func
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    # Long format: one row per (component, flexibility) with the value in
    # "PC(x,f)" — presumably; verify against stack_columns_as_rows.
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)", "Ancestor"]],
        ["PC(x,0)", "PC(x,3)"], "PC(x,f)", None, label_col="Flexibility")

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    import seaborn

    # Percentage of components per upstream distance, normalised within each
    # ancestor, for distances strictly between -10 and 10.
    (df[(df["Most frequent upstream"] < 10) &
        (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # Same plot with raw counts instead of within-ancestor percentages.
    (df[(df["Most frequent upstream"] < 10) &
        (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # Histogram (left y axis) and KDE (right y axis, ticks hidden) per
    # ancestor on a shared x axis. This figure is shown but not saved.
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')

    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity
    # First pass: summarise per GCFID within each half-open distance range
    # [lo, hi), then average those summaries per ancestor.
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])

        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()
    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            # Rows where GMS2 and SBSP agree and the "NCBI" column is truthy.
            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            # ...of those, rows flagged as disagreeing with NCBI.
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # Percentage of disagreeing rows among the selected rows.
            # NOTE(review): the denominator can be 0 for a sparse
            # (range, ancestor) group — confirm how that should be handled.
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        # Tick positions (range midpoints) and labels for the plot below.
        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # One row of axes per ancestor: percentage on the left y axis, counts
    # (red) on a twin right y axis.
    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg",
                         "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot("range_avg",
                         "GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")
    plt.xticks(range_avgs, range_label)
    plt.show()

    # All ancestors on one axes pair (hue = ancestor).
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg",
                     "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot("range_avg",
                     "GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")

    # Colour each y label like the series it describes.
    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    # NOTE(review): this overrides the "Range Average" x label set just above.
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()

    # NOTE(review): the assignment below has no effect on the result.
    a = 0
def heat_map_Kimura_accuracy(env, df_all, x, y, num_steps=20, balance=False):
    # type: (Environment, pd.DataFrame, str, str, int, bool) -> None
    """Draw one accuracy heat map per ancestor over a ``num_steps`` x ``num_steps`` grid.

    For each ancestor the rows are binned by their ``x`` and ``y`` values.
    A cell's value is the number of rows with a truthy "GMS2=SBSP=NCBI"
    divided by the number of rows with truthy "GMS2=SBSP" and "NCBI". Cells
    with fewer than 10 rows in the denominator are left empty (nan). All
    panels share one colour bar on the right. The figure is saved under a
    fresh name generated from ``env["pd-work"]`` and then shown.

    :param env: mapping providing the "pd-work" entry used to name the output
    :param df_all: data frame with "Ancestor", "GMS2=SBSP", "NCBI",
        "GMS2=SBSP=NCBI" and the two columns named by ``x`` and ``y``
    :param x: column binned along the horizontal axis
    :param y: column binned along the vertical axis
    :param num_steps: number of bins per axis
    :param balance: when True both axes use the same value range (the union
        of the x and y ranges)
    """
    import numpy as np
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = sorted(set(df_all["Ancestor"]))
    fig, axes = plt.subplots(2, math.ceil(len(ancestors) / 2), sharex=True, sharey=True)
    # Dedicated axes on the right edge of the figure for the colour bar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    # At most ~5 labelled ticks per axis; the step must never be 0, which
    # would make range() raise for num_steps < 5.
    tick_step = max(1, num_steps // 5)
    xticks = list(range(0, num_steps, tick_step))
    yticks = list(range(0, num_steps, tick_step))
    last_bin = num_steps - 1

    ax = None
    for ax, (ancestor, df) in zip(axes.ravel(), df_all.groupby("Ancestor", as_index=False)):
        # The small epsilon keeps the maximum value inside the last bin.
        min_x = min(df[x])
        max_x = max(df[x]) + 0.000000001
        min_y = min(df[y])
        max_y = max(df[y]) + 0.000000001

        if balance:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)

        ss_x = (max_x - min_x) / float(num_steps)
        ss_y = (max_y - min_y) / float(num_steps)

        gms2_eq_sbsp_and_ncbi = np.zeros([num_steps, num_steps], dtype=float)
        gms2_eq_sbsp_eq_ncbi = np.zeros([num_steps, num_steps], dtype=float)

        # Iterate column values in lockstep; unlike label-based df.at lookups
        # this also works when the index contains duplicate labels.
        rows = zip(df[x], df[y], df["GMS2=SBSP"], df["NCBI"], df["GMS2=SBSP=NCBI"])
        for x_val, y_val, gms2_eq_sbsp, ncbi, all_equal in rows:
            # Clamp so float rounding can never index past the last bin.
            x_pos = min(int((x_val - min_x) / ss_x), last_bin)
            y_pos = min(int((y_val - min_y) / ss_y), last_bin)

            if gms2_eq_sbsp and ncbi:
                gms2_eq_sbsp_and_ncbi[x_pos][y_pos] += 1
            if all_equal:
                gms2_eq_sbsp_eq_ncbi[x_pos][y_pos] += 1

        # Discard cells with too little support ...
        gms2_eq_sbsp_and_ncbi[gms2_eq_sbsp_and_ncbi < 10] = 0
        # ... and leave them as nan. A plain division would turn discarded
        # cells with a non-zero numerator into inf and emit divide warnings.
        accuracy = np.full_like(gms2_eq_sbsp_eq_ncbi, np.nan)
        np.divide(gms2_eq_sbsp_eq_ncbi, gms2_eq_sbsp_and_ncbi, out=accuracy,
                  where=gms2_eq_sbsp_and_ncbi > 0)

        # Label the selected ticks with the lower edge of their bin.
        l_x = np.arange(min_x, max_x, ss_x)
        l_y = np.arange(min_y, max_y, ss_y)
        xticklabels = [round(l_x[i], 2) for i in xticks]
        yticklabels = [round(l_y[i], 2) for i in yticks]

        # Transposed so that x runs along the horizontal axis.
        g = seaborn.heatmap(accuracy.transpose(), vmin=0, vmax=1,
                            xticklabels=xticklabels, yticklabels=yticklabels,
                            ax=ax, cbar=False)

        g.invert_yaxis()
        g.set_xticks(xticks)
        g.set_yticks(yticks)
        g.set_xticklabels(xticklabels, rotation=0)
        g.set_title(ancestor)

    # The colour bar is built from the last panel drawn; every panel uses the
    # same vmin/vmax.
    mappable = ax.collections[0]

    # Frameless overlay axes: only used to hold the shared x/y labels.
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False, bottom=False, left=False, right=False, which="both",
                    labelbottom=False, labeltop=False, labelleft=False, labelright=False)
    plt.xlabel(x, labelpad=20)
    plt.ylabel(y, labelpad=30)

    plt.colorbar(mappable, cax=cbar_ax)
    # Leave the right-hand strip of the figure free for the colour bar.
    fig.tight_layout(rect=[0, 0, .9, 1])

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))

    plt.show()
def visualize_matrix_column(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Embed the matrices stored in ``col`` with UMAP and plot the embedding three ways.

    Rows where ``col`` is NA are dropped first. The array returned by
    ``create_numpy_for_column`` is flattened to two dimensions and fitted with
    ``umap.UMAP(random_state=0)``. The fitted points are then plotted coloured
    by "GC" (with a colour bar), labelled by "GENOME_TYPE", and labelled by
    "Type". Each figure is titled with ``col``, saved under a fresh name
    generated from ``env["pd-work"]`` and shown.

    :param env: mapping providing the "pd-work" entry used to name the outputs
    :param df: data frame with ``col``, "GC", "GENOME_TYPE" and "Type" columns
    :param col: name of the column holding the matrices
    """

    def _title_save_show():
        # Shared tail of every figure: title, layout, save, show.
        plt.title(col)
        plt.tight_layout()
        save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
        plt.show()

    # we only need non-NA
    has_value = ~df[col].isna()
    df = df[has_value]

    fp = FontProperties()
    fp.set_family("monospace")

    # Flatten the last two axes so each row becomes one feature vector.
    mat = create_numpy_for_column(df, col)
    num_rows, num_features = mat.shape[0], mat.shape[1] * mat.shape[2]
    mat = mat.reshape((num_rows, num_features))

    # features used to colour/label the embedding
    gc = df["GC"]
    group = df["GENOME_TYPE"]

    seed = 0
    reducer = umap.UMAP(random_state=seed).fit(mat)
    print(reducer.embedding_.shape)

    # 1) continuous colouring by GC, with a matching colour bar
    umap.plot.points(reducer, values=gc, cmap="viridis")
    plt.colorbar(create_mappable_for_colorbar(gc, "viridis"))
    _title_save_show()

    # 2) categorical labelling by genome type
    umap.plot.points(reducer, labels=group.values, color_key_cmap="Paired")
    _title_save_show()

    # 3) categorical labelling by the "Type" column
    umap.plot.points(reducer, labels=df["Type"])
    _title_save_show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot agreement probabilities for the random/independent/fully-dependent models.

    The table comes from ``compute_data`` and is plotted through the ``sns``
    line-plot wrapper: "Probability" and "1 - Probability" against the number
    of candidates, two-panel figures for "Probability" and for "Agree given
    prediction", and one plot of the independent model for sensitivities
    0.1 .. 0.9. Output names come from ``next_name(".")``.

    NOTE(review): ``sen_a`` and ``sen_b`` are only forwarded to
    ``plot_sensitivities_vs_num_candidates``; the filters below are fixed at
    sensitivity 0.9 — confirm that is intended.

    :param max_candidates: forwarded to ``compute_data`` and
        ``plot_sensitivities_vs_num_candidates``
    :param sen_a: sensitivity of the first algorithm
    :param sen_b: sensitivity of the second algorithm
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    def condition_palette():
        # A fresh dict per call, so no plot shares kwargs with another.
        return {"palette": CM.get_map("independence-conditions")}

    df = compute_data(sensitivities, agree_given_pred, max_candidates)
    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a, sen_b)

    # Row selections reused by several plots. They are boolean masks, so
    # columns added to df later are still included when a mask is applied.
    both_sen_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sen = df["Sensitivity A"] == df["Sensitivity B"]
    equal_sen_25_candidates = equal_sen & (df["Number of candidates"] == 25)

    sns.lineplot(
        df[both_sen_09],
        "Number of candidates", "Probability", hue="Condition",
        sns_kwargs=condition_palette(),
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y=s|x_1=y, x_2=y)$",
        ))

    # error
    df["1 - Probability"] = 1 - df["Probability"]
    sns.lineplot(
        df[both_sen_09],
        "Number of candidates", "1 - Probability", hue="Condition",
        sns_kwargs=condition_palette(),
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y\neq s|x_1=y, x_2=y)$",
        ))

    # Two panels: probability vs number of candidates, and vs sensitivity.
    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    sns.lineplot(df[both_sen_09],
                 "Number of candidates", "Probability", hue="Condition",
                 sns_kwargs=condition_palette(),
                 ax=axes[0],
                 legend=False,
                 figure_options=FigureOptions(title="Sensitivity = 0.9", ))

    sns.lineplot(df[equal_sen_25_candidates],
                 "Sensitivity A", "Probability", hue="Condition",
                 ax=axes[1],
                 sns_kwargs=condition_palette(),
                 figure_options=FigureOptions(
                     ylim=[0, 1.05],
                     xlim=[0, 1],
                     xlabel="Sensitivity",
                     title="Number of candidates = 25",
                 ))

    save_figure(FigureOptions(save_fig=next_name(".")), fig)
    plt.show()

    # rename() returns a new frame; renaming a filtered slice in place
    # triggers pandas' chained-assignment warning.
    df_tmp = df[equal_sen
                & (df["Condition"] == "Independent")
                & (df["Sensitivity A"].isin(
                    {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))
                ].rename(columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        df_tmp,
        "Number of candidates", "Probability", hue="Sensitivity",
        figure_options=FigureOptions(
            title="Independent algorithms",
            save_fig=next_name(".")),
    )

    # Same two-panel layout for the probability that the algorithms agree.
    fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))
    sns.lineplot(df[both_sen_09],
                 "Number of candidates", "Agree given prediction", hue="Condition",
                 sns_kwargs=condition_palette(),
                 ax=axes[0],
                 legend=False,
                 figure_options=FigureOptions(title="Sensitivity = 0.9", ))

    sns.lineplot(df[equal_sen_25_candidates],
                 "Sensitivity A", "Agree given prediction", hue="Condition",
                 ax=axes[1],
                 sns_kwargs=condition_palette(),
                 figure_options=FigureOptions(
                     ylim=[0, 1.05],
                     xlim=[0, 1],
                     xlabel="Sensitivity",
                     title="Number of targets = 25",
                 ))

    save_figure(FigureOptions(save_fig=next_name(".")), fig)
    plt.show()