def triu_plot(x, y, color, label, **kwargs):
    """Scatter ``y`` against ``x`` coloured by interpolated density.

    The points are coloured with the values returned by
    ``QCplot.density_interpolate(x, y)`` and drawn in ascending order of
    that value. Dotted reference lines are added at ``x = 0``, ``y = 0``
    and along the identity diagonal over the range shared by both axes.

    Parameters
    ----------
    x, y :
        Coordinates to plot. They must support indexing with the integer
        array produced by ``argsort``.
    color, label :
        Accepted but not used by this function.
        NOTE(review): presumably present so the function fits a seaborn
        grid-mapping call signature - confirm against the caller.
    **kwargs :
        Forwarded unchanged to ``plt.scatter``.
    """
    density = QCplot.density_interpolate(x, y)

    # Reorder so the highest-valued points are drawn last (on top).
    draw_order = density.argsort()
    x, y, density = x[draw_order], y[draw_order], density[draw_order]

    plt.scatter(x, y, c=density, **kwargs)

    # All three reference lines share the same faint dotted style.
    guide_style = dict(ls=":", lw=0.1, c="#484848", zorder=0)
    plt.axhline(0, **guide_style)
    plt.axvline(0, **guide_style)

    # Identity line restricted to the interval covered by both axes.
    x_limits, y_limits = plt.xlim(), plt.ylim()
    diagonal = [max(x_limits[0], y_limits[0]), min(x_limits[1], y_limits[1])]
    plt.plot(diagonal, diagonal, **guide_style)
def downsample_sgrnas(
    counts,
    lib,
    manifest,
    n_guides_thresholds,
    n_iters=10,
    plasmids=None,
    gene_col="Gene",
):
    """Score gene fold-changes after randomly down-sampling guides per gene.

    For every value in ``n_guides_thresholds`` and for ``n_iters``
    repetitions, at most ``n_guides`` sgRNAs are drawn at random for each
    gene (genes with ``n_guides`` or fewer guides keep all of them).
    Fold-changes are computed from the selected rows of ``counts``,
    averaged per gene and then per ``manifest["model_id"]`` group, and the
    first element of ``QCplot.aroc_threshold(..., fpr_thres=0.2)`` is
    recorded for every resulting column.

    Parameters
    ----------
    counts :
        Count matrix indexed by sgRNA; the row subset must provide
        ``norm_rpm()`` followed by ``foldchange(plasmids)``.
    lib :
        Guide library table. After ``reset_index()`` it must expose the
        columns ``"sgRNA_ID"`` and ``gene_col``.
    manifest :
        Table with a ``"model_id"`` column used to group sample columns.
    n_guides_thresholds :
        Iterable of per-gene guide caps to evaluate.
    n_iters :
        Number of random repetitions per cap.
    plasmids :
        Control sample names passed to ``foldchange``; defaults to
        ``["CRISPR_C6596666.sample"]``.
    gene_col :
        Name of the gene column in ``lib``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of one frame per (cap, repetition) with columns
        ``sample``, ``aroc`` and ``n_guides``. The repetition number is
        only logged, not stored.

    Notes
    -----
    ``DataFrame.sample`` is called without ``random_state``, so the draw
    is not seeded by this function.
    """
    if plasmids is None:
        plasmids = ["CRISPR_C6596666.sample"]

    # Guides library: only guides present in the counts, with a known gene
    present_in_counts = lib.index.isin(counts.index)
    guide_table = lib[present_in_counts].reset_index()[["sgRNA_ID", gene_col]]
    guides_by_gene = guide_table.dropna().groupby(gene_col)

    results = []

    for n_guides in n_guides_thresholds:
        for iteration in range(n_iters):
            LOG.info(f"Number of sgRNAs: {n_guides}; Iteration: {iteration + 1}")

            # Downsample randomly guides per gene
            selected = []
            for _, gene_guides in guides_by_gene:
                if gene_guides.shape[0] > n_guides:
                    gene_guides = gene_guides.sample(n=n_guides)
                selected.append(gene_guides)
            sgrnas = pd.concat(selected).set_index("sgRNA_ID")

            # sgRNAs fold-change
            sgrnas_fc = counts.loc[sgrnas.index].norm_rpm().foldchange(plasmids)

            # Genes fold-change, then averaged within each model
            genes_fc = sgrnas_fc.groupby(sgrnas[gene_col]).mean()
            genes_fc = genes_fc.groupby(manifest["model_id"], axis=1).mean()

            # AROC per sample column
            rows = [
                {
                    "sample": s,
                    "aroc": QCplot.aroc_threshold(genes_fc[s], fpr_thres=0.2)[0],
                }
                for s in genes_fc
            ]
            results.append(pd.DataFrame(rows).assign(n_guides=n_guides))

    return pd.concat(results)
def guides_recall_benchmark(
    metrics,
    sgrna_counts,
    dataset,
    smap,
    nguides_thres=None,
    jacks_thres=1.0,
    fpr_thres=0.01,
):
    """Benchmark guide-selection metrics by recall/precision of binarised genes.

    For each metric (``"KS"``, ``"JACKS_min"`` and ``"combined"``) and each
    cap in ``nguides_thres``, the top ``n`` guides per ``Approved_Symbol``
    are selected, gene-level fold-changes are computed from those guides
    only, binarised per sample with the threshold returned by
    ``QCplot.aroc_threshold``, and compared against the reference
    binarisation ``ky_bin``.

    Guide ranking:

    * ``"KS"`` - sorted by ``KS`` descending.
    * ``"JACKS_min"`` - sorted by ``JACKS_min`` ascending.
    * ``"combined"`` - guides with ``JACKS_min < jacks_thres`` only, sorted
      by ``KS`` descending.

    Parameters
    ----------
    metrics :
        Guide-level table indexed by sgRNA with columns ``KS``,
        ``JACKS_min`` and ``Approved_Symbol``.
    sgrna_counts :
        Count matrix indexed by sgRNA; the row subset must provide
        ``norm_rpm()`` followed by ``foldchange(...)``.
    dataset :
        Object exposing ``plasmids``, passed to ``foldchange``.
    smap :
        Table with a ``"model_id"`` column used to group sample columns.
    nguides_thres :
        Per-gene guide caps; defaults to ``[1, 2, 3, 4, 5, 100]``.
    jacks_thres :
        Upper bound on ``JACKS_min`` for the ``"combined"`` metric.
    fpr_thres :
        Passed to ``QCplot.aroc_threshold`` when deriving the per-sample
        binarisation threshold.

    Returns
    -------
    pandas.DataFrame
        One row per (metric, cap, sample) with columns ``sample``,
        ``metric``, ``nguides``, ``ess_aroc``, ``recall`` and
        ``precision``.

    Notes
    -----
    This function reads the module-level name ``ky_bin`` (the reference
    binarised matrix), so it must be defined before the call, and it must
    contain every gene and sample produced here.
    """
    nguides_thres = [1, 2, 3, 4, 5, 100] if nguides_thres is None else nguides_thres

    # Define set of guides
    LOG.info(f"#(sgRNAs)={metrics.shape[0]}")

    # AROC scores
    scores = []

    for m in ["KS", "JACKS_min", "combined"]:
        LOG.info(f"Metric = {m}")

        for n in nguides_thres:
            # Metric top guides
            if m == "combined":
                metric_topn = (
                    metrics.query(f"JACKS_min < {jacks_thres}")
                    .sort_values("KS", ascending=False)
                    .groupby("Approved_Symbol")
                    .head(n=n)
                )
            else:
                # Lower JACKS_min is better; higher KS is better.
                metric_topn = (
                    metrics.sort_values(m, ascending=(m == "JACKS_min"))
                    .groupby("Approved_Symbol")
                    .head(n=n)
                )

            # Calculate fold-changes on subset
            metric_fc = sgrna_counts.loc[metric_topn.index]
            metric_fc = metric_fc.norm_rpm().foldchange(dataset.plasmids)
            metric_fc = metric_fc.groupby(metrics["Approved_Symbol"]).mean()
            metric_fc = metric_fc.groupby(smap["model_id"], axis=1).mean()

            # Binarise fold-changes with one threshold per sample column
            metric_thres = [
                QCplot.aroc_threshold(metric_fc[s], fpr_thres=fpr_thres)[1]
                for s in metric_fc
            ]
            metric_bin = (metric_fc < metric_thres).astype(int)

            # Use an ordered list of unique labels rather than a set: pandas
            # deprecated set indexers for .loc in 1.5 and rejects them with a
            # TypeError from 2.0 onwards.
            genes = metric_bin.index.unique().tolist()
            samples = set(metric_bin)
            LOG.info(f"Genes:{len(genes)}; Samples:{len(samples)}")

            # Evaluation against the reference binarisation (module-level ky_bin)
            metric_recalls = pd.DataFrame(
                [
                    dict(
                        sample=s,
                        metric=m,
                        nguides=n,
                        ess_aroc=QCplot.aroc_threshold(
                            metric_fc.loc[genes, s], fpr_thres=0.2
                        )[0],
                        recall=recall_score(
                            ky_bin.loc[genes, s], metric_bin.loc[genes, s]
                        ),
                        precision=precision_score(
                            ky_bin.loc[genes, s], metric_bin.loc[genes, s]
                        ),
                    )
                    for s in metric_bin
                ]
            )

            # Store
            scores.append(metric_recalls)

    return pd.concat(scores)
# Absolute distance of each guide's JACKS value from 1
master_lib["JACKS_min"] = (master_lib["JACKS"] - 1).abs()

# Keep only the guides that are also rows of the KY count matrix
master_lib = master_lib.loc[master_lib.index.isin(ky_counts.index)]


# Project Score KY v1.1: Fold-changes
#

FDR_THRES = 0.01

# sgRNA-level fold-changes relative to the plasmid controls
ky_sgrna_fc = ky_counts.loc[master_lib.index].norm_rpm().foldchange(ky.plasmids)

# Mean per gene, then mean across the columns sharing a model_id
ky_fc = ky_sgrna_fc.groupby(master_lib["Approved_Symbol"]).mean()
ky_fc = ky_fc.groupby(ky_smap["model_id"], axis=1).mean()

# One binarisation threshold per column, applied column-wise below
ky_thres = [
    QCplot.aroc_threshold(ky_fc[s], fpr_thres=FDR_THRES)[1] for s in ky_fc.columns
]
ky_bin = (ky_fc < ky_thres).astype(int)


# Benchmark sgRNA: Essential/Non-essential AROC
#

metrics_recall = guides_recall_benchmark(
    master_lib,
    ky_counts,
    ky,
    ky_smap,
    jacks_thres=1.0,
    fpr_thres=FDR_THRES,
)
metrics_recall.to_excel(
    f"{RPATH}/KosukeYusa_v1.1_benchmark_recall.xlsx",
    index=False,
)
# Export the KY counts with two leading annotation columns. Both inserts
# target position 0, so "Approved_Symbol" ends up before "MinLibCas9_guide".
data_export = ky.counts.copy()
data_export.insert(
    loc=0,
    column="MinLibCas9_guide",
    value=data_export.index.isin(minlibcas9.index),
)
data_export.insert(
    loc=0,
    column="Approved_Symbol",
    value=kylib.reindex(data_export.index)["Approved_Symbol"],
)
data_export.to_excel(f"{RPATH}/GuideCoverage_export_data.xlsx")


# Essential genes AROC
#

for l in libraries:
    # First element of aroc_threshold for every column of this library's "fc"
    libraries[l]["aurc"] = pd.Series(
        {
            c: QCplot.aroc_threshold(libraries[l]["fc"][c], fpr_thres=0.2)[0]
            for c in libraries[l]["fc"]
        }
    )


# Replicates correlation
#

for l in libraries:
    # Columns are relabelled through ky_ss["name"] before correlating
    libraries[l]["reps"] = replicates_correlation(
        libraries[l]["fc_rep"].rename(columns=ky_ss["name"]),
        method="spearman",
    )


# Essential genes AURC
]: plot_df = pd.concat( [ mlib[["Library", mtype]], minlib[[mtype ]].assign(Library="MinLibCas9") ], ignore_index=True, ).dropna() fig, ax = plt.subplots(1, 1, figsize=(1.5, 2), dpi=600) QCplot.bias_boxplot( plot_df, x="Library", y=mtype, add_n=False, tick_base=None, order=order, draw_violin=True, ax=ax, ) ax.set_xticklabels(order, rotation=45, ha="right") ax.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="y") ax.set_xlabel("") ax.set_ylabel(f"{mtype.split('_')[1]}") plt.savefig( f"{RPATH}/lib_metrics_library_boxplot_{mtype}.pdf", bbox_inches="tight",
# sgRNA fold-changes (rows restricted to the guides of the current library)
fc = ky_counts.copy().loc[lib.index]
fc = fc.norm_rpm().foldchange(ky.plasmids)

# Gene fold-changes
fc_gene = fc.groupby(lib["Approved_Symbol"]).mean()

# Gene average fold-changes (columns grouped by model_id)
fc_gene_avg = fc_gene.groupby(ky_smap["model_id"], axis=1).mean()

# Gene average scaled fold-changes
fc_gene_avg_scl = ReadCounts(fc_gene_avg).scale()

# FDR threshold: one row per sample with the two values returned by
# aroc_threshold at fdr_thres, labelled "auc" and "thres"
fpr = pd.DataFrame(
    {
        s: QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=fdr_thres)
        for s in fc_gene_avg
    },
    index=["auc", "thres"],
).transpose()

# The "auc" column is then overwritten with the value obtained at fpr_thres=0.2
fpr["auc"] = [
    QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=0.2)[0] for s in fpr.index
]

# Gene binarised: 1 where the fold-change is below the sample's threshold
fc_gene_avg_bin = fc_gene_avg[fpr.index].lt(fpr["thres"]).astype(int)

# Store
libraries[ltype]["fc"] = fc
libraries[ltype]["fc_gene"] = fc_gene
) for i, mtype in enumerate(row_order): plot_df = pd.concat( [polyt_pos_df, master_lib.loc[polyt_pos_df.index, mtype]], axis=1) for j, p in enumerate(polyt): ax = axs[i][j] order = natsorted(set(plot_df[p].dropna().astype(int))) QCplot.bias_boxplot( plot_df[plot_df["polyt5"].isnull()] if p == "polyt4" else plot_df[~plot_df["polyt5"].isnull()], x="polyt4", y=mtype, add_n=True, tick_base=None, order=order, ax=ax, ) ax.set_xticklabels(ax.get_xticklabels() if i == (len(row_order) - 1) else [], rotation=0, horizontalalignment="right") ax.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="y") if mtype == "JACKS": ax.axhline(1, ls="-",
plt.close("all") # Recall gene lists # gsets_aucs = {} for n, gset in [ ("essential", Utils.get_essential_genes()), ("non-essential", Utils.get_non_essential_genes()), ]: # Aroc plt.figure(figsize=(2, 2), dpi=600) ax = plt.gca() _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples], gset, palette=sample_pal, legend_prop={"size": 4}, ax=ax) plt.title(f"{n} recall curve") plt.xlabel("Percent-rank of genes") plt.ylabel("Cumulative fraction") plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both") plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf", bbox_inches="tight") plt.close("all") # Barplot df = pd.Series(stats_ess["auc"])[samples].rename("auc").reset_index() plt.figure(figsize=(3, 0.15 * len(samples)), dpi=600) sns.barplot(
# sgRNA fold-changes (rows restricted to the guides of the current library)
fc = ky_counts.copy().loc[lib.index]
fc = fc.norm_rpm().foldchange(ky.plasmids)

# Gene fold-changes
fc_gene = fc.groupby(lib["Approved_Symbol"]).mean()

# Gene average fold-changes (columns grouped by model_id)
fc_gene_avg = fc_gene.groupby(ky_smap["model_id"], axis=1).mean()

# Gene average scaled fold-changes
fc_gene_avg_scl = ReadCounts(fc_gene_avg).scale()

# FDR threshold: one row per sample with the two values returned by
# aroc_threshold at FDR_THRES, labelled "auc" and "thres"
fpr = pd.DataFrame(
    {
        s: QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=FDR_THRES)
        for s in fc_gene_avg
    },
    index=["auc", "thres"],
).transpose()

# The "auc" column is then overwritten with the value obtained at fpr_thres=0.2
fpr["auc"] = [
    QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=0.2)[0] for s in fpr.index
]

# Gene binarised: 1 where the fold-change is below the sample's threshold
fc_gene_avg_bin = fc_gene_avg[fpr.index].lt(fpr["thres"]).astype(int)

# Store
libraries[ltype]["fc"] = fc
libraries[ltype]["fc_gene"] = fc_gene
libraries[ltype]["fc_gene_avg"] = fc_gene_avg
libraries["Minimal"]["fc_gene_avg"].index) LOG.info(f"Genes={len(genes)}") # Essential/non-essential AROC and AURC # metrics_arocs = [] for ltype in libraries: for s in libraries[ltype]["fc_gene_avg"]: LOG.info(f"Library={ltype}; Organoid={s}") metrics_arocs.append( dict( library=ltype, sample=s, aroc=QCplot.aroc_threshold(libraries[ltype]["fc_gene_avg"][s], fpr_thres=0.2)[0], )) metrics_arocs = pd.DataFrame(metrics_arocs) # Plot essential genes recall # pal = dict(All="#e34a33", Minimal="#fee8c8") n = libraries["All"]["fc_gene_avg"].shape[1] fig, ax = plt.subplots(1, 1, figsize=(0.6 * n, 2.0), dpi=600) sns.barplot("sample", "aroc", "library",
ky_counts = ky.counts.remove_low_counts(ky.plasmids) ky_fc = ky_counts.norm_rpm().norm_rpm().foldchange(ky.plasmids) ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True) master_lib = Library.load_library("MasterLib_v1.csv.gz").query( "Library == 'KosukeYusa'") # sgRNAs sets AURC # for m in ky_gsets: LOG.info(f"AURC: {m}") ky_gsets[m]["aurc"] = pd.Series({ s: QCplot.recall_curve(ky_fc[s], index_set=ky_gsets[m]["sgrnas"])[2] for s in ky_fc }) ky_gsets_aurc = pd.concat([ ky_gsets[m]["aurc"].rename("aurc").to_frame().assign(dtype=m) for m in ky_gsets ]) ky_gsets_aurc.to_excel(f"{RPATH}/ky_v11_guides_aurcs.xlsx") # sgRNA sets histograms # plt.figure(figsize=(2.5, 1.5)) for c in ky_gsets: sns.distplot(