def import_master_library(self):
    """Load the master sgRNA library and return it filtered and ranked.

    The table is read from ``self.masterlibfile`` with
    ``Library.load_library(..., set_index=False)`` (presumably a pandas
    DataFrame; confirm against ``Library``). The following steps are then
    applied in order:

    1. keep rows whose ``Assembly`` is ``'Human (GRCh38)'``;
    2. drop every ``WGE_Sequence`` associated with anything other than
       exactly one distinct ``Approved_Symbol``;
    3. drop rows missing ``Approved_Symbol`` or ``Off_Target``;
    4. drop sequences containing ``TTTT``;
    5. collapse sequences shared by several libraries, preferring
       KosukeYusa, then Avana, Brunello and TKOv3;
    6. parse ``Off_Target`` with ``ast.literal_eval`` and add
       ``JACKS_min = |JACKS - 1|``.

    Returns:
        The filtered table sorted by ``KS`` (descending), ``JACKS_min``
        (ascending) and ``RuleSet2`` (descending).
    """
    df = Library.load_library(self.masterlibfile, set_index=False)

    # Drop sgRNAs that do not align to GRCh38
    df = df.query("Assembly == 'Human (GRCh38)'")

    # Remove sgRNAs that match to multiple genes: count distinct symbols per
    # sequence and discard every sequence whose count is not exactly one.
    sg_count = df.groupby("WGE_Sequence")["Approved_Symbol"].agg(
        lambda v: len(set(v))
    )
    sg_count = sg_count[sg_count != 1]
    df = df[~df["WGE_Sequence"].isin(sg_count.index)]

    # Remove sgRNAs lacking a gene symbol or an off-target summary
    df = df.dropna(subset=["Approved_Symbol", "Off_Target"])

    # Remove sgRNAs containing a TTTT run (the original comment describes
    # this as a U6 stop signal). The explicit copy detaches the result from
    # the filtered parent so the column assignments below do not trigger
    # pandas' SettingWithCopyWarning.
    df = df[["TTTT" not in g for g in df["WGE_Sequence"]]].copy()

    # Remove duplicates (sgRNAs shared across multiple libraries): order the
    # rows by library priority, then keep one entry per sequence.
    # NOTE(review): GroupBy.first() takes the first *non-null* value of each
    # column, so a collapsed row can combine values from several libraries
    # when the preferred one has gaps - confirm this is intended.
    df["Library"] = pd.Categorical(
        df["Library"], ["KosukeYusa", "Avana", "Brunello", "TKOv3"]
    )
    df = df.sort_values("Library")
    df = df.groupby("WGE_Sequence").first().reset_index()

    # Parse off target summaries (stored as Python-literal strings)
    df["Off_Target"] = df["Off_Target"].apply(ast.literal_eval).values

    # Absolute distance of JACKS scores to 1 (similar to gene sgRNAs mean)
    df["JACKS_min"] = abs(df["JACKS"] - 1).values

    # Sort guides according to KS scores, then JACKS distance, then RuleSet2
    df = df.sort_values(
        ["KS", "JACKS_min", "RuleSet2"], ascending=[False, True, False]
    )

    return df
# DepMap 19Q2 Avana screen: sgRNA fold-changes, guide sets and KS scores
#

avana = CRISPRDataSet("Avana_DepMap19Q2")

avana_fc = (
    avana.counts.remove_low_counts(avana.plasmids)
    .norm_rpm()
    .foldchange(avana.plasmids)
)

avana_gsets = define_sgrnas_sets(
    avana.lib,
    avana_fc,
    dataset_name="Avana_DepMap19Q2",
    add_controls=True,
)

# KS scores are estimated against the non-targeting guides' fold-changes
avana_ks = estimate_ks(avana_fc, avana_gsets["nontargeting"]["fc"])


# CRISPR-Cas9 library tables
# (`avana` and `ky` are rebound here to library tables; the Avana data set
# object loaded above is no longer reachable under that name.)
#

brunello = Library.load_library("Brunello_v1.csv.gz")
avana = Library.load_library("Avana_v1.csv.gz")
ky = Library.load_library("Yusa_v1.1.csv.gz")
tkov3 = Library.load_library("TKOv3.csv.gz")


# Mapping of KY v1.1 guides to WGE sequences
#

ky_v11_wge = pd.read_csv(
    f"{DPATH}/update/Yusa_v1.1_WGE_map.txt", sep="\t", index_col=0
)

# Keep only the guides that have a WGE sequence
ky_v11_wge = ky_v11_wge.dropna(subset=["WGE_sequence"])

# Trim the last three characters of every WGE sequence
# (presumably the PAM - confirm against the mapping file)
ky_v11_wge["WGE_sequence"] = (
    ky_v11_wge["WGE_sequence"].apply(lambda seq: seq[:-3]).values
)

# WGE annotation
import numpy as np
import pandas as pd
import pkg_resources
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from crispy.CrispyPlot import CrispyPlot
from minlib.Utils import define_sgrnas_sets
from crispy.CRISPRData import CRISPRDataSet, Library


# Directory where the report files are written
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# Master library (KosukeYusa v1.1 + Avana + Brunello), restricted to the
# KosukeYusa entries
#

master_lib = Library.load_library("MasterLib_v1.csv.gz", set_index=False)
master_lib = master_lib.query("Library == 'KosukeYusa'")


# Project Score samples acquired with Kosuke_Yusa v1.1 library
#

ky = CRISPRDataSet("Yusa_v1.1")
ky_counts = ky.counts.remove_low_counts(ky.plasmids)
# NOTE(review): norm_rpm() is applied twice in a row; other snippets call it
# once before foldchange() - confirm the second call is intentional.
ky_fc = ky_counts.norm_rpm().norm_rpm().foldchange(ky.plasmids)
ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True)

# The statement below continues past the end of this fragment.
ky_gmetrics = pd.concat(
    [
return scores # Project Score KY v1.1 # ky = CRISPRDataSet("Yusa_v1.1") ky_smap = project_score_sample_map() ky_counts = ky.counts.remove_low_counts(ky.plasmids) # Master library (KosukeYusa v1.1 + Avana + Brunello) # master_lib = (Library.load_library( "MasterLib_v1.csv.gz", set_index=True).query("Library == 'KosukeYusa'").dropna( subset=["Approved_Symbol", "KS", "JACKS"])) master_lib["JACKS_min"] = abs(master_lib["JACKS"] - 1) master_lib = master_lib[master_lib.index.isin(ky_counts.index)] # Project Score KY v1.1: Fold-changes # FDR_THRES = 0.01 ky_sgrna_fc = ky_counts.loc[master_lib.index].norm_rpm().foldchange( ky.plasmids) ky_fc = (ky_sgrna_fc.groupby(master_lib["Approved_Symbol"]).mean().groupby( ky_smap["model_id"], axis=1).mean())
from minlib.Utils import density_interpolate
from sklearn.metrics import mean_squared_error
from minlib.Utils import project_score_sample_map
from crispy.CRISPRData import CRISPRDataSet, Library
from crispy.LibRepresentationReport import LibraryRepresentaion


LOG = logging.getLogger("Crispy")

# Packaged data directory and report output directory
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# MinLibCas9 library information
#

mlib = Library.load_library("MinLibCas9.csv.gz", set_index=False)

# Index the table by the string form of the WGE identifier
mlib.index = [f"{wge_id}" for wge_id in mlib["WGE_ID"]]

# 19-character sequences are used as they are; any other sequence loses its
# first character and its last three characters.
mlib["sgRNA"] = [
    seq[1:-3] if len(seq) != 19 else seq for seq in mlib["WGE_Sequence"]
]


# Assemble raw counts matrix
#

SPATH = pkg_resources.resource_filename(
    "notebooks", "minlib/minlibcas9_screens"
)

# Plasmid counts, with the "counts" column renamed to "MinLibCas9"
plasmid_counts = pd.read_csv(
    f"{SPATH}/Minimal_library_output_108.csv", index_col=0
).rename(columns={"counts": "MinLibCas9"})

#
#
# KM12 coverage data set (loaded as CRISPRDataSet("KM12_coverage")) and its
# sample sheet
#

ky = CRISPRDataSet("KM12_coverage")
ky_counts = ky.counts.remove_low_counts(ky.plasmids)

ky_ss = pd.read_excel(
    f"{DPATH}/crispr_manifests/KM12_coverage_samplesheet.xlsx",
    index_col="sample",
)
# Display name per condition, e.g. "<coverage>x (<experiment>)"
ky_ss["name"] = [
    f"{c}x ({e})" for e, c in ky_ss[["experiment", "coverage"]].values
]
# Display name per replicate, e.g. "<name> (Rep<replicate>)"
ky_ss["rep_name"] = [
    f"{n} (Rep{r})" for n, r in ky_ss[["name", "replicate"]].values
]


# KY v1.1 library: guide fold-changes, gene averages, condition averages
#

ky_lib = Library.load_library("MasterLib_v1.csv.gz").query(
    "Library == 'KosukeYusa'"
)
# Keep only guides still present in the count matrix after low-count removal
ky_lib = ky_lib[ky_lib.index.isin(ky_counts.index)]
ky_lib_fc = ky_counts.loc[ky_lib.index].norm_rpm().foldchange(ky.plasmids)
ky_lib_gene = ky_lib_fc.groupby(ky_lib["Approved_Symbol"]).mean()
# Average the sample columns that share a condition name. The grouping is
# done on the transpose because DataFrame.groupby(axis=1) is deprecated
# since pandas 2.1.
ky_lib_gene_avg = ky_lib_gene.T.groupby(ky_ss["name"]).mean().T


# Minimal library (top 2)
#

NGUIDES, REMOVE_DISCORDANT = 2, True
ml_lib_name = (
    f"MinimalLib_top{NGUIDES}{'_disconcordant' if REMOVE_DISCORDANT else ''}.csv.gz"
)
ml_lib = Library.load_library(ml_lib_name).query("Library == 'KosukeYusa'")
# NOTE(review): the two plotting statements below use `dtype` and `s`, which
# are defined before this fragment - presumably they close a plotting loop.
plt.savefig(
    f"{RPATH}/arrayed_distributions_{dtype}_{s}.pdf",
    bbox_inches="tight",
    transparent=True,
)
plt.close("all")


# Gene-expression
#

gexp = GeneExpression().readcount


# Master library (KosukeYusa v1.1 + Avana + Brunello + TKOv3), without the
# rows that lack a WGE sequence
#

master_lib = Library.load_library(
    "MasterLib_v1.csv.gz", set_index=False).dropna(subset=["WGE_Sequence"])

# Consider only sgRNAs from KosukeYusa lib and mapping to GRCh38
mlib = master_lib.query("Library == 'KosukeYusa'").query(
    "Assembly == 'Human (GRCh38)'")

# Remove sgRNAs that match to multiple genes: count the distinct symbols per
# sequence and drop every sequence whose count is not exactly one
sg_count = mlib.groupby("WGE_Sequence")["Approved_Symbol"].agg(
    lambda v: len(set(v)))
sg_count = sg_count[sg_count != 1]
mlib = mlib[~mlib["WGE_Sequence"].isin(sg_count.index)]

# Remove sgRNAs lacking a gene symbol or an off-target summary
mlib = mlib.dropna(subset=["Approved_Symbol", "Off_Target"])

# Parse off target summaries
import logging
import pandas as pd
import pkg_resources
import matplotlib.pyplot as plt
from crispy.QCPlot import QCplot
from crispy.CRISPRData import Library


LOG = logging.getLogger("Crispy")

# Packaged data directory and report output directory
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# Master library: rows with a WGE identifier, identifier rendered as a
# string with no decimals
#

mlib = Library.load_library("MasterLib_v1.csv.gz", set_index=False)
mlib = mlib.dropna(subset=["WGE_ID"])
mlib["WGE_ID"] = mlib["WGE_ID"].apply(lambda wge_id: f"{wge_id:.0f}")


# MinLibCas9 library information: rows with a WGE identifier, excluding the
# entries whose identifier is a string starting with "CTRL"
#

minlib = Library.load_library("MinLibCas9.csv.gz", set_index=False)
minlib = minlib.dropna(subset=["WGE_ID"])
minlib = minlib[
    [
        not isinstance(wge_id, str) or not wge_id.startswith("CTRL")
        for wge_id in minlib["WGE_ID"]
    ]
]


#
#

order = ["Avana", "Brunello", "TKOv3", "KosukeYusa", "MinLibCas9"]
import pkg_resources
import seaborn as sns
import matplotlib.pyplot as plt
from natsort import natsorted
from crispy.QCPlot import QCplot
from crispy.CRISPRData import Library


LOG = logging.getLogger("Crispy")

# Packaged data directory and report output directory
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# Libraries: master library rows that have a WGE sequence, indexed by
# (sgRNA_ID, Library)
#

master_lib = (
    Library.load_library("MasterLib_v1.csv.gz", set_index=False)
    .dropna(subset=["WGE_Sequence"])
    .set_index(["sgRNA_ID", "Library"])
)


# Poly-T motifs to look for
#

polyt = dict(polyt4="TTTT", polyt5="TTTTT")

# Count: one row per guide and one 0/1 column per motif, testing the
# sequence without its last three characters. Series.items() replaces
# Series.iteritems(), which was removed in pandas 2.0.
polyt_df = pd.DataFrame({
    i: {p: int(polyt[p] in s[:-3]) for p in polyt}
    for i, s in master_lib["WGE_Sequence"].items()
}).T

# Sum the flags per library. "level_1" is the name reset_index() gives the
# second, unnamed index level.
polyt_count_lib = polyt_df.reset_index().groupby("level_1").sum()
# Report output directory
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# Project Score samples acquired with Kosuke_Yusa v1.1 library
#

ky = CRISPRDataSet("Yusa_v1.1")
ky_smap = project_score_sample_map()
ky_counts = ky.counts.remove_low_counts(ky.plasmids)
# NOTE(review): norm_rpm() is applied twice in a row - confirm the second
# call is intentional.
ky_fc = ky_counts.norm_rpm().norm_rpm().foldchange(ky.plasmids)
ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True)

master_lib = Library.load_library("MasterLib_v1.csv.gz").query(
    "Library == 'KosukeYusa'"
)


# sgRNAs sets AURC: for every guide set, store one value per sample - the
# third element returned by QCplot.recall_curve
#

for m in ky_gsets:
    LOG.info(f"AURC: {m}")

    ky_gsets[m]["aurc"] = pd.Series(
        {
            s: QCplot.recall_curve(
                ky_fc[s], index_set=ky_gsets[m]["sgrnas"]
            )[2]
            for s in ky_fc
        }
    )

# Long-format table: one "aurc" column plus the guide-set name in "dtype"
ky_gsets_aurc = pd.concat(
    [ky_gsets[m]["aurc"].to_frame("aurc").assign(dtype=m) for m in ky_gsets]
)