Beispiel #1
0
    def import_master_library(self):
        """Load the master sgRNA library and return it filtered and ranked.

        The library is read from ``self.masterlibfile`` and then:

        1. restricted to rows whose ``Assembly`` is ``Human (GRCh38)``;
        2. stripped of every ``WGE_Sequence`` annotated with more than one
           ``Approved_Symbol``;
        3. stripped of rows without ``Approved_Symbol`` or ``Off_Target``;
        4. stripped of sequences containing a ``TTTT`` stretch;
        5. collapsed to one row per ``WGE_Sequence``, preferring libraries in
           the order KosukeYusa, Avana, Brunello, TKOv3;
        6. sorted by ``KS`` (descending), ``JACKS_min`` (ascending) and
           ``RuleSet2`` (descending).

        Returns:
            pandas.DataFrame: one row per ``WGE_Sequence`` with the parsed
            ``Off_Target`` column and the derived ``JACKS_min`` column.
        """
        df = Library.load_library(self.masterlibfile, set_index=False)

        # Keep only sgRNAs annotated against GRCh38
        df = df.query("Assembly == 'Human (GRCh38)'")

        # Remove sgRNAs whose sequence is annotated with more than one gene
        sg_count = df.groupby("WGE_Sequence")["Approved_Symbol"].agg(
            lambda v: len(set(v)))
        sg_count = sg_count[sg_count != 1]
        df = df[~df["WGE_Sequence"].isin(sg_count.index)]

        # Remove sgRNAs lacking a gene symbol or an off-target summary
        df = df.dropna(subset=["Approved_Symbol", "Off_Target"])

        # Remove sgRNAs containing a TTTT stretch (described by the original
        # author as a U6 stop signal). The explicit copy detaches the result
        # from the parent frame so the column assignment below does not
        # trigger pandas' chained-assignment warning.
        df = df[["TTTT" not in g for g in df["WGE_Sequence"]]].copy()

        # Remove duplicates (sgRNAs shared across multiple libraries).
        # The categorical order defines which library wins; library names not
        # listed here become NaN and therefore sort last.
        df["Library"] = pd.Categorical(
            df["Library"], ["KosukeYusa", "Avana", "Brunello", "TKOv3"])
        df = df.sort_values("Library")
        # NOTE(review): GroupBy.first() takes the first non-null value of each
        # column independently, so a missing annotation in the preferred
        # library is filled from the next one - confirm this is intended.
        df = df.groupby("WGE_Sequence").first().reset_index()

        # Parse off target summaries (stored as Python literals in text form)
        df["Off_Target"] = df["Off_Target"].apply(ast.literal_eval)

        # Absolute distance of JACKS scores to 1 (similar to gene sgRNAs mean)
        df["JACKS_min"] = (df["JACKS"] - 1).abs()

        # Rank guides: highest KS first, then JACKS closest to 1, then highest
        # RuleSet2
        df = df.sort_values(["KS", "JACKS_min", "RuleSet2"],
                            ascending=[False, True, False])

        return df
Beispiel #2
0
# DepMap 19Q2 Avana: sgRNA fold-changes and KS scores against the
# non-targeting guides
#

avana = CRISPRDataSet("Avana_DepMap19Q2")
avana_fc = (avana.counts.remove_low_counts(
    avana.plasmids).norm_rpm().foldchange(avana.plasmids))
avana_gsets = define_sgrnas_sets(avana.lib,
                                 avana_fc,
                                 dataset_name="Avana_DepMap19Q2",
                                 add_controls=True)
avana_ks = estimate_ks(avana_fc, avana_gsets["nontargeting"]["fc"])

# CRISPR-Cas9 libraries
#
# NOTE: `avana` is rebound below - from here on it is the Avana library
# annotation, no longer the CRISPRDataSet loaded above.

brunello = Library.load_library("Brunello_v1.csv.gz")
avana = Library.load_library("Avana_v1.csv.gz")
ky = Library.load_library("Yusa_v1.1.csv.gz")
tkov3 = Library.load_library("TKOv3.csv.gz")

# WGE KY guide sequence match
#

ky_v11_wge = pd.read_csv(f"{DPATH}/update/Yusa_v1.1_WGE_map.txt",
                         sep="\t",
                         index_col=0)
# Drop guides without a WGE match. The explicit copy detaches the result from
# the parent frame, so the column assignment below does not raise pandas'
# chained-assignment warning.
ky_v11_wge = ky_v11_wge.dropna(subset=["WGE_sequence"]).copy()
# Trim the last three characters of each sequence (presumably the PAM -
# TODO confirm against the WGE map format)
ky_v11_wge["WGE_sequence"] = ky_v11_wge["WGE_sequence"].apply(
    lambda v: v[:-3])

# WGE annotation
Beispiel #3
0
import numpy as np
import pandas as pd
import pkg_resources
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from crispy.CrispyPlot import CrispyPlot
from minlib.Utils import define_sgrnas_sets
from crispy.CRISPRData import CRISPRDataSet, Library

RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")

# Master library (KosukeYusa v1.1 + Avana + Brunello), restricted to the
# KosukeYusa guides
#

master_lib = Library.load_library("MasterLib_v1.csv.gz", set_index=False)
master_lib = master_lib.query("Library == 'KosukeYusa'")

# Project Score samples acquired with Kosuke_Yusa v1.1 library
#

ky = CRISPRDataSet("Yusa_v1.1")

ky_counts = ky.counts.remove_low_counts(ky.plasmids)

# Normalise once, then compute fold-changes against the plasmid counts.
# NOTE(review): norm_rpm() used to be chained twice here; re-normalising data
# that is already in reads-per-million should be a no-op up to rounding, so
# the duplicate call was dropped - confirm norm_rpm() is idempotent.
ky_fc = ky_counts.norm_rpm().foldchange(ky.plasmids)

ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True)

ky_gmetrics = pd.concat(
    [
    return scores


# Project Score KY v1.1
#

ky = CRISPRDataSet("Yusa_v1.1")
ky_smap = project_score_sample_map()
ky_counts = ky.counts.remove_low_counts(ky.plasmids)

# Master library (KosukeYusa v1.1 + Avana + Brunello)
#
# Keep KosukeYusa guides that have a gene symbol plus KS and JACKS scores, and
# add the absolute distance of JACKS to 1. The column is added with assign()
# inside the chain, so no value is written into a filtered frame afterwards.

master_lib = (
    Library.load_library("MasterLib_v1.csv.gz", set_index=True)
    .query("Library == 'KosukeYusa'")
    .dropna(subset=["Approved_Symbol", "KS", "JACKS"])
    .assign(JACKS_min=lambda lib: (lib["JACKS"] - 1).abs())
)
# Only guides that survived the low-count filter can be used downstream
master_lib = master_lib[master_lib.index.isin(ky_counts.index)]

# Project Score KY v1.1: Fold-changes
#

FDR_THRES = 0.01

ky_sgrna_fc = ky_counts.loc[master_lib.index].norm_rpm().foldchange(
    ky.plasmids)

# Average guides per gene (rows), then samples per model (columns).
# DataFrame.groupby(axis=1) is deprecated in recent pandas releases, so the
# column-wise grouping is done on the transposed frame instead.
ky_fc = (
    ky_sgrna_fc.groupby(master_lib["Approved_Symbol"]).mean()
    .T.groupby(ky_smap["model_id"]).mean().T
)
from minlib.Utils import density_interpolate
from sklearn.metrics import mean_squared_error
from minlib.Utils import project_score_sample_map
from crispy.CRISPRData import CRISPRDataSet, Library
from crispy.LibRepresentationReport import LibraryRepresentaion


LOG = logging.getLogger("Crispy")
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")


# MinLibCas9 library information
#
# The library is re-indexed by the text form of its WGE_ID, and an `sgRNA`
# column is derived from WGE_Sequence: sequences that are already 19
# characters long are kept, all others lose their first character and their
# last three.

mlib = Library.load_library("MinLibCas9.csv.gz", set_index=False)
mlib.index = ["{}".format(wge_id) for wge_id in mlib["WGE_ID"]]
mlib["sgRNA"] = [
    seq[1:-3] if len(seq) != 19 else seq for seq in mlib["WGE_Sequence"]
]


# Assemble raw counts matrix
#

SPATH = pkg_resources.resource_filename("notebooks", "minlib/minlibcas9_screens")

# Plasmid read counts; the `counts` column is relabelled after the library
plasmid_counts = pd.read_csv(
    f"{SPATH}/Minimal_library_output_108.csv", index_col=0
).rename(columns={"counts": "MinLibCas9"})


#
#
Beispiel #6
0
# KM12 coverage dataset and its sample sheet
#

ky = CRISPRDataSet("KM12_coverage")
ky_counts = ky.counts.remove_low_counts(ky.plasmids)
ky_ss = pd.read_excel(
    f"{DPATH}/crispr_manifests/KM12_coverage_samplesheet.xlsx", index_col="sample"
)
# Display labels: "<coverage>x (<experiment>)", plus a per-replicate variant
ky_ss["name"] = [f"{c}x ({e})" for e, c in ky_ss[["experiment", "coverage"]].values]
ky_ss["rep_name"] = [f"{n} (Rep{r})" for n, r in ky_ss[["name", "replicate"]].values]

# KY v1.1 library
#

ky_lib = Library.load_library("MasterLib_v1.csv.gz").query("Library == 'KosukeYusa'")
ky_lib = ky_lib[ky_lib.index.isin(ky_counts.index)]
ky_lib_fc = ky_counts.loc[ky_lib.index].norm_rpm().foldchange(ky.plasmids)
ky_lib_gene = ky_lib_fc.groupby(ky_lib["Approved_Symbol"]).mean()
# Average samples sharing the same display name. DataFrame.groupby(axis=1) is
# deprecated in recent pandas releases, so the column-wise grouping is done on
# the transposed frame instead.
ky_lib_gene_avg = ky_lib_gene.T.groupby(ky_ss["name"]).mean().T


# Minimal library (top 2)
#

NGUIDES, REMOVE_DISCORDANT = 2, True
ml_lib_name = (
    f"MinimalLib_top{NGUIDES}{'_disconcordant' if REMOVE_DISCORDANT else ''}.csv.gz"
)

ml_lib = Library.load_library(ml_lib_name).query("Library == 'KosukeYusa'")
Beispiel #7
0
        plt.savefig(
            f"{RPATH}/arrayed_distributions_{dtype}_{s}.pdf",
            bbox_inches="tight",
            transparent=True,
        )
        plt.close("all")

# Gene-expression
#

gexp = GeneExpression().readcount

# Master library (KosukeYusa v1.1 + Avana + Brunello + TKOv3)
#
# Rows without a WGE sequence are discarded straight away.

master_lib = Library.load_library("MasterLib_v1.csv.gz", set_index=False)
master_lib = master_lib.dropna(subset=["WGE_Sequence"])

# Consider only sgRNAs from KosukeYusa lib and mapping to GRCh38
mlib = master_lib.query("Library == 'KosukeYusa'")
mlib = mlib.query("Assembly == 'Human (GRCh38)'")

# Remove sgRNAs that match to multiple genes: count the distinct gene symbols
# per sequence and discard every sequence whose count is not exactly one
sg_count = mlib.groupby("WGE_Sequence")["Approved_Symbol"].agg(
    lambda symbols: len(set(symbols)))
sg_count = sg_count.loc[sg_count.ne(1)]
mlib = mlib.loc[~mlib["WGE_Sequence"].isin(sg_count.index)]

# Remove sgRNAs lacking a gene symbol or an off-target summary
mlib = mlib.dropna(subset=["Approved_Symbol", "Off_Target"])

# Parse off target summaries
Beispiel #8
0
import logging
import pandas as pd
import pkg_resources
import matplotlib.pyplot as plt
from crispy.QCPlot import QCplot
from crispy.CRISPRData import Library

LOG = logging.getLogger("Crispy")
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")

# Master library
#
# Only guides with a WGE_ID are kept; the identifier is then rendered as text
# with no decimal places.

mlib = Library.load_library("MasterLib_v1.csv.gz", set_index=False)
mlib = mlib.dropna(subset=["WGE_ID"])
mlib["WGE_ID"] = mlib["WGE_ID"].map("{:.0f}".format)

# MinLibCas9 library information
#
# Only guides with a WGE_ID are kept, excluding those whose identifier is a
# string starting with "CTRL".

minlib = Library.load_library("MinLibCas9.csv.gz", set_index=False)
minlib = minlib.dropna(subset=["WGE_ID"])
minlib = minlib[[
    not isinstance(wge_id, str) or not wge_id.startswith("CTRL")
    for wge_id in minlib["WGE_ID"]
]]

#
#

order = ["Avana", "Brunello", "TKOv3", "KosukeYusa", "MinLibCas9"]
Beispiel #9
0
import pkg_resources
import seaborn as sns
import matplotlib.pyplot as plt
from natsort import natsorted
from crispy.QCPlot import QCplot
from crispy.CRISPRData import Library

LOG = logging.getLogger("Crispy")
DPATH = pkg_resources.resource_filename("crispy", "data/")
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")

# Libraries
#
# Master library rows that have a WGE sequence, indexed by (sgRNA_ID, Library)

master_lib = (Library.load_library(
    "MasterLib_v1.csv.gz",
    set_index=False).dropna(subset=["WGE_Sequence"]).set_index(
        ["sgRNA_ID", "Library"]))

# Poly-T motifs to look for
#

polyt = dict(polyt4="TTTT", polyt5="TTTTT")

# Count: one row per (sgRNA_ID, Library) with a 0/1 flag per motif, telling
# whether the motif occurs in the sequence once its last three characters are
# removed. Series.items() is used because Series.iteritems() no longer exists
# in pandas 2.
polyt_df = pd.DataFrame({
    i: {p: int(polyt[p] in s[:-3])
        for p in polyt}
    for i, s in master_lib["WGE_Sequence"].items()
}).T
# Sum the flags per library (second index level, named `level_1` after
# reset_index). The motif columns are selected explicitly so the sgRNA
# identifiers in `level_0` are never aggregated.
polyt_count_lib = polyt_df.reset_index().groupby("level_1")[list(polyt)].sum()
Beispiel #10
0
RPATH = pkg_resources.resource_filename("notebooks", "minlib/reports/")

# Project Score samples acquired with Kosuke_Yusa v1.1 library
#

ky = CRISPRDataSet("Yusa_v1.1")

ky_smap = project_score_sample_map()

ky_counts = ky.counts.remove_low_counts(ky.plasmids)

# Normalise once, then compute fold-changes against the plasmid counts.
# NOTE(review): norm_rpm() used to be chained twice here; re-normalising data
# that is already in reads-per-million should be a no-op up to rounding, so
# the duplicate call was dropped - confirm norm_rpm() is idempotent.
ky_fc = ky_counts.norm_rpm().foldchange(ky.plasmids)

ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True)

# KosukeYusa guides of the master library
master_lib = Library.load_library("MasterLib_v1.csv.gz").query(
    "Library == 'KosukeYusa'")

# sgRNAs sets AURC
#
# For every sgRNA set, store under its "aurc" key a Series holding, per
# fold-change column, the third element returned by QCplot.recall_curve.

for m in ky_gsets:
    LOG.info(f"AURC: {m}")
    ky_gsets[m]["aurc"] = pd.Series(
        {
            s: QCplot.recall_curve(
                ky_fc[s], index_set=ky_gsets[m]["sgrnas"]
            )[2]
            for s in ky_fc
        }
    )

# Stack the per-set values into one long frame, tagged by set name in `dtype`
ky_gsets_aurc = pd.concat(
    [ky_gsets[m]["aurc"].to_frame("aurc").assign(dtype=m) for m in ky_gsets]
)