Ejemplo n.º 1
0
        def triu_plot(x, y, color, label, **kwargs):
            """Scatter ``x`` against ``y`` on the current axes, coloured by ``z``.

            ``z`` is whatever ``QCplot.density_interpolate(x, y)`` returns
            (presumably a per-point density estimate - confirm against QCplot).
            ``color`` and ``label`` are accepted but not used in the visible body
            (presumably required by the caller's callback signature - confirm).
            Remaining keyword arguments are forwarded to ``plt.scatter``.
            """
            z = QCplot.density_interpolate(x, y)
            # Reorder all three arrays by ascending z before plotting
            idx = z.argsort()
            x, y, z = x[idx], y[idx], z[idx]

            plt.scatter(x, y, c=z, **kwargs)

            # Thin dotted reference lines at y=0 and x=0, drawn below the points
            plt.axhline(0, ls=":", lw=0.1, c="#484848", zorder=0)
            plt.axvline(0, ls=":", lw=0.1, c="#484848", zorder=0)

            # Identity line restricted to the interval covered by both axes:
            # from the larger of the two lower limits to the smaller of the two
            # upper limits
            (x0, x1), (y0, y1) = plt.xlim(), plt.ylim()
            lims = [max(x0, y0), min(x1, y1)]
            plt.plot(lims, lims, ls=":", lw=0.1, c="#484848", zorder=0)
Ejemplo n.º 2
0
def downsample_sgrnas(
    counts,
    lib,
    manifest,
    n_guides_thresholds,
    n_iters=10,
    plasmids=None,
    gene_col="Gene",
):
    """Score randomly down-sampled guide libraries with a per-sample AROC.

    For every value in ``n_guides_thresholds``, and ``n_iters`` times per
    value, at most that many guides are drawn at random for each gene.
    Fold-changes are then recomputed from the retained guides, averaged per
    gene and per ``manifest["model_id"]``, and scored column by column with
    ``QCplot.aroc_threshold(..., fpr_thres=.2)``.

    Parameters
    ----------
    counts
        Count matrix indexed by sgRNA; must support ``.loc`` and
        ``.norm_rpm().foldchange(...)``.
    lib : pandas.DataFrame
        Guide library. After ``reset_index()`` it must expose the columns
        ``sgRNA_ID`` and ``gene_col``.
    manifest : pandas.DataFrame
        Sample sheet with a ``model_id`` column used to average columns.
    n_guides_thresholds : iterable of int
        Maximum number of guides kept per gene.
    n_iters : int
        Number of random draws per threshold.
    plasmids : list, optional
        Passed to ``foldchange``; defaults to ``["CRISPR_C6596666.sample"]``.
    gene_col : str
        Column of ``lib`` holding the gene annotation.

    Returns
    -------
    pandas.DataFrame
        Concatenation of one frame per draw with the columns ``sample``,
        ``aroc`` and ``n_guides``.
    """
    if plasmids is None:
        plasmids = ["CRISPR_C6596666.sample"]

    # Annotated guides that are present in the count matrix, grouped by gene
    in_counts = lib.index.isin(counts.index)
    guides = lib[in_counts].reset_index()[["sgRNA_ID", gene_col]].dropna()
    guides_by_gene = guides.groupby(gene_col)

    results = []
    for n_guides in n_guides_thresholds:
        for draw in range(1, n_iters + 1):
            LOG.info(f"Number of sgRNAs: {n_guides}; Iteration: {draw}")

            # Genes with no more than n_guides guides are kept whole; the
            # others are randomly sub-sampled down to n_guides
            picked = []
            for _, gene_guides in guides_by_gene:
                if gene_guides.shape[0] > n_guides:
                    gene_guides = gene_guides.sample(n=n_guides)
                picked.append(gene_guides)
            sgrnas = pd.concat(picked).set_index("sgRNA_ID")

            # Guide-level fold-changes on the retained guides only
            sgrnas_fc = counts.loc[sgrnas.index].norm_rpm().foldchange(plasmids)

            # Average guides per gene, then columns per model
            genes_fc = sgrnas_fc.groupby(sgrnas[gene_col]).mean()
            genes_fc = genes_fc.groupby(manifest["model_id"], axis=1).mean()

            # One AROC value per column of the gene-level matrix
            rows = []
            for s in genes_fc:
                aroc = QCplot.aroc_threshold(genes_fc[s], fpr_thres=.2)[0]
                rows.append({"sample": s, "aroc": aroc})
            results.append(pd.DataFrame(rows).assign(n_guides=n_guides))

    return pd.concat(results)
Ejemplo n.º 3
0
def guides_recall_benchmark(metrics,
                            sgrna_counts,
                            dataset,
                            smap,
                            nguides_thres=None,
                            jacks_thres=1.,
                            fpr_thres=0.01):
    """Benchmark guide-selection metrics by recall/precision against ``ky_bin``.

    For each metric ("KS", "JACKS_min" and "combined") and each value in
    ``nguides_thres``, the top-ranked guides per ``Approved_Symbol`` are
    selected. Gene-level fold-changes are recomputed from that subset and
    binarised per sample, and the binary calls are compared with the
    module-level ``ky_bin`` data frame.

    Guide ranking:

    * ``KS``: descending ``KS``.
    * ``JACKS_min``: ascending ``JACKS_min``.
    * ``combined``: guides with ``JACKS_min < jacks_thres``, ranked by
      descending ``KS``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        One row per sgRNA with the columns ``KS``, ``JACKS_min`` and
        ``Approved_Symbol``.
    sgrna_counts
        Count matrix indexed by sgRNA; must support ``.loc`` and
        ``.norm_rpm().foldchange(...)``.
    dataset
        Object exposing a ``plasmids`` attribute, passed to ``foldchange``.
    smap : pandas.DataFrame
        Sample map with a ``model_id`` column used to average columns.
    nguides_thres : list of int, optional
        Numbers of top guides per gene to evaluate; defaults to
        ``[1, 2, 3, 4, 5, 100]``.
    jacks_thres : float
        Upper bound on ``JACKS_min``; only used by the "combined" metric.
    fpr_thres : float
        Passed to ``QCplot.aroc_threshold`` to obtain the per-sample
        binarisation threshold.

    Returns
    -------
    pandas.DataFrame
        One row per (metric, nguides, sample) with the columns ``sample``,
        ``metric``, ``nguides``, ``ess_aroc``, ``recall`` and ``precision``.

    Notes
    -----
    The reference calls are read from the global ``ky_bin``, which is not a
    parameter; it must be defined before this function is called and be
    indexable by every gene and sample produced here.
    """
    nguides_thres = [1, 2, 3, 4, 5, 100
                     ] if nguides_thres is None else nguides_thres

    # Define set of guides
    LOG.info(f"#(sgRNAs)={metrics.shape[0]}")

    # AROC scores
    scores = []

    for m in ["KS", "JACKS_min", "combined"]:
        LOG.info(f"Metric = {m}")

        for n in nguides_thres:
            # Metric top guides
            if m == "combined":
                metric_topn = (
                    metrics.query(f"JACKS_min < {jacks_thres}").sort_values(
                        "KS",
                        ascending=False).groupby("Approved_Symbol").head(n=n))

            else:
                # JACKS_min is ranked ascending, KS descending
                metric_topn = (metrics.sort_values(
                    m, ascending=(
                        m == "JACKS_min")).groupby("Approved_Symbol").head(
                            n=n))

            # Calculate fold-changes on subset
            metric_fc = sgrna_counts.loc[metric_topn.index]
            metric_fc = metric_fc.norm_rpm().foldchange(dataset.plasmids)
            metric_fc = metric_fc.groupby(metrics["Approved_Symbol"]).mean()
            metric_fc = metric_fc.groupby(smap["model_id"], axis=1).mean()

            # Binarise fold-changes: one threshold per sample (column)
            metric_thres = [
                QCplot.aroc_threshold(metric_fc[s], fpr_thres=fpr_thres)[1]
                for s in metric_fc
            ]
            metric_bin = (metric_fc < metric_thres).astype(int)

            # Lists rather than sets: pandas no longer accepts a set as a
            # .loc indexer, and a list also fixes the row order so the
            # reference and predicted vectors below line up deterministically.
            genes = metric_bin.index.unique().tolist()
            samples = metric_bin.columns.unique().tolist()
            LOG.info(f"Genes:{len(genes)}; Samples:{len(samples)}")

            # Evaluation
            metric_recalls = pd.DataFrame([
                dict(
                    sample=s,
                    metric=m,
                    nguides=n,
                    ess_aroc=QCplot.aroc_threshold(metric_fc.loc[genes, s],
                                                   fpr_thres=.2)[0],
                    recall=recall_score(ky_bin.loc[genes, s],
                                        metric_bin.loc[genes, s]),
                    precision=precision_score(ky_bin.loc[genes, s],
                                              metric_bin.loc[genes, s]),
                ) for s in metric_bin
            ])

            # Store
            scores.append(metric_recalls)

    scores = pd.concat(scores)

    return scores
Ejemplo n.º 4
0
# Absolute distance of each guide's JACKS value from 1, then restrict the
# library to guides that are present in the KY count matrix
master_lib["JACKS_min"] = (master_lib["JACKS"] - 1).abs()
master_lib = master_lib.loc[master_lib.index.isin(ky_counts.index)]

# Project Score KY v1.1: Fold-changes
#

FDR_THRES = 0.01

# Guide-level fold-changes against the plasmid samples
ky_sgrna_fc = ky_counts.loc[master_lib.index]
ky_sgrna_fc = ky_sgrna_fc.norm_rpm().foldchange(ky.plasmids)

# Average guides per gene, then columns per model
ky_fc = ky_sgrna_fc.groupby(master_lib["Approved_Symbol"]).mean()
ky_fc = ky_fc.groupby(ky_smap["model_id"], axis=1).mean()

# One binarisation threshold per column; values below it are called 1
ky_thres = [
    QCplot.aroc_threshold(ky_fc[sample], fpr_thres=FDR_THRES)[1]
    for sample in ky_fc
]
ky_bin = (ky_fc < ky_thres).astype(int)

# Benchmark sgRNA: Essential/Non-essential AROC
#

metrics_recall = guides_recall_benchmark(
    metrics=master_lib,
    sgrna_counts=ky_counts,
    dataset=ky,
    smap=ky_smap,
    jacks_thres=1.,
    fpr_thres=FDR_THRES,
)
metrics_recall.to_excel(
    f"{RPATH}/KosukeYusa_v1.1_benchmark_recall.xlsx",
    index=False,
)
Ejemplo n.º 5
0
# Export a copy of the KY counts annotated with two leading columns.
data_export = ky.counts.copy()
# Both inserts target position 0, so the final column order is
# "Approved_Symbol", "MinLibCas9_guide", then the original count columns.
data_export.insert(0, "MinLibCas9_guide", data_export.index.isin(minlibcas9.index))
data_export.insert(
    0, "Approved_Symbol", kylib.reindex(data_export.index)["Approved_Symbol"]
)
data_export.to_excel(f"{RPATH}/GuideCoverage_export_data.xlsx")


# Essential genes AROC
#
# For every library, score each column of its "fc" matrix with
# QCplot.aroc_threshold at fpr_thres=0.2 and keep the first returned element.
# NOTE(review): the result is stored under the key "aurc" although it comes
# from aroc_threshold - confirm the key name is intentional.

for l in libraries:
    libraries[l]["aurc"] = pd.Series(
        {
            c: QCplot.aroc_threshold(libraries[l]["fc"][c], fpr_thres=0.2)[0]
            for c in libraries[l]["fc"]
        }
    )


# Replicates correlation
#
# Columns of "fc_rep" are renamed through ky_ss["name"] before the Spearman
# correlation is computed by replicates_correlation.

for l in libraries:
    libraries[l]["reps"] = replicates_correlation(
        libraries[l]["fc_rep"].rename(columns=ky_ss["name"]), method="spearman"
    )


# Essential genes AURC
Ejemplo n.º 6
0
]:
    plot_df = pd.concat(
        [
            mlib[["Library", mtype]], minlib[[mtype
                                              ]].assign(Library="MinLibCas9")
        ],
        ignore_index=True,
    ).dropna()

    fig, ax = plt.subplots(1, 1, figsize=(1.5, 2), dpi=600)

    QCplot.bias_boxplot(
        plot_df,
        x="Library",
        y=mtype,
        add_n=False,
        tick_base=None,
        order=order,
        draw_violin=True,
        ax=ax,
    )

    ax.set_xticklabels(order, rotation=45, ha="right")

    ax.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="y")

    ax.set_xlabel("")
    ax.set_ylabel(f"{mtype.split('_')[1]}")

    plt.savefig(
        f"{RPATH}/lib_metrics_library_boxplot_{mtype}.pdf",
        bbox_inches="tight",
Ejemplo n.º 7
0
        # sgRNA fold-changes: counts restricted to the guides of `lib`,
        # normalised and compared against the plasmid samples
        fc = ky_counts.copy().loc[lib.index].norm_rpm().foldchange(ky.plasmids)

        # Gene fold-changes: mean over the guides sharing an Approved_Symbol
        fc_gene = fc.groupby(lib["Approved_Symbol"]).mean()

        # Gene average fold-changes: mean over the columns sharing a model_id
        fc_gene_avg = fc_gene.groupby(ky_smap["model_id"], axis=1).mean()

        # Gene average scaled fold-changes
        fc_gene_avg_scl = ReadCounts(fc_gene_avg).scale()

        # Per-sample threshold table: aroc_threshold's two returned values are
        # laid out as the rows "auc"/"thres" and transposed so that each
        # sample becomes a row.
        # NOTE(review): the argument is called fpr_thres but is fed fdr_thres -
        # confirm which rate is meant.
        fpr = pd.DataFrame(
            {
                s: QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=fdr_thres)
                for s in fc_gene_avg
            },
            index=["auc", "thres"],
        ).T
        # Overwrite "auc" with the value obtained at fpr_thres=0.2; the
        # "thres" column keeps the value computed with fdr_thres above
        fpr["auc"] = [
            QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=0.2)[0]
            for s in fpr.index
        ]

        # Gene binarised: 1 where the fold-change is below the sample threshold
        fc_gene_avg_bin = (fc_gene_avg[fpr.index] < fpr["thres"]).astype(int)

        # Store
        libraries[ltype]["fc"] = fc
        libraries[ltype]["fc_gene"] = fc_gene
Ejemplo n.º 8
0
)

for i, mtype in enumerate(row_order):
    plot_df = pd.concat(
        [polyt_pos_df, master_lib.loc[polyt_pos_df.index, mtype]], axis=1)

    for j, p in enumerate(polyt):
        ax = axs[i][j]

        order = natsorted(set(plot_df[p].dropna().astype(int)))

        QCplot.bias_boxplot(
            plot_df[plot_df["polyt5"].isnull()]
            if p == "polyt4" else plot_df[~plot_df["polyt5"].isnull()],
            x="polyt4",
            y=mtype,
            add_n=True,
            tick_base=None,
            order=order,
            ax=ax,
        )

        ax.set_xticklabels(ax.get_xticklabels() if i == (len(row_order) -
                                                         1) else [],
                           rotation=0,
                           horizontalalignment="right")

        ax.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="y")

        if mtype == "JACKS":
            ax.axhline(1,
                       ls="-",
plt.close("all")

# Recall gene lists
#

gsets_aucs = {}
for n, gset in [
    ("essential", Utils.get_essential_genes()),
    ("non-essential", Utils.get_non_essential_genes()),
]:
    # Aroc
    plt.figure(figsize=(2, 2), dpi=600)
    ax = plt.gca()
    _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples],
                                          gset,
                                          palette=sample_pal,
                                          legend_prop={"size": 4},
                                          ax=ax)
    plt.title(f"{n} recall curve")
    plt.xlabel("Percent-rank of genes")
    plt.ylabel("Cumulative fraction")
    plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both")
    plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf",
                bbox_inches="tight")
    plt.close("all")

    # Barplot
    df = pd.Series(stats_ess["auc"])[samples].rename("auc").reset_index()

    plt.figure(figsize=(3, 0.15 * len(samples)), dpi=600)
    sns.barplot(
    # sgRNA fold-changes: counts restricted to the guides of `lib`,
    # normalised and compared against the plasmid samples
    fc = ky_counts.copy().loc[lib.index].norm_rpm().foldchange(ky.plasmids)

    # Gene fold-changes: mean over the guides sharing an Approved_Symbol
    fc_gene = fc.groupby(lib["Approved_Symbol"]).mean()

    # Gene average fold-changes: mean over the columns sharing a model_id
    fc_gene_avg = fc_gene.groupby(ky_smap["model_id"], axis=1).mean()

    # Gene average scaled fold-changes
    fc_gene_avg_scl = ReadCounts(fc_gene_avg).scale()

    # Per-sample threshold table: aroc_threshold's two returned values are
    # laid out as the rows "auc"/"thres" and transposed so that each sample
    # becomes a row.
    # NOTE(review): the argument is called fpr_thres but is fed FDR_THRES -
    # confirm which rate is meant.
    fpr = pd.DataFrame(
        {
            s: QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=FDR_THRES)
            for s in fc_gene_avg
        },
        index=["auc", "thres"],
    ).T
    # Overwrite "auc" with the value obtained at fpr_thres=0.2; the "thres"
    # column keeps the value computed with FDR_THRES above
    fpr["auc"] = [
        QCplot.aroc_threshold(fc_gene_avg[s], fpr_thres=0.2)[0] for s in fpr.index
    ]

    # Gene binarised: 1 where the fold-change is below the sample threshold
    fc_gene_avg_bin = (fc_gene_avg[fpr.index] < fpr["thres"]).astype(int)

    # Store
    libraries[ltype]["fc"] = fc
    libraries[ltype]["fc_gene"] = fc_gene
    libraries[ltype]["fc_gene_avg"] = fc_gene_avg
Ejemplo n.º 11
0
    libraries["Minimal"]["fc_gene_avg"].index)
LOG.info(f"Genes={len(genes)}")

# Essential/non-essential AROC and AURC
#
# One record per (library, sample): the first element returned by
# QCplot.aroc_threshold at fpr_thres=0.2 for that sample's gene-level
# fold-changes.

metrics_arocs = []
for ltype in libraries:
    for s in libraries[ltype]["fc_gene_avg"]:
        LOG.info(f"Library={ltype}; Organoid={s}")

        metrics_arocs.append({
            "library": ltype,
            "sample": s,
            "aroc": QCplot.aroc_threshold(
                libraries[ltype]["fc_gene_avg"][s],
                fpr_thres=0.2,
            )[0],
        })
metrics_arocs = pd.DataFrame(metrics_arocs)

# Plot essential genes recall
#

pal = {"All": "#e34a33", "Minimal": "#fee8c8"}

# Figure width scales with the number of columns of the "All" library
n = libraries["All"]["fc_gene_avg"].shape[1]

fig, ax = plt.subplots(
    nrows=1,
    ncols=1,
    figsize=(0.6 * n, 2.0),
    dpi=600,
)

sns.barplot("sample",
            "aroc",
            "library",
Ejemplo n.º 12
0
# Filter the KY counts with the project's remove_low_counts helper, using the
# plasmid samples as reference
ky_counts = ky.counts.remove_low_counts(ky.plasmids)

# Guide-level fold-changes against the plasmid samples.
# NOTE(review): norm_rpm() is chained twice - this looks redundant; confirm
# whether the second call is intentional before removing it.
ky_fc = ky_counts.norm_rpm().norm_rpm().foldchange(ky.plasmids)

# Named guide sets built from the library annotation and the fold-changes
ky_gsets = define_sgrnas_sets(ky.lib, ky_fc, add_controls=True)

# Master library restricted to the rows whose Library column is 'KosukeYusa'
master_lib = Library.load_library("MasterLib_v1.csv.gz").query(
    "Library == 'KosukeYusa'")

# sgRNAs sets AURC
#
# For each guide set, keep the third element returned by QCplot.recall_curve
# for every column of ky_fc (presumably the area under the recall curve -
# confirm against QCplot).

for m in ky_gsets:
    LOG.info(f"AURC: {m}")
    ky_gsets[m]["aurc"] = pd.Series({
        s: QCplot.recall_curve(ky_fc[s], index_set=ky_gsets[m]["sgrnas"])[2]
        for s in ky_fc
    })

# Stack the per-set series into one long table, tagging each row with its
# guide-set name in the "dtype" column, and export it
ky_gsets_aurc = pd.concat([
    ky_gsets[m]["aurc"].rename("aurc").to_frame().assign(dtype=m)
    for m in ky_gsets
])
ky_gsets_aurc.to_excel(f"{RPATH}/ky_v11_guides_aurcs.xlsx")

# sgRNA sets histograms
#

plt.figure(figsize=(2.5, 1.5))
for c in ky_gsets:
    sns.distplot(