Ejemplo n.º 1
0
    def aroc_threshold(values,
                       true_set=None,
                       false_set=None,
                       fpr_thres=0.01,
                       return_curve=False):
        """
        Partial ROC AUC and score cut-off at a given false-positive rate.

        Only the entries of ``values`` whose index label is in ``true_set``
        or ``false_set`` are used. Labels in ``true_set`` form the positive
        class; the scores handed to the ROC helpers are the negated values.

        :param values: pandas.Series
            Scores indexed by the labels to classify.

        :param true_set: set or None, optional
            Positive labels; defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative labels; defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param fpr_thres: float or None, optional
            Forwarded as ``max_fpr`` to ``roc_auc_score``. When not None it
            is also used to pick the score cut-off.

        :param return_curve: bool, optional
            Also return the ``fpr`` and ``tpr`` arrays.

        :return:
            ``(auc, threshold)`` or ``(auc, threshold, fpr, tpr)``;
            ``threshold`` is None when ``fpr_thres`` is None.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        # Restrict to labelled entries and build the binary ground truth
        labelled = values[values.index.isin(positives.union(negatives))]
        is_positive = labelled.index.isin(positives).astype(int)
        scores = -labelled

        # NOTE(review): roc_curve / roc_auc_score are presumably the
        # scikit-learn functions - confirm against the file imports
        fpr, tpr, cutoffs = roc_curve(is_positive, scores)
        partial_auc = roc_auc_score(is_positive, scores, max_fpr=fpr_thres)

        fc_cutoff = None
        if fpr_thres is not None:
            # Smallest cut-off still within the FPR budget, with the sign
            # flipped back to the scale of ``values``
            fc_cutoff = -min(cutoffs[fpr <= fpr_thres])

        if return_curve:
            return partial_auc, fc_cutoff, fpr, tpr

        return partial_auc, fc_cutoff
Ejemplo n.º 2
0
    def precision_recall_curve(values,
                               true_set=None,
                               false_set=None,
                               fdr_thres=0.01,
                               return_curve=False):
        """
        Average precision and best recall at a given false-discovery rate.

        Only the entries of ``values`` whose index label is in ``true_set``
        or ``false_set`` are used. Labels in ``true_set`` form the positive
        class; the scores handed to the helpers are the negated values.

        :param values: pandas.Series
            Scores indexed by the labels to classify.

        :param true_set: set or None, optional
            Positive labels; defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative labels; defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param fdr_thres: float, optional
            Recall is reported over the curve points whose precision is
            strictly greater than ``1 - fdr_thres``.

        :param return_curve: bool, optional
            Also return the precision, recall and threshold arrays.

        :return:
            ``(ap, recall_fdr)`` or
            ``(ap, recall_fdr, precision, recall, thres)``.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        # Restrict to labelled entries and build the binary ground truth
        labelled = values[values.index.isin(positives.union(negatives))]
        is_positive = labelled.index.isin(positives).astype(int)
        scores = -labelled

        ap = average_precision_score(is_positive, scores)

        # NOTE(review): if this def lives in a class body, the name below
        # resolves to the module-level precision_recall_curve (presumably
        # scikit-learn's), not to this method - confirm against the imports
        precision, recall, thres = precision_recall_curve(is_positive, scores)

        min_precision = 1 - fdr_thres
        recall_fdr = recall[precision > min_precision].max()

        if return_curve:
            return ap, recall_fdr, precision, recall, thres

        return ap, recall_fdr
Ejemplo n.º 3
0
    def recall_curve(rank, index_set=None, min_events=None):
        """
        Build the cumulative recall curve of a label set along ranked values.

        :param rank: pandas.Series
            Values to rank; NaNs are dropped and the rest sorted ascending.

        :param index_set: collection or None, optional
            Labels of ``rank`` counted as hits; defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param min_events: int or None, optional
            Minimum number of hits required; with fewer hits None is
            returned instead of a curve.

        :return:
            None, or ``(x, y, xy_auc)`` where ``x`` is the rank of each value
            divided by the number of values, ``y`` the cumulative fraction
            of hits and ``xy_auc`` the result of ``auc(x, y)``.
        """
        ordered = rank.sort_values().dropna()

        if index_set is None:
            index_set = Utils.get_essential_genes(return_series=False)

        # Boolean mask of hits, in ascending order of value
        hits = ordered.index.isin(index_set)
        n_hits = sum(hits)

        if min_events is not None and n_hits < min_events:
            return None

        # Observed cumulative fraction of hits
        y = np.cumsum(hits) / n_hits

        # Fractional rank of every value
        x = st.rankdata(ordered) / ordered.shape[0]

        return x, y, auc(x, y)
Ejemplo n.º 4
0
def define_sgrnas_sets(clib, fc=None, add_controls=True, dataset_name="Yusa_v1"):
    """
    Group the sgRNAs of a library into essential, non-essential and
    (optionally) non-targeting control sets.

    :param clib: pandas.DataFrame
        Library annotation indexed by sgRNA id, with a ``Gene`` column.

    :param fc: pandas.DataFrame or None, optional
        Values indexed by sgRNA id. When given, each set also carries the
        row-wise median of its sgRNAs (rows without a median are dropped);
        when None the ``fc`` entry of every set is None.

    :param add_controls: bool, optional
        Add the ``nontargeting`` set.

    :param dataset_name: str, optional
        Selects how control sgRNAs are recognised: for the listed Yusa /
        Sabatini names by a ``CTRL0`` prefix on the sgRNA id, otherwise by a
        ``NO_CURRENT_`` prefix on the ``Gene`` column.

    :return: dict
        ``{set_name: dict(color=..., sgrnas=set, fc=Series or None)}`` with
        keys ``essential``, ``nonessential`` and, if requested,
        ``nontargeting``.
    """

    def median_fc(sgrnas):
        # fc is optional for every set, including the controls
        if fc is None:
            return None
        return fc.reindex(sgrnas).median(axis=1).dropna()

    sgrna_sets = dict()

    # sgRNA essential
    sgrnas_essential = Utils.get_essential_genes(return_series=False)
    sgrnas_essential = set(clib[clib["Gene"].isin(sgrnas_essential)].index)

    sgrna_sets["essential"] = dict(
        color="#e6550d", sgrnas=sgrnas_essential, fc=median_fc(sgrnas_essential)
    )

    # sgRNA non-essential
    sgrnas_nonessential = Utils.get_non_essential_genes(return_series=False)
    sgrnas_nonessential = set(clib[clib["Gene"].isin(sgrnas_nonessential)].index)

    sgrna_sets["nonessential"] = dict(
        color="#3182bd",
        sgrnas=sgrnas_nonessential,
        fc=median_fc(sgrnas_nonessential),
    )

    # sgRNA non-targeting
    if add_controls:
        if dataset_name in ["Yusa_v1", "Yusa v1", "Yusa_v1.1", "Yusa v1.1", "Sabatini_Lander_AML"]:
            sgrnas_control = {i for i in clib.index if i.startswith("CTRL0")}
        else:
            # na=False keeps rows with a missing gene name from raising
            is_control = clib["Gene"].str.startswith("NO_CURRENT_", na=False)
            sgrnas_control = set(clib[is_control].index)

        sgrna_sets["nontargeting"] = dict(
            color="#31a354",
            sgrnas=sgrnas_control,
            fc=median_fc(sgrnas_control),
        )

    return sgrna_sets
Ejemplo n.º 5
0
    def filter(
        self,
        dtype="merged",
        subset=None,
        scale=True,
        std_filter=False,
        abs_thres=None,
        drop_core_essential=False,
        min_events=5,
        drop_core_essential_broad=False,
        binarise_thres=None,
    ):
        """
        Return the data matrix restricted to the rows and columns that pass
        the requested filters.

        The row/column selection is always decided on
        ``self.get_data(scale=True, dtype=dtype)``; ``scale`` only controls
        which matrix the returned values are taken from.

        :param dtype: forwarded to ``self.get_data``.

        :param subset: collection or None, optional
            Columns to keep.

        :param scale: bool, optional
            Forwarded to ``self.get_data`` for the returned matrix.

        :param std_filter: bool, optional
            Drop rows of the returned matrix whose row-wise standard
            deviation is not greater than zero.

        :param abs_thres: float or None, optional
            Keep rows with at least ``min_events`` columns whose absolute
            scaled value exceeds this threshold.

        :param drop_core_essential: bool, optional
            Drop rows listed by ``Utils.get_adam_core_essential()``.

        :param min_events: int, optional
            Minimum number of columns for the ``abs_thres`` filter.

        :param drop_core_essential_broad: bool, optional
            Drop rows listed by ``Utils.get_broad_core_essential()``.

        :param binarise_thres: float or None, optional
            When given, the returned matrix is 1 where the value is below
            the threshold and 0 elsewhere.

        :return: pandas.DataFrame
        """
        df = self.get_data(scale=True, dtype=dtype)

        # - Filters
        # Subset matrices
        if subset is not None:
            df = df.loc[:, df.columns.isin(subset)]

        # Filter by scaled scores
        if abs_thres is not None:
            df = df[(df.abs() > abs_thres).sum(axis=1) >= min_events]

        # Filter out core essential genes
        if drop_core_essential:
            df = df[~df.index.isin(Utils.get_adam_core_essential())]

        if drop_core_essential_broad:
            df = df[~df.index.isin(Utils.get_broad_core_essential())]

        # - Subset matrices
        x = self.get_data(scale=scale, dtype=dtype).reindex(
            index=df.index, columns=df.columns
        )

        if binarise_thres is not None:
            x = (x < binarise_thres).astype(int)

        if std_filter:
            # Boolean row mask; reindex() would treat the True/False values
            # as row labels and return a frame of missing rows
            x = x.loc[x.std(axis=1) > 0]

        return x
Ejemplo n.º 6
0
def project_score_data(sgrnas, subset=None):
    """
    Assemble scaled, per-model fold-changes of the given sgRNAs from the
    ``Yusa_v1`` and ``Yusa_v1.1`` data-sets.

    :param sgrnas: labels used to select rows (via ``.loc``) from both
        scaled fold-change matrices.

    :param subset: collection or None, optional
        ``model_id`` values to keep from the manifest.

    :return: pandas.DataFrame
        Column-wise concatenation of both data-sets for ``sgrnas``, with any
        row containing a missing value dropped.
    """
    ddir = pkg_resources.resource_filename("crispy", "data/")

    manifest = pd.read_csv(
        f"{ddir}/crispr_manifests/project_score_manifest.csv.gz")

    def explode_row(row):
        # One record per comma-separated library / experiment identifier
        return pd.DataFrame(
            dict(
                model_id=row["model_id"],
                s_ids=row["library"].split(", "),
                s_lib=row["experiment_identifier"].split(", "),
            ))

    s_map = pd.concat(
        [explode_row(manifest.iloc[i]) for i in manifest.index]
    ).set_index("s_lib")

    if subset is not None:
        s_map = s_map[s_map["model_id"].isin(subset)]

    def model_mean_fc(dataset):
        # Fold-changes against the plasmids, averaged over the columns that
        # map to the same model_id
        fc = dataset.counts.norm_rpm().foldchange(dataset.plasmids)
        return fc.groupby(s_map["model_id"], axis=1).mean()

    score_v1 = CRISPRDataSet("Yusa_v1")
    v1_fc = model_mean_fc(score_v1)

    score_v11 = CRISPRDataSet("Yusa_v1.1")
    v11_fc = model_mean_fc(score_v11)

    # sgRNA sets used for scaling are taken from the v1 library annotation
    ess = set(score_v1.lib[score_v1.lib["Gene"].isin(
        Utils.get_essential_genes())].index)
    ness = set(score_v1.lib[score_v1.lib["Gene"].isin(
        Utils.get_non_essential_genes())].index)

    v1_scaled = ReadCounts(v1_fc).scale(essential=ess, non_essential=ness)
    v11_scaled = ReadCounts(v11_fc).scale(essential=ess, non_essential=ness)

    selected = [v1_scaled.loc[sgrnas], v11_scaled.loc[sgrnas]]

    return pd.concat(selected, axis=1).dropna()
Ejemplo n.º 7
0
    def pr_curve(rank, true_set=None, false_set=None, min_events=10):
        """
        Score how well ``rank`` separates ``true_set`` from ``false_set``.

        NOTE(review): despite the name, the value returned is
        ``roc_auc_score`` of the negated values, not a precision-recall
        statistic.

        :param rank: pandas.Series
            Values indexed by the labels to classify.

        :param true_set: set or None, optional
            Positive labels; defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative labels; defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param min_events: int, optional
            Minimum number of positive labels that must be present.

        :return: float
            ``numpy.nan`` when no labelled entry is present or fewer than
            ``min_events`` positives are found, otherwise the AUC.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        labelled = rank[rank.index.isin(positives.union(negatives))]

        if len(labelled) == 0:
            return np.nan

        is_positive = labelled.index.isin(positives).astype(int)

        if sum(is_positive) < min_events:
            return np.nan

        return roc_auc_score(is_positive, -labelled)
Ejemplo n.º 8
0
    def scale(self, essential=None, non_essential=None, metric=np.median):
        """
        Rescale every column so that the non-essential summary maps to 0 and
        the essential summary maps to -1.

        Each column is transformed as
        ``(value - non_essential) / (non_essential - essential)`` where both
        summaries are ``metric`` applied column-wise to the corresponding
        rows (rows containing missing values are excluded).

        :param essential: set or None, optional
            Row labels of the essential reference; defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param non_essential: set or None, optional
            Row labels of the non-essential reference; defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param metric: callable, optional
            Called as ``metric(frame, axis=0)``.

        :return: the scaled frame.

        :raises AssertionError: if either reference set shares no label with
            the index.
        """
        if essential is None:
            essential = Utils.get_essential_genes(return_series=False)

        if non_essential is None:
            non_essential = Utils.get_non_essential_genes(return_series=False)

        n_ess_found = len(essential.intersection(self.index))
        assert n_ess_found != 0, (
            "DataFrame has no index overlapping with essential list"
        )

        n_ness_found = len(non_essential.intersection(self.index))
        assert n_ness_found != 0, (
            "DataFrame has no index overlapping with non essential list"
        )

        # Column-wise summaries of the two reference sets
        ess_rows = self.reindex(essential).dropna()
        ess_center = metric(ess_rows, axis=0)

        ness_rows = self.reindex(non_essential).dropna()
        ness_center = metric(ness_rows, axis=0)

        centred = self.subtract(ness_center)

        return centred.divide(ness_center - ess_center)
Ejemplo n.º 9
0
    def plot_rearrangements(
        cls,
        brass_bedpe,
        ascat_bed,
        crispy_bed,
        chrm,
        chrm_size=None,
        xlim=None,
        scale=1e6,
        show_legend=True,
        unfold_inversions=False,
        sv_alpha=1.0,
        sv_lw=0.3,
        highlight=None,
        mark_essential=False,
    ):
        """
        Draw a three-panel figure for one chromosome: structural variants
        (top), copy-number segments (middle) and CRISPR fold-changes
        (bottom), sharing the x axis.

        :param brass_bedpe: pandas.DataFrame
            Rearrangements; the columns ``chr1``, ``start1``, ``end1``,
            ``chr2``, ``start2``, ``end2``, ``strand1``, ``strand2`` and
            ``svclass`` are read.

        :param ascat_bed: pandas.DataFrame
            Copy-number segments with ``chr`` and ``copy_number`` columns.
            Rows are unpacked positionally as four values (the first is
            ignored, then start, end, copy-number).

        :param crispy_bed: pandas.DataFrame
            CRISPR values; the columns ``chr``, ``sgrna_start``,
            ``sgrna_end``, ``gene``, ``fold_change``, ``start`` and ``end``
            are read.

        :param chrm: str
            Chromosome to plot; compared against the ``chr*`` columns.

        :param chrm_size: mapping or None, optional
            Chromosome sizes keyed by chromosome; defaults to
            ``Utils.CHR_SIZES_HG19``. Only used to derive ``xlim``.

        :param xlim: tuple or None, optional
            ``(min, max)`` in unscaled coordinates; defaults to the whole
            chromosome.

        :param scale: float, optional
            All positions are divided by this value before plotting (the x
            label reads "Mb").

        :param show_legend: bool, optional
            Attach a legend to each panel.

        :param unfold_inversions: forwarded to ``Utils.svtype``.

        :param sv_alpha: float, optional
            Alpha of the SV arcs and vertical lines.

        :param sv_lw: float, optional
            Line width of the SV arcs and vertical lines.

        :param highlight: iterable or None, optional
            Labels looked up in the index of the chromosome's CRISPR rows
            and drawn with an ``X`` marker.

        :param mark_essential: bool, optional
            Mark the genes returned by ``Utils.get_adam_core_essential()``.

        :return:
            ``(ax1, ax2, ax3)``, or ``(None, None, None)`` (no figure is
            created) when no rearrangement involves ``chrm``.
        """
        # - Define default params
        chrm_size = Utils.CHR_SIZES_HG19 if chrm_size is None else chrm_size

        xlim = (0, chrm_size[chrm]) if xlim is None else xlim

        # - Build data-frames
        # BRASS: keep rearrangements with at least one end on chrm
        brass_ = brass_bedpe[(brass_bedpe["chr1"] == chrm) |
                             (brass_bedpe["chr2"] == chrm)]

        # ASCAT
        ascat_ = ascat_bed.query(f"chr == '{chrm}'")

        # CRISPR: sgRNA location is the midpoint of its start and end
        crispr_ = crispy_bed[crispy_bed["chr"] == chrm]
        crispr_ = crispr_.assign(
            location=crispr_[["sgrna_start", "sgrna_end"]].mean(1))

        # Per-gene mean fold-change and location
        crispr_gene_ = crispr_.groupby("gene")[["fold_change",
                                                "location"]].mean()

        # Nothing to draw without rearrangements on this chromosome
        if brass_.shape[0] == 0:
            return None, None, None

        # - Plot
        f, (ax1, ax2,
            ax3) = plt.subplots(3,
                                1,
                                sharex="all",
                                gridspec_kw={"height_ratios": [1, 2, 2]})

        # Top panel
        ax1.axhline(0.0, lw=0.3, color=cls.PAL_DBGD[0])
        ax1.set_ylim(-1, 1)

        # Middle panel: one horizontal line per copy-number segment
        # (the row label `i` is not used)
        for i, (_, s, e, cn) in ascat_.iterrows():
            ax2.plot(
                (s / scale, e / scale),
                (cn, cn),
                alpha=1.0,
                c=cls.PAL_DBGD[2],
                zorder=3,
                label="ASCAT",
                lw=2,
            )

        # Bottom panel
        ax3.scatter(
            crispr_["location"] / scale,
            crispr_["fold_change"],
            s=1,
            alpha=0.5,
            lw=0,
            c=cls.PAL_DBGD[1],
            label="CRISPR-Cas9",
            zorder=1,
        )
        ax3.axhline(0.0, lw=0.3, color=cls.PAL_DBGD[0])

        # Mean fold-change of every (start, end) segment
        for (s, e), gp_mean in crispr_.groupby(["start",
                                                "end"])["fold_change"]:
            ax3.plot(
                (s / scale, e / scale),
                (gp_mean.mean(), gp_mean.mean()),
                alpha=1.0,
                c=cls.PAL_DBGD[2],
                zorder=3,
                label="Segment mean",
                lw=2,
            )

        if mark_essential:
            ess = Utils.get_adam_core_essential()
            ax3.scatter(
                crispr_gene_.reindex(ess)["location"] / scale,
                crispr_gene_.reindex(ess)["fold_change"],
                s=5,
                marker="x",
                lw=0.3,
                c=cls.PAL_DBGD[1],
                alpha=0.4,
                edgecolors="#fc8d62",
                label="Core-essential",
            )

        # Highlight: one colour per requested label
        if highlight is not None:
            for ic, i in zip(
                    *(sns.color_palette("tab20", n_colors=len(highlight)),
                      highlight)):
                if i in crispr_.index:
                    ax3.scatter(
                        crispr_.loc[i, "location"] / scale,
                        crispr_.loc[i]["fold_change"],
                        s=14,
                        marker="X",
                        lw=0,
                        c=ic,
                        alpha=0.9,
                        label=i,
                    )

        # Structural variants: arcs on the top panel and breakpoint lines
        # across all three panels
        for c1, s1, e1, c2, s2, e2, st1, st2, sv in brass_[[
                "chr1",
                "start1",
                "end1",
                "chr2",
                "start2",
                "end2",
                "strand1",
                "strand2",
                "svclass",
        ]].values:
            stype = Utils.svtype(st1, st2, sv, unfold_inversions)
            stype_col = cls.SV_PALETTE[stype]

            # Tandem duplications are drawn above the other SV types
            zorder = 2 if stype == "tandem-duplication" else 1

            # Breakpoint positions are the midpoints of each end
            x1_mean, x2_mean = np.mean([s1, e1]), np.mean([s2, e2])

            # Plot arc (only when both ends are on the same chromosome);
            # angle 0 draws it above the baseline, 180 below
            if c1 == c2:
                angle = 0 if stype in ["tandem-duplication", "deletion"
                                       ] else 180

                xy = (np.mean([x1_mean, x2_mean]) / scale, 0)

                ax1.add_patch(
                    Arc(
                        xy,
                        (x2_mean - x1_mean) / scale,
                        1.0,
                        angle=angle,
                        theta1=0,
                        theta2=180,
                        edgecolor=stype_col,
                        lw=sv_lw,
                        zorder=zorder,
                        alpha=sv_alpha,
                    ))

            # Plot segments: a vertical line at each breakpoint that is on
            # chrm and inside xlim, on every panel
            for ymin, ymax, ax in [(-1, 0.5, ax1), (-1, 1, ax2), (0, 1, ax3)]:
                if (c1 == chrm) and (xlim[0] <= x1_mean <= xlim[1]):
                    ax.axvline(
                        x=x1_mean / scale,
                        ymin=ymin,
                        ymax=ymax,
                        c=stype_col,
                        linewidth=sv_lw,
                        zorder=zorder,
                        clip_on=False,
                        label=stype,
                        alpha=sv_alpha,
                    )

                if (c2 == chrm) and (xlim[0] <= x2_mean <= xlim[1]):
                    ax.axvline(
                        x=x2_mean / scale,
                        ymin=ymin,
                        ymax=ymax,
                        c=stype_col,
                        linewidth=sv_lw,
                        zorder=zorder,
                        clip_on=False,
                        label=stype,
                        alpha=sv_alpha,
                    )

            # Translocation label: name the partner chromosome
            if stype == "translocation":
                if (c1 == chrm) and (xlim[0] <= x1_mean <= xlim[1]):
                    ax1.text(
                        x1_mean / scale,
                        0,
                        " to {}".format(c2),
                        color=stype_col,
                        ha="center",
                        fontsize=5,
                        rotation=90,
                        va="bottom",
                    )

                if (c2 == chrm) and (xlim[0] <= x2_mean <= xlim[1]):
                    ax1.text(
                        x2_mean / scale,
                        0,
                        " to {}".format(c1),
                        color=stype_col,
                        ha="center",
                        fontsize=5,
                        rotation=90,
                        va="bottom",
                    )

        # Legends: the dicts de-duplicate repeated labels
        if show_legend:
            # SV-type legend shown on the top panel; the handles are read
            # from ax2, which received the same labelled breakpoint lines
            by_label = {
                l.capitalize(): p
                for p, l in zip(*(ax2.get_legend_handles_labels()))
                if l in cls.SV_PALETTE
            }
            ax1.legend(
                by_label.values(),
                by_label.keys(),
                loc="center left",
                bbox_to_anchor=(1.02, 0.5),
                prop={"size": 6},
                frameon=False,
            )

            # Middle panel: every label that is not an SV type
            by_label = {
                l: p
                for p, l in zip(*(ax2.get_legend_handles_labels()))
                if l not in cls.SV_PALETTE
            }
            ax2.legend(
                by_label.values(),
                by_label.keys(),
                loc="center left",
                bbox_to_anchor=(1.02, 0.5),
                prop={"size": 6},
                frameon=False,
            )

            # Bottom panel: every label that is not an SV type
            by_label = {
                l: p
                for p, l in zip(*(ax3.get_legend_handles_labels()))
                if l not in cls.SV_PALETTE
            }
            ax3.legend(
                by_label.values(),
                by_label.keys(),
                loc="center left",
                bbox_to_anchor=(1.02, 0.5),
                prop={"size": 6},
                frameon=False,
            )

        # Hide the frame and ticks of the SV panel
        ax1.axis("off")

        # Copy-number axis upper limit follows the 99.99th percentile
        ax2.set_ylim(0, np.ceil(ascat_["copy_number"].quantile(0.9999) + 0.5))

        # Tick spacing
        ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=2.0))
        ax3.yaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

        # Tick label size
        ax2.tick_params(axis="both", which="major", labelsize=6)
        ax3.tick_params(axis="both", which="major", labelsize=6)

        # Axis labels
        ax1.set_ylabel("SV")
        ax2.set_ylabel("Copy-number", fontsize=7)
        ax3.set_ylabel("Loss of fitness", fontsize=7)

        # NOTE(review): plt.xlabel / plt.xlim act on the current axes,
        # presumably ax3 as the last subplot created - confirm
        plt.xlabel("Position on chromosome {} (Mb)".format(
            chrm.replace("chr", "")))

        # The x axis is shared, so the limits apply to all panels
        plt.xlim(xlim[0] / scale, xlim[1] / scale)

        return ax1, ax2, ax3
Ejemplo n.º 10
0
    def plot_chromosome(
        cls,
        crispy_bed,
        ascat_bed,
        chrm,
        y_var="fold_change",
        highlight=None,
        ax=None,
        legend=False,
        scale=1e6,
        tick_base=1,
        legend_size=5,
    ):
        """
        Plot CRISPR values, their segment means and the copy-number segments
        of one chromosome on a single axes.

        :param crispy_bed: pandas.DataFrame
            CRISPR values; the columns ``Chr``, ``sgRNA_Start``,
            ``sgRNA_End``, ``Start``, ``End``, ``gene`` and ``y_var`` are
            read.

        :param ascat_bed: pandas.DataFrame
            Copy-number segments with ``Chr``, ``Start``, ``End`` and
            ``copy_number`` columns.

        :param chrm: str
            Chromosome to plot; compared against the ``Chr`` columns and
            passed to ``Utils.get_cytobands``.

        :param y_var: str, optional
            Column of ``crispy_bed`` plotted on the y axis.

        :param highlight: iterable or None, optional
            Gene names drawn with an ``X`` marker at their mean location and
            mean ``y_var``; names absent from the chromosome are skipped.

        :param ax: matplotlib axes or None, optional
            Axes to draw on; defaults to the current axes.

        :param legend: bool, optional
            Attach a de-duplicated legend to ``ax``.

        :param scale: float, optional
            All positions are divided by this value before plotting.

        :param tick_base: float, optional
            Spacing of the major y ticks.

        :param legend_size: int, optional
            Legend font size.

        :return: the axes that was drawn on.
        """

        if ax is None:
            ax = plt.gca()

        # - Build data-frames
        # ASCAT
        ascat_ = ascat_bed.query(f"Chr == '{chrm}'")

        # CRISPR: sgRNA location is the midpoint of its start and end
        crispr_ = crispy_bed[crispy_bed["Chr"] == chrm]
        crispr_ = crispr_.assign(
            location=crispr_[["sgRNA_Start", "sgRNA_End"]].mean(1))

        crispr_gene_ = crispr_.groupby("gene")[[y_var, "location"]].mean()

        # Plot original values
        ax.scatter(
            crispr_["location"] / scale,
            crispr_[y_var],
            s=6,
            marker=".",
            lw=0,
            c=cls.PAL_DBGD[1],
            alpha=0.4,
            label="CRISPR-Cas9",
        )

        # Segment mean
        for (s, e), gp_mean in crispr_.groupby(["Start", "End"])[y_var]:
            ax.plot(
                (s / scale, e / scale),
                (gp_mean.mean(), gp_mean.mean()),
                alpha=1.0,
                c=cls.PAL_DBGD[2],
                zorder=3,
                label="CRISPR-Cas9 segment mean",
                lw=2,
            )

        # Plot segments
        for s, e, cn in ascat_[["Start", "End", "copy_number"]].values:
            ax.plot(
                (s / scale, e / scale),
                (cn, cn),
                alpha=1.0,
                c=cls.PAL_DBGD[0],
                zorder=3,
                label="Copy-number segment",
                lw=2,
            )

        # Highlight: one colour per requested gene
        if highlight is not None:
            palette = sns.color_palette("tab20", n_colors=len(highlight))
            for ic, i in zip(palette, highlight):
                if i in crispr_gene_.index:
                    ax.scatter(
                        crispr_gene_["location"].loc[i] / scale,
                        crispr_gene_[y_var].loc[i],
                        s=14,
                        marker="X",
                        lw=0,
                        c=ic,
                        alpha=0.9,
                        label=i,
                    )

        # Misc
        ax.axhline(0, lw=0.3, ls="-", color="black")

        # Cytobands: thin lines at "acen" bands, shading on every other band
        cytobands = Utils.get_cytobands(chrm=chrm)

        for i, (s, e, t) in enumerate(cytobands[["Start", "End",
                                                 "band"]].values):
            if t == "acen":
                ax.axvline(s / scale,
                           lw=0.2,
                           ls="-",
                           color=cls.PAL_DBGD[0],
                           alpha=0.1)
                ax.axvline(e / scale,
                           lw=0.2,
                           ls="-",
                           color=cls.PAL_DBGD[0],
                           alpha=0.1)

            elif not i % 2:
                ax.axvspan(s / scale,
                           e / scale,
                           alpha=0.1,
                           facecolor=cls.PAL_DBGD[0])

        # Legend
        if legend:
            # Collect the handles from the axes that was drawn on (not from
            # whatever the current axes is); the ordered dict keeps one
            # handle per label
            handles, labels = ax.get_legend_handles_labels()
            by_label = OrderedDict(zip(labels, handles))
            ax.legend(
                by_label.values(),
                by_label.keys(),
                loc="center left",
                bbox_to_anchor=(1, 0.5),
                prop={"size": legend_size},
                frameon=False,
            )

        ax.set_xlim(crispr_["Start"].min() / scale,
                    crispr_["End"].max() / scale)

        ax.tick_params(axis="both", which="major", labelsize=5)

        ax.yaxis.set_major_locator(plticker.MultipleLocator(base=tick_base))

        return ax
Ejemplo n.º 11
0
def define_controls(
    n_genes=3,
    cancer_type="Colorectal Carcinoma",
    cn_min=1,
    cn_max=5,
    crisp_min=-0.10,
    jacks_thres=0.25,
    offtarget=None,
):
    """
    Select control sgRNAs from non-essential genes.

    Candidate sgRNAs are picked with ``gselection.select_sgrnas`` (two per
    gene, ``KosukeYusa`` library only), filtered on their fold-change
    distribution and on the copy-number of their gene across the samples of
    ``cancer_type``; the ``n_genes`` genes with the highest mean minimum
    fold-change are kept.

    :param n_genes: int, optional
        Number of control genes to return guides for.

    :param cancer_type: str, optional
        Value of the sample sheet ``cancer_type`` column used to choose the
        samples.

    :param cn_min: number, optional
        Lowest copy-number allowed across the samples.

    :param cn_max: number, optional
        Highest copy-number allowed across the samples.

    :param crisp_min: float, optional
        Lower bound on the 25% quantile of an sgRNA's fold-changes.

    :param jacks_thres: forwarded to ``gselection.select_sgrnas``.

    :param offtarget: list or None, optional
        Forwarded to ``gselection.select_sgrnas``; None means ``[1, 0, 0]``.

    :return: pandas.DataFrame
        Rows of ``gselection.masterlib`` for the selected guides, restricted
        to ``LIB_COLUMNS``, with ``Confidence`` set to ``"Control"`` and
        sorted by ``Approved_Symbol``.
    """
    # A fresh list per call instead of a shared mutable default
    if offtarget is None:
        offtarget = [1, 0, 0]

    # Samples
    samples = set(DataImporter.Sample().samplesheet.query(
        f"cancer_type == '{cancer_type}'").index)

    # Non-essential genes
    ness = Utils.get_non_essential_genes(return_series=False)
    ness = ness - set(Utils.get_sanger_essential()["Gene"])

    # Non-essential genes sgRNAs
    ness_sgrnas = pd.concat(
        [
            gselection.select_sgrnas(
                g, 2, jacks_thres=jacks_thres,
                offtarget=offtarget).assign(gene=g) for g in ness
        ],
        ignore_index=True,
    ).query("Library == 'KosukeYusa'")

    ness_sgrnas_fc = project_score_data(ness_sgrnas["sgRNA_ID"], samples)

    # Per-sgRNA summary statistics across samples
    ness_sgrnas_fc_ds = ness_sgrnas_fc.T.describe().T.dropna()
    # Copy so the column assignment below does not write into a slice
    ness_sgrnas_fc_ds = ness_sgrnas_fc_ds[
        ness_sgrnas_fc_ds["25%"] >= crisp_min].copy()
    ness_sgrnas_fc_ds["Approved_Symbol"] = (
        ness_sgrnas.set_index("sgRNA_ID").loc[ness_sgrnas_fc_ds.index,
                                              "Approved_Symbol"].values)

    ness_sgrnas = ness_sgrnas.set_index("sgRNA_ID").loc[
        ness_sgrnas_fc_ds.index]

    # Import different levels of information
    ddir = pkg_resources.resource_filename("crispy", "data/")

    cn = DataImporter.CopyNumber(
        f"{ddir}/copy_number/cnv_abs_copy_number_picnic_20191101.csv.gz"
    ).filter(subset=samples)

    hgnc = pd.read_csv(f"{DPATH}/protein-coding_gene.txt",
                       sep="\t",
                       index_col=1)

    # Control genes: only genes for which both sgRNAs passed the filters
    controls = ness_sgrnas.groupby("Approved_Symbol")["Library"].count()
    controls = list(controls[controls == 2].index)
    controls = pd.concat(
        [
            cn.reindex(controls).dropna().T.describe().T,
            hgnc.reindex(controls)["location"],
        ],
        axis=1,
        sort=False,
    ).dropna()
    controls = controls.query(f"(min >= {cn_min}) and (max <= {cn_max})")
    controls = controls.reset_index().rename(
        columns={"index": "Approved_Symbol"})
    controls = controls.merge(
        ness_sgrnas_fc_ds.reset_index(),
        on="Approved_Symbol",
        suffixes=("_cn", "_crispr"),
    )

    # Keep the genes with the highest mean minimum fold-change
    control_genes = list(
        controls.groupby("Approved_Symbol")["min_crispr"].mean().sort_values(
            ascending=False)[:n_genes].index)
    # Copy so the column assignment below does not write into a slice
    controls = controls[controls["Approved_Symbol"].isin(control_genes)].copy()
    controls["location"] = hgnc.loc[controls["Approved_Symbol"],
                                    "location"].values

    # NOTE(review): the "sgRNA" column presumably comes from the index name
    # of the fold-change matrix via reset_index() - confirm
    control_guides = gselection.masterlib[
        gselection.masterlib["sgRNA_ID"].isin(
            controls["sgRNA"])].assign(Confidence="Control")[LIB_COLUMNS]

    return control_guides.sort_values("Approved_Symbol")
    lw=0.05,
    col_colors=pd.Series(sample_pal)[plot_df.columns].rename("Library"),
    row_colors=pd.Series(sample_pal)[plot_df.index].rename("Library"),
    cbar_pos=None,
)

plt.savefig(f"{RPATH}/minlibcas9_screens_clustermap_gene_fc.pdf",
            bbox_inches="tight")
plt.close("all")

# Recall gene lists
#

gsets_aucs = {}
for n, gset in [
    ("essential", Utils.get_essential_genes()),
    ("non-essential", Utils.get_non_essential_genes()),
]:
    # Aroc
    plt.figure(figsize=(2, 2), dpi=600)
    ax = plt.gca()
    _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples],
                                          gset,
                                          palette=sample_pal,
                                          legend_prop={"size": 4},
                                          ax=ax)
    plt.title(f"{n} recall curve")
    plt.xlabel("Percent-rank of genes")
    plt.ylabel("Cumulative fraction")
    plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both")
    plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf",
    cnv = cnv_obj.filter(subset=list(prot))
    cnv_norm = np.log2(cnv.divide(prot_obj.ss.loc[cnv.columns, "ploidy"]) + 1)
    LOG.info(f"Copy number: {cnv.shape}")

    # Overlaps
    #
    samples = list(set.intersection(set(prot), set(gexp), set(cnv)))
    genes = list(
        set.intersection(set(prot.index), set(gexp.index), set(cnv.index),
                         set(prot_broad.index)))
    LOG.info(f"Genes: {len(genes)}; Samples: {len(samples)}")

    # Data transformations
    #
    gexp_t = pd.DataFrame(
        {i: Utils.gkn(gexp.loc[i].dropna()).to_dict()
         for i in genes}).T

    ##
    #
    s_corr = pd.DataFrame({
        s1: {
            s2: two_vars_correlation(prot[s1], gexp[s2])["corr"]
            for s2 in samples
        }
        for s1 in samples
    })
    s_corr.to_csv(
        "/Users/Downloads/Proteomics_Transcriptomics_Corr_Matrix.csv")

    # Sample-wise Protein/Gene correlation with CopyNumber - Attenuation
Ejemplo n.º 14
0
        stromal_count != 1].index)]

    # Import proteomics data-sets
    #
    dmatrix, ms_type, ctypes = [], [], []
    for ctype, dfile in CPTAC_DATASETS:
        df = pd.read_csv(f"{CPTAC_DPATH}/linkedomics/{dfile}",
                         sep="\t",
                         index_col=0)

        if "COADREAD" in dfile:
            df = df.replace(0, np.nan)
            df = df.pipe(np.log2)

        df = pd.DataFrame(
            {i: Utils.gkn(df.loc[i].dropna()).to_dict()
             for i in df.index}).T

        # Simplify barcode
        df.columns = [i[:12].replace(".", "-") for i in df]

        # Cancer type
        ctypes.append(pd.Series(ctype, index=df.columns))

        # MS type
        ms_type.append(
            pd.Series("LF" if "COADREAD" in dfile else "TMT",
                      index=df.columns))

        dmatrix.append(df)