def apply_qc_filters(unidata: UnimodalData):
    """Filter out low quality cells using the ``"passed_qc"`` column of ``unidata.obs``.

    Nothing happens when ``unidata.obs`` has no ``"passed_qc"`` column.
    Otherwise ``unidata`` is subset in place with that column and the
    following clean-up is performed:

    * ``"passed_qc"`` is dropped from ``unidata.obs``. When
      ``unidata.uns["__del_demux_type"]`` is truthy, ``"demux_type"`` is
      dropped as well and, if present, ``"assignment"`` is rebuilt as a
      categorical that only holds categories with at least one element.
    * ``unidata.obsm`` and ``unidata.varm`` are emptied.
    * Every entry of ``unidata.uns`` except ``"genome"``, ``"modality"``,
      ``"norm_count"`` and ``"df_qcplot"`` is deleted.

    Parameters
    ----------
    unidata: ``UnimodalData``
        Data object to filter; it is modified in place.

    Returns
    -------
    ``None``
    """
    if "passed_qc" not in unidata.obs:
        return

    prior_n = unidata.shape[0]
    unidata._inplace_subset_obs(unidata.obs["passed_qc"])

    cols = ["passed_qc"]
    if unidata.uns.get("__del_demux_type", False):
        cols.append("demux_type")
        if "assignment" in unidata.obs:
            # Rebuild the categorical so that categories left without any
            # element no longer appear.
            assignment = unidata.obs["assignment"]
            per_category = assignment.value_counts(sort=False)
            observed = per_category[per_category > 0].index.astype(str)
            unidata.obs["assignment"] = pd.Categorical(assignment, categories=observed)
        # The "__del_demux_type" flag is not deleted here; the uns clean-up
        # below removes it because it is not one of the retained keys.
    unidata.obs.drop(columns=cols, inplace=True)

    for attr in ("obsm", "varm"):
        mapping = getattr(unidata, attr)
        if len(mapping) > 0:
            mapping.clear()

    retained_keys = {'genome', 'modality', 'norm_count', 'df_qcplot'}
    # Collect first, delete afterwards: uns must not change size while iterated.
    stale_keys = [key for key in unidata.uns if key not in retained_keys]
    for key in stale_keys:
        del unidata.uns[key]

    logger.info(
        f"After filtration, {unidata.shape[0]} out of {prior_n} cell barcodes are kept in UnimodalData object {unidata.get_uid()}."
    )
def estimate_background_probs(hashing_data: UnimodalData,
                              random_state: int = 0) -> None:
    """For cell-hashing data, estimate antibody background probability using KMeans algorithm.

    The total count of each cell barcode is log10-transformed and the barcodes
    are split into two clusters with KMeans. The cluster with the larger center
    is labeled ``"signal"``, the other one ``"background"``. The background
    probability of an antibody is its share of all counts summed over the
    background barcodes.

    Parameters
    ----------
    hashing_data: ``UnimodalData``
        Annotated data matrix for antibody.

    random_state: ``int``, optional, default: ``0``
        Random seed passed to KMeans for reproducing results.

    Returns
    -------
    ``None``

    Update ``hashing_data``:
        * Cell barcodes with 0 total counts are removed.
        * Antibody barcodes whose background probability is not positive are removed.

    Update ``hashing_data.obs``:
        * ``hashing_data.obs["counts"]``: total count of each cell barcode.
        * ``hashing_data.obs["hto_type"]``: ``"signal"`` or ``"background"``.

    Update ``hashing_data.uns``:
        * ``hashing_data.uns["background_probs"]``: estimated antibody background probability.

    Example
    -------
    >>> estimate_background_probs(hashing_data)
    """
    hashing_data.obs["counts"] = hashing_data.X.sum(axis=1).A1

    # Barcodes without any count have to go before the log10 transform below.
    zero_mask = hashing_data.obs["counts"] == 0
    ncell_zero = zero_mask.sum()
    if ncell_zero > 0:
        logger.warning(
            f"Detected {ncell_zero} cell barcodes with 0 hashtag counts, which are removed from the hashing data object."
        )
        hashing_data._inplace_subset_obs(~zero_mask)

    # Two-cluster KMeans on a single feature: log10 of the total count.
    log_counts = np.log10(hashing_data.obs["counts"].values.reshape(-1, 1))
    clusterer = KMeans(n_clusters=2, random_state=random_state)
    clusterer.fit(log_counts)

    # The cluster with the larger center is the signal cluster.
    centers = clusterer.cluster_centers_
    if centers[0] > centers[1]:
        signal_label = 0
    else:
        signal_label = 1

    hashing_data.obs["hto_type"] = "background"
    hashing_data.obs.loc[clusterer.labels_ == signal_label, "hto_type"] = "signal"

    # Per-antibody share of the counts observed in background barcodes.
    is_background = (hashing_data.obs["hto_type"] == "background").values
    background_totals = hashing_data.X[is_background, :].sum(axis=0).A1
    back_probs = background_totals / background_totals.sum()

    idx = back_probs <= 0.0
    if idx.sum() > 0:
        logger.warning(
            f"Detected {idx.sum()} antibody barcodes {','.join(hashing_data.var_names[idx])} with 0 counts in the background! These barcodes are likely not in the experiment and thus removed."
        )
        hashing_data._inplace_subset_var(~idx)
        back_probs = back_probs[~idx]

    hashing_data.uns["background_probs"] = back_probs
    logger.info("Background probability distribution is estimated.")
Example #3
0
def apply_qc_filters(unidata: UnimodalData):
    """Filter out low quality cells using the ``"passed_qc"`` column of ``unidata.obs``.

    Nothing happens when ``unidata.obs`` has no ``"passed_qc"`` column.
    Otherwise ``unidata`` is subset in place with that column, which is then
    dropped from ``unidata.obs``. When ``unidata.uns["__del_demux_type"]`` is
    truthy, that flag is deleted from ``unidata.uns`` and the ``"demux_type"``
    column is dropped as well.

    Parameters
    ----------
    unidata: ``UnimodalData``
        Data object to filter; it is modified in place.

    Returns
    -------
    ``None``
    """
    if "passed_qc" not in unidata.obs:
        return

    prior_n = unidata.shape[0]
    unidata._inplace_subset_obs(unidata.obs["passed_qc"])

    drop_demux_type = unidata.uns.get("__del_demux_type", False)
    if drop_demux_type:
        # The flag is removed only when it is set and truthy.
        del unidata.uns["__del_demux_type"]
        cols = ["passed_qc", "demux_type"]
    else:
        cols = ["passed_qc"]
    unidata.obs.drop(columns=cols, inplace=True)

    logger.info(
        f"After filtration, {unidata.shape[0]} out of {prior_n} cell barcodes are kept in UnimodalData object {unidata.get_uid()}."
    )