Beispiel #1
0
    def h(self, w, i=None, j=None, split='train'):
        '''Objective value at w.

        Each data term has the form ``log(1 + exp(-t)) - (y - 1) * t`` with
        ``t = x.dot(w)``, and every variant adds the L2 penalty
        ``LAMBDA / 2 * sum(w**2)``.
        (Presumably the logistic loss for labels in {0, 1} -- confirm.)

        What is returned depends on ``i`` and ``j``:

        * ``i is None``: data term averaged over the rows of the selected
          split (``X_train``/``Y_train`` or ``X_test``/``Y_test``).
        * ``i`` given, ``j is None``: data term summed over ``self.X[i]`` /
          ``self.Y[i]`` (the data of machine ``i``) and divided by ``self.m``.
        * ``i`` and ``j`` given: data term of sample ``j`` on machine ``i``.

        ``split`` only matters for the ``i is None`` case; the per-machine
        and per-sample variants always read ``self.X`` / ``self.Y``.

        Raises:
            ValueError: if ``i is None`` and ``split`` is neither ``'train'``
                nor ``'test'``.
        '''

        if split == 'train':
            X, Y = self.X_train, self.Y_train
        elif split == 'test':
            if w.ndim > 1 or i is not None or j is not None:
                # Reported through the logger only; whether this aborts
                # depends on how `log` is configured (not visible here).
                log.fatal(
                    "Function value on test set only applies to one parameter vector"
                )
            X, Y = self.X_test, self.Y_test
        elif i is None:
            # Only the full-data branch needs X/Y. Without this guard an
            # unknown split surfaced as an UnboundLocalError further down.
            raise ValueError(
                "split must be 'train' or 'test', got {!r}".format(split))

        # L2 penalty shared by all three variants.
        penalty = xp.sum(w**2) * self.LAMBDA / 2

        if i is None:  # Function value over the whole selected split
            tmp = X.dot(w)
            data_term = xp.sum((Y - 1) * tmp - xp.log1p(xp.exp(-tmp)))
            return -data_term / X.shape[0] + penalty

        if j is None:  # Function value on machine i
            tmp = self.X[i].dot(w)
            data_term = xp.sum(
                (self.Y[i] - 1) * tmp - xp.log1p(xp.exp(-tmp)))
            return -data_term / self.m + penalty

        # Function value of sample j on machine i
        tmp = self.X[i][j].dot(w)
        return -((self.Y[i][j] - 1) * tmp -
                 xp.log1p(xp.exp(-tmp))) + penalty
def log1p(x: Array, /) -> Array:
    """
    Array API compatible wrapper for :py:func:`np.log1p <numpy.log1p>`.

    Only inputs whose dtype is one of ``_floating_dtypes`` are accepted; any
    other dtype raises ``TypeError``. The result is computed by NumPy on the
    wrapped ``x._array`` and re-wrapped with ``Array._new``.

    See the NumPy docstring for more information.
    """
    if x.dtype in _floating_dtypes:
        return Array._new(np.log1p(x._array))
    raise TypeError("Only floating-point dtypes are allowed in log1p")
 def _truncnorm_ppf(self, q, N):
     """Column-wise percent-point values for the truncated normal.

     ``q`` is indexed as ``q[:, mask]``, so it is treated as 2-D with one
     column per entry of ``self.a``. ``N`` is only forwarded to
     ``self._truncnorm_get_delta``. The result has the shape of ``q``;
     columns that match none of the cases below stay at zero. Finally,
     every entry that falls below ``self.a`` is replaced by ``self.a``.

     Cases, selected per column:

     * ``delta > 0`` and ``a > 0``: ``-ndtri((1 - q) * sf(a))``
     * ``delta > 0`` and ``a <= 0``: ``ndtri(q + cdf(a) * (1 - q))``
     * ``delta <= 0`` and ``b`` infinite: log-space form using ``logsf(a)``
     * ``delta <= 0`` and ``a`` infinite: log-space form using ``logcdf(b)``

     NOTE(review): ``delta`` is presumably the probability mass between the
     truncation bounds -- confirm against ``_truncnorm_get_delta``.
     NOTE(review): the two ``delta > 0`` formulas contain no ``b`` term,
     which looks like they assume an infinite upper bound -- confirm.
     """
     out = cp.zeros(cp.shape(q))
     delta = self._truncnorm_get_delta(N)
     positive_delta = delta > 0
     lower_pos = positive_delta & (self.a > 0)
     lower_nonpos = positive_delta & (self.a <= 0)

     # Guard on the mask that is actually used in the branch, and read the
     # lower bound from self.a (the bare name `a` was undefined here).
     if cp.any(lower_pos):
         sa = self.norm_sf(self.a[lower_pos])
         # NOTE(review): this branch calls self.ndtri while the next one
         # calls self._ndtri -- confirm which attribute the class defines.
         out[:, lower_pos] = -self.ndtri((1 - q[:, lower_pos]) * sa)
     if cp.any(lower_nonpos):
         # NOTE(review): norm_cdf is a bare name, unlike self.norm_sf and
         # friends -- confirm it is a module-level function.
         na = norm_cdf(self.a[lower_nonpos])
         out[:, lower_nonpos] = self._ndtri(
             q[:, lower_nonpos] + na * (1.0 - q[:, lower_nonpos]))

     upper_inf = ~positive_delta & cp.isinf(self.b)
     lower_inf = ~positive_delta & cp.isinf(self.a)
     if cp.any(upper_inf):
         out[:, upper_inf] = -self._norm_ilogcdf(
             cp.log1p(-q[:, upper_inf]) + self.norm_logsf(self.a[upper_inf]))
     if cp.any(lower_inf):
         # Restrict q and b to the selected columns, as the branch above
         # does for a; the full-width expression only fitted the target
         # slice when every column was selected. broadcast_to lets self.b
         # be either a scalar or a per-column array.
         b_cols = cp.broadcast_to(
             cp.asarray(self.b), lower_inf.shape)[lower_inf]
         out[:, lower_inf] = self._norm_ilogcdf(
             cp.log(q[:, lower_inf]) + self.norm_logcdf(b_cols))

     # Clamp anything that ended up below the lower bound back onto it.
     below_lower = out < self.a
     if cp.any(below_lower):
         out[below_lower] = (below_lower * self.a)[below_lower]
     return out
Beispiel #4
0
def filter_cells(adata: AnnData,
                 device="cpu",
                 p_level=None,
                 subset=True,
                 plot=False,
                 copy=False):
    """\
    Filter cells based on the gene/molecule relationship.

    A robust linear model (statsmodels ``RLM``) of
    ``log1p(number of detected genes)`` on ``log1p(total counts)`` is fitted
    over all cells. Cells whose gene count lies outside the band
    ``fit + se * t-quantile`` (lower and upper quantile derived from
    `p_level`) are flagged as outliers.

    Code has been translated from pagoda2 R function gene.vs.molecule.cell.filter.


    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend, used for cell filtering (default=min(1e-3,1/adata.shape[0]))
    subset
        if False, add a column `outlier` in adata.obs, otherwise subset the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------

    adata : anndata.AnnData
        Returned only if `copy=True`, otherwise `None` is returned and
        `adata` is modified in place. With `subset=True` the outlier cells
        are removed; with `subset=False` the following field is added:

        `.obs['outlier']`
            whether a cell is an outlier.

    Raises
    ------
    ValueError
        If `device` is neither `"cpu"` nor `"gpu"`.

    """

    # Fail early: an unknown device used to skip both counting branches and
    # crash later with an UnboundLocalError.
    if device not in ("cpu", "gpu"):
        raise ValueError(f"device must be 'cpu' or 'gpu', got {device!r}")

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    # Work on a copy because the stored values are overwritten with ones
    # below. NOTE(review): relies on `.data`, i.e. assumes adata.X is a
    # sparse matrix -- confirm.
    X = adata.X.copy()

    logg.info("    obtaining gene and molecule counts")
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        # Replace every stored value by 1 so the row sum counts entries.
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    else:  # device == "gpu"
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info("    fitting RLM")

    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()
    # In-sample fitted values; evaluated once and reused below.
    fitted = rlm_model.predict()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    SSE_line = ((df.log1p_n_genes_by_counts - fitted)**2).sum()
    MSE = SSE_line / df.shape[0]
    # Lower and upper t quantiles for a two-sided band at level p_level.
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    # get_SE writes its result into the preallocated `se` array.
    se = np.zeros(df.shape[0])
    get_SE(MSE, df.log1p_total_counts.values, se)
    pr = pd.DataFrame(
        {
            0: fitted,
            1: fitted + se * z[0],
            2: fitted + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts <
               pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        fig, ax = plt.subplots()
        idx = df.sort_values("log1p_total_counts").index
        # Shade the band between the cells with the smallest and largest
        # total counts.
        ends = [idx[0], idx[-1]]
        ax.fill_between(
            df.log1p_total_counts[ends],
            pr[1][ends],
            pr[2][ends],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts",
                                      y="log1p_n_genes_by_counts",
                                      c="k",
                                      ax=ax,
                                      s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts",
                                     y="log1p_n_genes_by_counts",
                                     c="grey",
                                     ax=ax,
                                     s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")

    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n"
                  "    .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None