def h(self, w, i=None, j=None, split='train'):
    '''Regularised logistic loss evaluated at ``w``.

    Which data the loss is computed on depends on ``i`` and ``j``:

    * ``i is None``: loss over the whole ``split`` ('train' or 'test'),
      averaged over ``X.shape[0]`` rows.
    * ``i`` given, ``j is None``: loss over ``self.X[i]`` / ``self.Y[i]``
      (the data of machine ``i``), divided by ``self.m``.
    * both given: loss of sample ``j`` of machine ``i`` (no averaging).

    Every variant adds the L2 penalty ``self.LAMBDA / 2 * sum(w**2)``.

    Parameters
    ----------
    w : array
        Parameter vector.
    i : int, optional
        Machine index.
    j : int, optional
        Sample index inside machine ``i``.
    split : {'train', 'test'}
        Data split used when ``i is None``.  With 'test', ``log.fatal`` is
        invoked if ``w.ndim > 1`` or if ``i``/``j`` is given.

    Raises
    ------
    ValueError
        If ``i is None`` and ``split`` is neither 'train' nor 'test'.
    '''
    if split == 'train':
        X = self.X_train
        Y = self.Y_train
    elif split == 'test':
        if w.ndim > 1 or i is not None or j is not None:
            log.fatal(
                "Function value on test set only applies to one parameter vector"
            )
        X = self.X_test
        Y = self.Y_test
    elif i is None:
        # Without this guard the code below fails with an opaque
        # UnboundLocalError on X.  The per-machine / per-sample paths never
        # read X or Y, so they keep accepting any `split` value as before.
        raise ValueError(
            "split must be 'train' or 'test', got {!r}".format(split)
        )

    l2_penalty = xp.sum(w**2) * self.LAMBDA / 2

    # log(1 + exp(-t)) is evaluated as logaddexp(0, -t): the naive
    # log1p(exp(-t)) overflows to inf once -t exceeds ~709 in float64.
    if i is None:
        # Function value over the selected split.
        tmp = X.dot(w)
        return -xp.sum(
            (Y - 1) * tmp - xp.logaddexp(0, -tmp)) / X.shape[0] + l2_penalty
    elif j is None:
        # Function value in machine i.
        tmp = self.X[i].dot(w)
        return -xp.sum(
            (self.Y[i] - 1) * tmp - xp.logaddexp(0, -tmp)) / self.m + l2_penalty
    else:
        # Function value of sample j in machine i.
        tmp = self.X[i][j].dot(w)
        return -((self.Y[i][j] - 1) * tmp - xp.logaddexp(0, -tmp)) + l2_penalty
def log1p(x: Array, /) -> Array:
    """
    Array API compatible wrapper for :py:func:`np.log1p <numpy.log1p>`.

    Only arrays whose dtype is listed in ``_floating_dtypes`` are accepted;
    any other dtype raises ``TypeError``.  See the NumPy docstring for the
    numerical details.
    """
    dtype_is_floating = x.dtype in _floating_dtypes
    if dtype_is_floating:
        raw_result = np.log1p(x._array)
        return Array._new(raw_result)
    raise TypeError("Only floating-point dtypes are allowed in log1p")
def _truncnorm_ppf(self, q, N):
    """Percent-point function (inverse CDF) of the truncated normal.

    ``q`` is indexed as ``q[:, mask]`` throughout, so it is expected to be
    2-D with the per-distribution axis last; ``self.a`` / ``self.b`` are the
    truncation bounds along that axis.  ``N`` is only forwarded to
    ``self._truncnorm_get_delta``.

    The formula used per column depends on ``delta`` and the bounds:

    * ``delta > 0`` and ``a > 0``  -> survival-function form,
    * ``delta > 0`` and ``a <= 0`` -> CDF form,
    * ``delta <= 0`` and ``b`` infinite -> log-space form using ``a``,
    * ``delta <= 0`` and ``a`` infinite -> log-space form using ``b``.

    Finally every entry that ended up below ``self.a`` is clamped to it.

    Returns an array with the shape of ``q``.
    """
    out = cp.zeros(cp.shape(q))
    delta = self._truncnorm_get_delta(N)

    cond1 = delta > 0
    cond2 = cond1 & (self.a > 0)
    cond21 = cond1 & (self.a <= 0)

    # Guarded by cond2 (not cond1): skipped when no column needs this form.
    if cp.any(cond2):
        # Was `a[cond2]` with a bare, undefined `a`; every other branch
        # reads the lower bound from `self.a`.
        sa = self.norm_sf(self.a[cond2])
        # NOTE(review): this branch calls `self.ndtri` while the next one
        # calls `self._ndtri` -- confirm which attribute is intended.
        out[:, cond2] = -self.ndtri((1 - q[:, cond2]) * sa)
    if cp.any(cond21):
        # NOTE(review): `norm_cdf` is a bare name here whereas the sibling
        # helpers are accessed through `self` -- confirm it exists at
        # module level.
        na = norm_cdf(self.a[cond21])
        out[:, cond21] = self._ndtri(q[:, cond21] + na * (1.0 - q[:, cond21]))

    cond3 = ~cond1 & cp.isinf(self.b)
    cond4 = ~cond1 & cp.isinf(self.a)
    if cp.any(cond3):
        out[:, cond3] = -self._norm_ilogcdf(
            cp.log1p(-q[:, cond3]) + self.norm_logsf(self.a[cond3]))
    if cp.any(cond4):
        # Restrict both operands to the cond4 columns; assigning the
        # full-width result into `out[:, cond4]` only worked when every
        # column was selected.  broadcast_to keeps a scalar `self.b`
        # working as it did in that all-selected case.
        upper = cp.broadcast_to(cp.asarray(self.b), cond4.shape)[cond4]
        out[:, cond4] = self._norm_ilogcdf(
            cp.log(q[:, cond4]) + self.norm_logcdf(upper))

    # Clamp anything that fell below the lower truncation bound.
    cond5 = out < self.a
    if cp.any(cond5):
        out[cond5] = (cond5 * self.a)[cond5]
    return out
def filter_cells(adata: AnnData, device="cpu", p_level=None, subset=True, plot=False, copy=False):
    """\
    Filter cells based on the gene/molecule relationship.

    Code has been translated from pagoda2 R function
    gene.vs.molecule.cell.filter.

    A robust linear model is fitted on log1p(number of detected genes) as a
    function of log1p(total counts); cells falling outside the prediction
    band at confidence `p_level` are flagged as outliers.

    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend,
        used for cell filtering (default=min(1e-3,1/adata.shape[0]))
    subset
        if False, add a column `outlier` in adata.obs, otherwise subset
        the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        Returned only if `copy=True`; otherwise `None` is returned and
        `adata` is modified in place. With `subset=True` the outlier cells
        are removed, else the following field is added:

        `.obs['outlier']`
            whether a cell is an outlier.

    Raises
    ------
    ValueError
        If `device` is neither `'cpu'` nor `'gpu'`.
    """
    # Fail early with a clear message: an unknown device would otherwise
    # leave the count arrays undefined and surface as an UnboundLocalError.
    if device not in ("cpu", "gpu"):
        raise ValueError(f"device must be 'cpu' or 'gpu', got {device!r}")

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    X = adata.X.copy()

    logg.info(" obtaining gene and molecule counts")
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        # Replace every stored value by 1 so that row sums count entries
        # (detected genes) instead of molecules.
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    elif device == "gpu":
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info(" fitting RLM")
    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    # In-sample fitted values; computed once and reused below.
    fitted = rlm_model.predict()

    SSE_line = ((df.log1p_n_genes_by_counts - fitted) ** 2).sum()
    MSE = SSE_line / df.shape[0]

    # Two-sided t quantiles bounding the prediction band.
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    # get_SE is defined elsewhere; `se` is read right after the call without
    # being reassigned, so it is expected to be filled in place.
    se = np.zeros(df.shape[0])
    get_SE(MSE, df.log1p_total_counts.values, se)

    # Column 0: fit, column 1: lower bound, column 2: upper bound.
    pr = pd.DataFrame(
        {
            0: fitted,
            1: fitted + se * z[0],
            2: fitted + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts < pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        fig, ax = plt.subplots()
        # The band is drawn between the cells with the smallest and the
        # largest total counts.
        idx = df.sort_values("log1p_total_counts").index
        ax.fill_between(
            df.log1p_total_counts[[idx[0], idx[-1]]],
            pr[1][[idx[0], idx[-1]]],
            pr[2][[idx[0], idx[-1]]],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts", y="log1p_n_genes_by_counts", c="k", ax=ax, s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts", y="log1p_n_genes_by_counts", c="grey", ax=ax, s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")
    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n" " .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None