def SCTransform(adata, min_cells=5, gmean_eps=1, n_genes=2000, n_cells=None, bin_size=500, bw_adjust=3, inplace=True):
    """Normalize UMI counts with a Python port of SCTransform (Satija lab).

    See the R package for the original documentation.  Per gene, a regression
    of counts against log10 total UMI per cell is fit in parallel batches, the
    fitted parameters are regularized by kernel regression over gene abundance,
    and clipped Pearson residuals become the corrected expression matrix.
    Currently, only regression against the log UMI counts is supported.  The
    only significant modification relative to the R original is that negative
    Pearson residuals are zero'd out to preserve the sparsity structure of the
    data.

    Parameters
    ----------
    adata
        AnnData-like object; ``adata.X`` holds raw counts (cells x genes).
    min_cells : int
        Genes with total count below this across cells are dropped.
    gmean_eps : float
        Pseudocount forwarded to ``gmean`` for geometric means.
    n_genes : int or None
        Number of genes subsampled (density-weighted) for step-1 parameter
        estimation; all eligible genes are used if None or not smaller.
    n_cells : int or None
        Number of cells subsampled for step 1; all cells if None.
    bin_size : int
        Step-1 genes are regressed in parallel batches of this size.
    bw_adjust : float
        Multiplier on the ISJ bandwidth used for parameter regularization.
    inplace : bool
        If True, replace ``adata.X`` (original kept in ``adata.raw``) and
        annotate ``adata.var`` / ``adata.obs``; otherwise return a new
        AnnData restricted to the retained genes.
    """
    # Work on a sparse CSR copy so the input matrix is never mutated directly.
    X = adata.X.copy()
    X = sp.sparse.csr_matrix(X)
    X.eliminate_zeros()
    gn = np.array(list(adata.var_names))
    cn = np.array(list(adata.obs_names))

    # Filter genes by total count across cells; remember their original
    # column indices (genes_ix) so the inplace branch can map back later.
    genes_cell_count = X.sum(0).A.flatten()
    genes = np.where(genes_cell_count >= min_cells)[0]
    genes_ix = genes.copy()

    X = X[:, genes]
    Xraw = X.copy()  # NOTE(review): unused after this point
    gn = gn[genes]
    genes = np.arange(X.shape[1])  # re-index genes within the filtered matrix
    genes_cell_count = X.sum(0).A.flatten()

    # log10 geometric mean expression per gene (gmean_eps acts as pseudocount).
    genes_log_gmean = np.log10(gmean(X, axis=0, eps=gmean_eps))

    # Step-1 subsampling of cells: parameters are estimated on this subset.
    if n_cells is not None and n_cells < X.shape[0]:
        cells_step1 = np.sort(
            np.random.choice(X.shape[0], replace=False, size=n_cells))
        genes_cell_count_step1 = X[cells_step1].sum(0).A.flatten()
        genes_step1 = np.where(genes_cell_count_step1 >= min_cells)[0]
        genes_log_gmean_step1 = np.log10(
            gmean(X[cells_step1][:, genes_step1], axis=0, eps=gmean_eps))
    else:
        cells_step1 = np.arange(X.shape[0])
        genes_step1 = genes
        genes_log_gmean_step1 = genes_log_gmean

    # Per-cell attributes: total UMI, detected genes, and their log10 forms.
    umi = X.sum(1).A.flatten()
    log_umi = np.log10(umi)
    X2 = X.copy()
    X2.data[:] = 1  # binarize to count detected genes per cell
    gene = X2.sum(1).A.flatten()
    log_gene = np.log10(gene)
    umi_per_gene = umi / gene
    log_umi_per_gene = np.log10(umi_per_gene)

    cell_attrs = pd.DataFrame(index=cn,
                              data=np.vstack(
                                  (umi, log_umi, gene, log_gene, umi_per_gene,
                                   log_umi_per_gene)).T,
                              columns=[
                                  'umi', 'log_umi', 'gene', 'log_gene',
                                  'umi_per_gene', 'log_umi_per_gene'
                              ])
    data_step1 = cell_attrs.iloc[cells_step1]

    # Step-1 subsampling of genes, weighted inversely by the KDE of their
    # abundance so that rare abundance ranges are still represented.
    if n_genes is not None and n_genes < len(genes_step1):
        log_gmean_dens = stats.gaussian_kde(genes_log_gmean_step1,
                                            bw_method='scott')
        xlo = np.linspace(genes_log_gmean_step1.min(),
                          genes_log_gmean_step1.max(), 512)
        ylo = log_gmean_dens.evaluate(xlo)
        xolo = genes_log_gmean_step1
        sampling_prob = 1 / (np.interp(xolo, xlo, ylo) + _EPS)
        genes_step1 = np.sort(
            np.random.choice(genes_step1,
                             size=n_genes,
                             p=sampling_prob / sampling_prob.sum(),
                             replace=False))
        # NOTE(review): unlike the earlier calls, no axis argument is passed
        # to gmean here — relies on gmean's default; confirm it is axis=0.
        genes_log_gmean_step1 = np.log10(
            gmean(X[cells_step1, :][:, genes_step1], eps=gmean_eps))

    # Regress each gene against log_umi in bins of bin_size genes; each bin is
    # dispatched to a process pool that writes results into a Manager dict
    # keyed by gene name (see _parallel_init / _parallel_wrapper).
    bin_ind = np.ceil(np.arange(1, genes_step1.size + 1) / bin_size)
    max_bin = max(bin_ind)

    ps = Manager().dict()

    for i in range(1, int(max_bin) + 1):
        genes_bin_regress = genes_step1[bin_ind == i]
        umi_bin = X[cells_step1, :][:, genes_bin_regress]

        # Design matrix: intercept plus log10 UMI per cell.
        mm = np.vstack((np.ones(data_step1.shape[0]),
                        data_step1['log_umi'].values.flatten())).T

        pc_chunksize = umi_bin.shape[1] // os.cpu_count() + 1
        pool = Pool(os.cpu_count(), _parallel_init,
                    [genes_bin_regress, umi_bin, gn, mm, ps])
        try:
            pool.map(_parallel_wrapper, range(umi_bin.shape[1]),
                     chunksize=pc_chunksize)
        finally:
            pool.close()
            pool.join()

    ps = ps._getvalue()  # materialize the shared dict into a plain dict

    model_pars = pd.DataFrame(data=np.vstack([ps[x] for x in gn[genes_step1]]),
                              columns=['Intercept', 'log_umi', 'theta'],
                              index=gn[genes_step1])

    # Floor theta to avoid division blow-ups in the dispersion transform.
    min_theta = 1e-7
    x = model_pars['theta'].values.copy()
    x[x < min_theta] = min_theta
    model_pars['theta'] = x
    # Reparameterize theta as a dispersion on log10 scale for regularization.
    dispersion_par = np.log10(1 + 10**genes_log_gmean_step1 /
                              model_pars['theta'].values.flatten())

    model_pars = model_pars.iloc[:, model_pars.columns != 'theta'].copy()
    model_pars['dispersion'] = dispersion_par

    # Drop genes whose fitted parameters are outliers w.r.t. gene abundance.
    outliers = np.vstack(([
        is_outlier(model_pars.values[:, i], genes_log_gmean_step1)
        for i in range(model_pars.shape[1])
    ])).sum(0) > 0

    filt = np.invert(outliers)
    model_pars = model_pars[filt]
    genes_step1 = genes_step1[filt]
    genes_log_gmean_step1 = genes_log_gmean_step1[filt]

    # Bandwidth for parameter regularization: ISJ bandwidth scaled by
    # bw_adjust (FFTKDE.evaluate() is called only to populate z.bw).
    z = FFTKDE(kernel='gaussian', bw='ISJ').fit(genes_log_gmean_step1)
    z.evaluate()
    bw = z.bw * bw_adjust

    # Clamp prediction points to the abundance range seen in step 1 so the
    # kernel regression never extrapolates.
    x_points = np.vstack((genes_log_gmean, np.array([min(genes_log_gmean_step1)] *
                                                    genes_log_gmean.size))).max(0)
    x_points = np.vstack(
        (x_points,
         np.array([max(genes_log_gmean_step1)] * genes_log_gmean.size))).min(0)

    # Smooth each fitted parameter over gene abundance with local-linear
    # kernel regression, predicting values for ALL retained genes.
    full_model_pars = pd.DataFrame(data=np.zeros(
        (x_points.size, model_pars.shape[1])),
                                   index=gn,
                                   columns=model_pars.columns)
    for i in model_pars.columns:
        kr = statsmodels.nonparametric.kernel_regression.KernelReg(
            model_pars[i].values,
            genes_log_gmean_step1[:, None], ['c'],
            reg_type='ll',
            bw=[bw])
        full_model_pars[i] = kr.fit(data_predict=x_points)[0]

    # Convert regularized dispersion back to theta.
    theta = 10**genes_log_gmean / (10**full_model_pars['dispersion'].values - 1)
    full_model_pars['theta'] = theta
    del full_model_pars['dispersion']

    model_pars_outliers = outliers  # NOTE(review): unused after assignment

    # NOTE(review): regressor_data is built but never used below.
    regressor_data = np.vstack(
        (np.ones(cell_attrs.shape[0]), cell_attrs['log_umi'].values)).T

    # Pearson residuals computed directly on the nonzero entries:
    # mu = exp(Intercept + log_umi_coef * log_umi), var = mu + mu^2 / theta.
    d = X.data
    x, y = X.nonzero()
    mud = np.exp(full_model_pars.values[:, 0][y] +
                 full_model_pars.values[:, 1][y] *
                 cell_attrs['log_umi'].values[x])
    vard = mud + mud**2 / full_model_pars['theta'].values.flatten()[y]

    X.data[:] = (d - mud) / vard**0.5
    # Zero out negative residuals (sparsity-preserving deviation from R),
    # then clip large residuals at sqrt(n_cells / 30).
    X.data[X.data < 0] = 0
    X.eliminate_zeros()
    clip = np.sqrt(X.shape[0] / 30)
    X.data[X.data > clip] = clip

    if inplace:
        # Keep the raw data, then scatter residuals back into the original
        # gene columns (dropped genes stay all-zero).
        adata.raw = adata.copy()
        d = dict(zip(np.arange(X.shape[1]), genes_ix))
        x, y = X.nonzero()
        y = np.array([d[i] for i in y])
        data = X.data
        Xnew = sp.sparse.coo_matrix((data, (x, y)), shape=adata.shape).tocsr()
        adata.X = Xnew
        # TODO: add log1p of corrected umi counts to layers
        for c in full_model_pars.columns:
            adata.var[c + '_sct'] = full_model_pars[c]
        for c in cell_attrs.columns:
            adata.obs[c + '_sct'] = cell_attrs[c]
        for c in model_pars.columns:
            adata.var[c + '_step1_sct'] = model_pars[c]
        # Flag which genes participated in step-1 estimation and record the
        # log10 geometric means used throughout.
        z = pd.Series(index=gn, data=np.zeros(gn.size, dtype='int'))
        z[gn[genes_step1]] = 1
        w = pd.Series(index=gn, data=np.zeros(gn.size, dtype='int'))
        w[gn] = genes_log_gmean
        adata.var['genes_step1_sct'] = z
        adata.var['log10_gmean_sct'] = w
    else:
        # Return a fresh AnnData limited to the retained genes.
        adata_new = AnnData(X=X)
        adata_new.var_names = pd.Index(gn)
        adata_new.obs_names = adata.obs_names
        adata_new.raw = adata.copy()
        for c in full_model_pars.columns:
            adata_new.var[c + '_sct'] = full_model_pars[c]
        for c in cell_attrs.columns:
            adata_new.obs[c + '_sct'] = cell_attrs[c]
        for c in model_pars.columns:
            adata_new.var[c + '_step1_sct'] = model_pars[c]
        z = pd.Series(index=gn, data=np.zeros(gn.size, dtype='int'))
        z[gn[genes_step1]] = 1
        adata_new.var['genes_step1_sct'] = z
        adata_new.var['log10_gmean_sct'] = genes_log_gmean
        return adata_new
# Fetch benchmark entries, scrape each one in its own process, then print
# the aggregated results sorted by benchmark score (highest first).
proc_array = min_mark_gpu(10000)['items']

# Launch one Kijiji-scraping process per processor entry; results are
# accumulated in the shared manager_list.
processes = []
for entry in proc_array:
    worker = Process(target=kijiji_scrape, args=(entry, manager_list))
    worker.start()
    processes.append(worker)

# Block until every scraper has finished before reading the shared list.
for worker in processes:
    worker.join()

# Sort the collected records by their benchmark score, highest first.
sorted_object = sorted(
    manager_list._getvalue(),
    key=lambda item: item['specs']['Benchmark'],
    reverse=True,
)

# Wrap in a top-level object and emit as pretty-printed JSON.
final_object = {'items': sorted_object}
print(json.dumps(final_object, indent=4))
def _refine_corr_parallel(
    sam1,
    sam2,
    st,
    gnnm,
    gn1,
    gn2,
    corr_mode="pearson",
    THR=0,
    use_seq=False,
    T1=0.0,
    T2=0.0,
    ncpus=os.cpu_count(),
):
    """Re-weight the gene-gene homology graph by expression correlation.

    For every pair of linked genes in ``gnnm`` whose weights exceed ``T1``,
    computes (in a process pool; see ``_parallel_init`` /
    ``_parallel_wrapper``) a ``corr_mode`` correlation between
    neighborhood-averaged expression profiles taken from ``st``'s
    connectivity graph, thresholds it at ``THR``, and returns a symmetric
    sparse matrix of the resulting edge weights plus the correlation array.

    Parameters
    ----------
    sam1, sam2
        Objects exposing ``.adata`` with per-gene "weights" and an
        expression matrix ``X`` — presumably SAM objects; confirm upstream.
    st
        Combined object whose ``adata.obs["batch"]`` splits cells into
        "batch1"/"batch2" and whose ``obsp["connectivities"]`` provides the
        neighborhood graph.
    gnnm
        Sparse gene-gene adjacency over the concatenation of gn1 and gn2.
    gn1, gn2
        Gene-name arrays for the two datasets (rows/cols of gnnm follow
        gn1 then gn2).
    corr_mode
        Correlation type forwarded to the worker initializer.
    THR
        Correlations below this value are zeroed.
    use_seq
        If True, multiply correlations by the original gnnm edge weights.
    T1
        Gene-weight threshold: genes with weight > T1 are kept.
    T2
        Threshold forwarded to the worker initializer.
    ncpus
        Pool size.  NOTE(review): this default is evaluated once at import
        time, not per call.

    Returns
    -------
    (gnnm3, CORR)
        Symmetric CSR matrix of refined edge weights and the ndarray of
        (thresholded) correlations, ordered by the unique gnnm2 pairs.
    """
    import scipy as sp

    gn = np.append(gn1, gn2)

    # Binarize per-gene weights: keep genes whose weight exceeds T1.
    w1 = sam1.adata.var["weights"][gn1].values
    w2 = sam2.adata.var["weights"][gn2].values
    w = np.append(w1, w2)
    w[w > T1] = 1
    w[w < 1] = 0

    # Track which dataset each kept gene came from ("a" = gn1, "b" = gn2).
    ix = np.array(["a"] * gn1.size + ["b"] * gn2.size)
    gnO = gn[w > 0]
    ix = ix[w > 0]
    gn1O = gnO[ix == "a"]
    gn2O = gnO[ix == "b"]

    #
    # Restrict the homology graph to kept genes and enumerate each
    # undirected edge once (sorted index pairs, deduplicated).
    gnnmO = gnnm[w > 0, :][:, w > 0]
    x, y = gnnmO.nonzero()
    pairs = np.unique(np.sort(np.vstack((x, y)).T, axis=1), axis=0)
    # Shift second-dataset indices so each column indexes into its own
    # dataset's gene list.
    pairs[pairs >= gn1O.size] = pairs[pairs >= gn1O.size] - gn1O.size

    idx1 = np.where(st.adata.obs["batch"] == "batch1")[0]
    idx2 = np.where(st.adata.obs["batch"] == "batch2")[0]

    nnm = st.adata.obsp["connectivities"]

    # float16 throughout — presumably to bound memory on large atlases.
    x1 = sam1.adata[:, gn1O].X.tocsc().astype("float16")  # [:,pairs[:,0]]
    x2 = sam2.adata[:, gn2O].X.tocsc().astype("float16")  # [:,pairs[:,1]]
    nnm1 = nnm[:, idx1].astype("float16")
    nnm2 = nnm[:, idx2].astype("float16")

    # Row sums of each half of the connectivity graph; floor near-zero sums
    # at 1 so the normalization below never divides by ~0.
    s1 = nnm1.sum(1).A
    s1[s1 < 1e-3] = 1
    s1 = s1.flatten()[:, None]
    s2 = nnm2.sum(1).A
    s2[s2 < 1e-3] = 1
    s2 = s2.flatten()[:, None]

    # Neighborhood-averaged expression of each gene over the joint manifold.
    pl1x = nnm1.dot(x1).multiply(1 / s1).tocsc()
    sc1x = nnm2.dot(x2).multiply(1 / s2).tocsc()

    CORR = {}  # NOTE(review): dead assignment, replaced by the Manager dict

    from multiprocessing import Pool, Manager

    CORR = Manager().dict()

    p = pairs
    pl1 = pl1x
    sc1 = sc1x

    # Fan the pair list out across a process pool; workers fill CORR keyed
    # by gene-pair name (see _parallel_init / _parallel_wrapper).
    pc_chunksize = pl1.shape[1] // ncpus + 1
    pool = Pool(ncpus, _parallel_init,
                [pl1, sc1, p, gn1O, gn2O, T2, CORR, corr_mode])
    try:
        pool.map(_parallel_wrapper, range(p.shape[0]),
                 chunksize=pc_chunksize)
    finally:
        pool.close()
        pool.join()

    CORR = CORR._getvalue()  # materialize the shared dict

    # Zero out correlations below the threshold.
    for k in CORR.keys():
        CORR[k] = 0 if CORR[k] < THR else CORR[k]

    # Mask gnnm to kept genes and enumerate its unique undirected edges
    # (indices here are into the full gn, unlike the shifted pairs above).
    gnnm2 = gnnm.multiply(w[:, None]).multiply(w[None, :]).tocsr()
    x, y = gnnm2.nonzero()
    pairs = np.unique(np.sort(np.vstack((x, y)).T, axis=1), axis=0)

    # Look up each edge's correlation by its pair name (via to_vn).
    CORR = np.array([CORR[x] for x in to_vn(gn[pairs])])

    # Build the refined symmetric graph.
    gnnm3 = sp.sparse.lil_matrix(gnnm.shape)

    if use_seq:
        # Scale correlations by the original edge weights.
        gnnm3[pairs[:, 0],
              pairs[:, 1]] = (CORR *
                              gnnm2[pairs[:, 0], pairs[:, 1]].A.flatten())
        gnnm3[pairs[:, 1],
              pairs[:, 0]] = (CORR *
                              gnnm2[pairs[:, 1], pairs[:, 0]].A.flatten())
    else:
        gnnm3[pairs[:, 0], pairs[:, 1]] = CORR  # *gnnm2[x,y].A.flatten()
        gnnm3[pairs[:, 1], pairs[:, 0]] = CORR  # *gnnm2[x,y].A.flatten()

    gnnm3 = gnnm3.tocsr()
    gnnm3.eliminate_zeros()
    return gnnm3, CORR