def test_scrublet_batched():
    """Test that Scrublet runs with batched data.

    Checks that scrublet runs end-to-end with a ``batch_key``, detects some
    doublets, records per-batch info in ``.uns``, and that batched results
    match running each batch independently.
    """
    pytest.importorskip("scrublet")

    adata = sc.datasets.pbmc3k()
    # Split the cells into two equal-ish artificial batches. Computing the
    # split point from n_obs (instead of hard-coding 1350) keeps the test
    # valid if the dataset's cell count ever changes.
    half = adata.n_obs // 2
    adata.obs['batch'] = half * ['a'] + (adata.n_obs - half) * ['b']
    split = [adata[adata.obs["batch"] == x].copy() for x in ("a", "b")]

    sce.pp.scrublet(adata, use_approx_neighbors=False, batch_key='batch')

    # Expected outputs of a batched scrublet run.
    assert "predicted_doublet" in adata.obs.columns
    assert "doublet_score" in adata.obs.columns
    assert adata.obs["predicted_doublet"].any(), "Expect some doublets to be identified"
    assert (
        'batches' in adata.uns['scrublet'].keys()
    ), "Expect .uns to contain batch info"

    # Batches must be processed independently: running scrublet on each
    # batch separately and concatenating should reproduce the batched run.
    for s in split:
        sce.pp.scrublet(s, use_approx_neighbors=False)
    merged = sc.concat(split)
    pd.testing.assert_frame_equal(adata.obs[merged.obs.columns], merged.obs)
def normalizeMultiAd(multiAd, removeAmbiguous=True):
    """Normalize NGS and TGS (second/third-generation) counts separately.

    Genes whose names contain ``"_"`` are treated as the "other" (TGS)
    modality and normalized to 2e4 per cell; the remaining genes are
    normalized to 1e4, so each cell ends up with ~3e4 total reads.

    Parameters
    ----------
    multiAd
        AnnData with both modalities mixed in ``var``.
    removeAmbiguous
        If True, drop genes whose names contain ``"Ambiguous"`` or ``"_N_"``.

    Returns
    -------
    AnnData with the two modalities re-concatenated along ``var``.
    """
    isOther = multiAd.var.index.str.contains("_")
    # Copy explicitly: sc.pp.normalize_total on an AnnData *view* would
    # trigger implicit copy-on-write (with warnings) or fail to modify in
    # place depending on the anndata version.
    multiCountAd = multiAd[:, ~isOther].copy()
    multiOtherAd = multiAd[:, isOther].copy()
    sc.pp.normalize_total(multiCountAd, target_sum=1e4)
    sc.pp.normalize_total(multiOtherAd, target_sum=2e4)
    multiAd = sc.concat([multiCountAd, multiOtherAd], axis=1)
    if removeAmbiguous:
        keep = ~(
            multiAd.var.index.str.contains("Ambiguous")
            | multiAd.var.index.str.contains("_N_")
        )
        multiAd = multiAd[:, keep]
    return multiAd
def combineAdataUseScanorama(adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0):
    """Integrate multiple AnnData objects with Scanorama.

    Parameters
    ----------
    adataLs
        List of AnnData objects to integrate, e.g. ``[adata1, adata2]``.
    batchKey
        Name of the obs column added to label each batch.
    batchCateLs
        Category name for each batch; must align with ``adataLs``.
    subSample
        If True, downsample every batch to the smallest batch size.
    subSampleCounts
        Optional upper bound on the downsampled size (0 = no extra bound).

    Returns
    -------
    Integrated AnnData with neighbors and UMAP computed on ``X_scanorama``.
    """
    import scanorama

    # Work on copies so callers' objects are untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        sampleSize = min(x.shape[0] for x in adataLs)
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        # Plain loop (not a comprehension) — subsample is called for its
        # in-place side effect, not for a return value.
        for adata in adataLs:
            sc.pp.subsample(adata, n_obs=sampleSize)

    # Per-batch preprocessing before integration.
    for adata in adataLs:
        sc.pp.normalize_total(adata, inplace=True)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True)

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(adataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")

    combineAdata = sc.concat(combineScanoramaLs, label=batchKey, index_unique="-", keys=batchCateLs)
    # Neighbors/UMAP on the Scanorama embedding rather than PCA.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
def combineAdataUseScanoramaOld(
    adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0
):
    """Integrate multiple AnnData objects with Scanorama (legacy variant).

    Unlike :func:`combineAdataUseScanorama`, this concatenates first, selects
    highly variable genes jointly (per batch via ``batch_key``), then splits
    back into batches for Scanorama correction.

    Parameters
    ----------
    adataLs
        List of AnnData objects to integrate, e.g. ``[adata1, adata2]``.
    batchKey
        Name of the obs column added to label each batch.
    batchCateLs
        Category name for each batch; must align with ``adataLs``.
    subSample
        If True, downsample every batch to the smallest batch size.
    subSampleCounts
        Optional upper bound on the downsampled size (0 = no extra bound).

    Returns
    -------
    Integrated AnnData with neighbors and UMAP computed on ``X_scanorama``.
    """
    # Import locally, mirroring combineAdataUseScanorama — the original body
    # referenced `scanorama` without any visible import, which would raise
    # NameError unless a module-level import exists elsewhere.
    import scanorama

    # Work on copies so callers' objects are untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        sampleSize = min(x.shape[0] for x in adataLs)
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        # Plain loop — subsample mutates in place; no values to collect.
        for adata in adataLs:
            sc.pp.subsample(adata, n_obs=sampleSize)

    combineAdata = adataLs[0].concatenate(
        adataLs[1:], batch_key=batchKey, batch_categories=batchCateLs
    )
    # normalize_total(target_sum=1e4) is the modern equivalent of the
    # deprecated normalize_per_cell(counts_per_cell_after=1e4) and matches
    # the sibling functions in this file.
    sc.pp.normalize_total(combineAdata, target_sum=1e4)
    sc.pp.log1p(combineAdata)
    sc.pp.highly_variable_genes(
        combineAdata, min_mean=0.0125, max_mean=3, min_disp=1.5, batch_key=batchKey
    )
    sc.pl.highly_variable_genes(combineAdata)
    varGenes = list(combineAdata.var.index[combineAdata.var.highly_variable])

    # Split back into one AnnData per batch, restricted to HVGs, for Scanorama.
    combineAdataLs = [
        combineAdata[combineAdata.obs[batchKey] == oneBatchName, varGenes]
        for oneBatchName in combineAdata.obs[batchKey].unique()
    ]

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(combineAdataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")

    combineAdata = sc.concat(combineScanoramaLs)
    # Neighbors/UMAP on the Scanorama embedding rather than PCA.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata