def correct_batch(data: MultimodalData, features: str = None) -> None:
    """Batch correction on data using Location-Scale (L/S) Adjustment method. ([Li-and-Wong03]_, [Li20]_).

    If L/S adjustment method is used, users must call this function every time before they call the pca function.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    features: `str`, optional, default: ``None``
        Features to be included in batch correction computation. If ``None``, simply consider all features.

    Returns
    -------
    ``None``

    Update ``data.X`` by the corrected count matrix.

    Examples
    --------
    >>> pg.correct_batch(data, features = "highly_variable_features")
    """
    elapsed = 0.0

    # Phase 1: estimate per-batch location/scale adjustment parameters.
    t0 = time.perf_counter()
    adjustable = estimate_adjustment_matrices(data)
    elapsed += time.perf_counter() - t0
    logger.info("Adjustment parameters are estimated.")

    # Phase 2: materialize the dense feature matrix to correct.
    # Standardization/truncation is deliberately skipped here.
    keyword = select_features(
        data,
        features=features,
        standardize=False,
        max_value=None,
    )
    logger.info("Features are selected.")

    # Phase 3: apply the correction only if estimation succeeded.
    if adjustable:
        t0 = time.perf_counter()
        correct_batch_effects(data, keyword, features)
        elapsed += time.perf_counter() - t0
        logger.info(
            "Batch correction is finished. Time spent = {:.2f}s.".format(elapsed))
def correct_batch(data: AnnData, features: str = None) -> None:
    """Batch correction on data.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    features: `str`, optional, default: ``None``
        Features to be included in batch correction computation. If ``None``, simply consider all features.

    Returns
    -------
    ``None``

    Update ``data.X`` by the corrected count matrix.

    Examples
    --------
    >>> pg.correct_batch(adata, features = "highly_variable_features")
    """
    elapsed = 0.0

    # Estimate per-batch adjustment parameters, timing the step.
    t0 = time.perf_counter()
    adjustable = estimate_adjustment_matrices(data)
    elapsed += time.perf_counter() - t0
    logger.info("Adjustment parameters are estimated.")

    # Build the dense matrix of selected features.
    keyword = select_features(data, features)
    logger.info("Features are selected.")

    # Apply the correction only when parameter estimation succeeded.
    if adjustable:
        t0 = time.perf_counter()
        correct_batch_effects(data, keyword, features)
        elapsed += time.perf_counter() - t0
        logger.info(
            "Batch correction is finished. Time spent = {:.2f}s.".format(elapsed)
        )
def run_scanorama(
    data: MultimodalData,
    n_components: int = 50,
    features: str = "highly_variable_features",
    standardize: bool = True,
    max_value: float = 10,
    random_state: int = 0,
) -> str:
    """Batch correction using Scanorama.

    This is a wrapper of `Scanorama <https://github.com/brianhie/scanorama>`_ package. See [Hie19]_ for details on the algorithm.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional default: ``50``.
        Number of integrated embedding components to keep. This sets Scanorama's dimred parameter.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for Scanorama.

    standardize: ``bool``, optional, default: ``True``.
        Whether to scale the data to unit variance and zero mean.

    max_value: ``float``, optional, default: ``10``.
        The threshold to truncate data after scaling. If ``None``, do not truncate.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Scanorama algorithm. out_rep is always equal to "scanorama"

    Update ``data.obsm``:
        * ``data.obsm['X_scanorama']``: The embedding calculated by Scanorama algorithm.

    Examples
    --------
    >>> pg.run_scanorama(data, random_state = 25)
    """
    # Scanorama integrates per-channel datasets, so 'Channel' must be categorical.
    if not is_categorical_dtype(data.obs['Channel']):
        data.obs['Channel'] = pd.Categorical(data.obs['Channel'])
    if data.obs['Channel'].cat.categories.size == 1:
        # Nothing to integrate; fall back to the existing PCA representation.
        logger.warning("Warning: data only contains 1 channel. Cannot apply Scanorama!")
        return 'pca'

    try:
        from scanorama import integrate
    except ImportError as e:
        # BUG FIX: the message string was split by a raw newline in the source,
        # which is a syntax error; reconstructed as a single literal.
        print(f"ERROR: {e}")
        print("ERROR: Need Scanorama! Try 'pip install scanorama'.")
        import sys
        sys.exit(-1)

    logger.info("Start integration using Scanorama.")

    rep = 'scanorama'
    # Select and standardize/truncate the feature matrix (cached copy bypassed).
    keyword = select_features(data, features=features, standardize=standardize, max_value=max_value, use_cache=False)
    X = data.uns[keyword]

    # Split the matrix into one dense block per channel, preserving category order.
    datasets = []
    for channel in data.obs['Channel'].cat.categories:
        idx = (data.obs['Channel'] == channel).values
        assert idx.sum() > 0  # every declared category must have cells
        datasets.append(X[idx, :])

    # Scanorama requires a gene list per dataset; all blocks share the same columns,
    # so synthetic names "0".."p-1" are used for each.
    genes_list = [[str(i) for i in range(X.shape[1])]] * data.obs['Channel'].cat.categories.size

    integrated, genes = integrate(datasets, genes_list, dimred = n_components, seed = random_state)
    # Stack per-channel embeddings back into a single (n_cells, n_components) array.
    data.obsm[f'X_{rep}'] = np.concatenate(integrated, axis = 0)

    return rep