Esempio n. 1
0
def test_scanorama_correct_scanpy():
    """
    Smoke-test Scanorama correction through the Scanpy/AnnData interface.

    Verifies that correct_scanpy runs, that corrected matrices keep the
    per-dataset cell counts and a common gene dimension, and that obs/var
    metadata come back in their original order.
    """
    from anndata import AnnData
    import pandas as pd

    datasets, genes_list = data_gen()

    adatas = []
    for idx, matrix in enumerate(datasets):
        ann = AnnData(matrix)
        # Sequential obs labels let us detect any cell reordering later.
        ann.obs = pd.DataFrame(list(range(matrix.shape[0])),
                               columns=['obs1'])
        ann.var = pd.DataFrame(genes_list[idx], columns=['var1'])
        ann.var_names = genes_list[idx]
        adatas.append(ann)

    corrected = scanorama.correct_scanpy(adatas)

    for adata_cor, adata_orig in zip(corrected, adatas):
        # Cell count is preserved per dataset.
        assert adata_cor.X.shape[0] == adata_orig.X.shape[0]
        # All corrected datasets share one gene dimension.
        assert adata_cor.X.shape[1] == adatas[0].X.shape[1]
        # Per-cell and per-gene metadata keep their original order.
        assert list(adata_cor.obs['obs1']) == list(adata_orig.obs['obs1'])
        assert list(adata_cor.var['var1']) == list(adatas[0].var['var1'])
Esempio n. 2
0
def correct_scanorama(dataset_list, cell_metadata):
    """Run Scanorama on the datasets, attach metadata, and save to an h5ad file."""
    # Batch-correct every dataset with Scanorama.
    corrected_batches = scanorama.correct_scanpy(dataset_list)
    # Merge the corrected objects on the intersection of genes,
    # tagging each batch in obs['Batch'].
    merged = corrected_batches[0].concatenate(corrected_batches[1:],
                                              join="inner",
                                              batch_key='Batch')
    print("Scanorama worked!")
    # Replace obs with the caller-supplied per-cell metadata.
    merged.obs = cell_metadata

    save_h5ad(merged)
Esempio n. 3
0
def runScanorama(adata, batch, hvg=None):
    """Batch-correct *adata* by *batch* with Scanorama.

    Returns the merged corrected AnnData with the integrated
    low-dimensional embedding stored in obsm['X_emb'].
    """
    import scanorama
    checkSanity(adata, batch, hvg)
    batches = splitBatches(adata.copy(), batch)
    embeddings, corrected_batches = scanorama.correct_scanpy(
        batches, return_dimred=True)
    merged = corrected_batches[0].concatenate(corrected_batches[1:])
    # Stack per-batch embeddings; rows follow the concatenation order above.
    merged.obsm['X_emb'] = np.concatenate(embeddings, axis=0)
    return merged
def correct_scanorama(dataset_list, cell_metadata):
    """Run Scanorama, merge the corrected batches, attach metadata, save to h5ad."""
    # Batch-correct with Scanorama.
    corrected_batches = scanorama.correct_scanpy(dataset_list)
    # Inner join keeps only genes present in every corrected batch;
    # the batch label column comes from the CLI arguments.
    merged = corrected_batches[0].concatenate(corrected_batches[1:],
                                              join='inner',
                                              batch_key=args.batch_key)
    # Overwrite obs with the caller-supplied per-cell metadata.
    merged.obs = cell_metadata

    save_h5ad(merged)
Esempio n. 5
0
 def correction(self):
     """Run Scanorama batch correction on ``self.adata`` in place."""
     print("Start Scanorama...\n")
     start = time.time()
     source = self.adata.copy()
     # One AnnData per batch label, in the order the labels first appear.
     batches = []
     for label in source.obs[self.batch].unique():
         batches.append(source[source.obs[self.batch] == label])
     corrected = scanorama.correct_scanpy(batches, return_dimred=True)
     # Outer join so no gene is dropped during the merge.
     merged = corrected[0].concatenate(corrected[1:], join="outer")
     # Restore the original per-cell metadata on the merged object.
     merged.obs = source.obs
     self.adata = merged
     print(f"Scanorama has taken {round(time.time() - start, 2)} seconds")
Esempio n. 6
0
def correct(datasets):
    """Run Scanorama on *datasets*, merge the result, and write it to args.output."""
    # Batch-correct with Scanorama.
    corrected_batches = scanorama.correct_scanpy(datasets)

    # Merge on the intersection of genes, tagging each batch in obs['Batch'].
    merged = corrected_batches[0].concatenate(corrected_batches[1:],
                                              join="inner",
                                              batch_key='Batch')
    print("Scanorama worked!")

    # Persist the merged, corrected object to the path given on the CLI.
    merged.write(args.output)
    print("Corrected object saved!")
Esempio n. 7
0
def runScanorama(adata, batch, hvg=None):
    """Batch-correct *adata* by *batch* with Scanorama.

    Returns the concatenated corrected AnnData; the Scanorama embedding
    is mirrored from obsm['X_scanorama'] into obsm['X_emb'].
    """
    import scanorama
    checkSanity(adata, batch, hvg)
    parts, categories = splitBatches(adata.copy(),
                                     batch,
                                     return_categories=True)
    corrected_parts = scanorama.correct_scanpy(parts, return_dimred=True)
    # Re-merge the batches, keeping the original batch categories and
    # the original (non-uniquified) cell names.
    merged = anndata.AnnData.concatenate(*corrected_parts,
                                         batch_key=batch,
                                         batch_categories=categories,
                                         index_unique=None)
    merged.obsm['X_emb'] = merged.obsm['X_scanorama']
    return merged
Esempio n. 8
0
def combineAdataUseScanorama(adataLs,
                             batchKey,
                             batchCateLs,
                             subSample=False,
                             subSampleCounts=0):
    """
    Integrate multiple AnnData objects with Scanorama.

    adataLs:
        list of AnnData objects to integrate, e.g. [adata1, adata2]
    batchKey:
        obs column name used to label each batch in the combined object
    batchCateLs:
        name of each batch; must align one-to-one with adataLs
    subSample:
        if True, downsample every dataset to a common cell count
    subSampleCounts:
        cap on the downsampled cell count (0 means no extra cap)
    return:
        the integrated AnnData, with a UMAP computed on the Scanorama
        embedding
    """
    import scanorama

    # Work on copies so the caller's objects are left untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        # Downsample every dataset to the size of the smallest one,
        # further capped by subSampleCounts when it is non-zero.
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        [sc.pp.subsample(x, n_obs=sampleSize) for x in adataLs]

    # Per-dataset preprocessing: depth normalization, log1p, and
    # highly-variable-gene flags (seurat flavour, top 2000 genes).
    for adata in adataLs:
        sc.pp.normalize_total(adata, inplace=True)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata,
                                    flavor="seurat",
                                    n_top_genes=2000,
                                    inplace=True)

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(adataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    # Concatenate the corrected batches, labelling each cell's batch in
    # obs[batchKey] with the matching entry of batchCateLs.
    combineAdata = sc.concat(combineScanoramaLs,
                             label=batchKey,
                             index_unique="-",
                             keys=batchCateLs)
    # Build the neighbour graph on the Scanorama embedding, not on PCA.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
Esempio n. 9
0
def scanorama_merge(adata_trains, adata_pred, keepdimensionality):
    """Correct datasets with Scanorama, then merge the training datasets.

    Takes a list of training dataset AnnData objects (at least one) and one
    testing AnnData object; returns a merged, corrected training AnnData and
    a corrected testing AnnData.

    parameters
    ----------
    adata_trains: `list`
        list of training dataset AnnData objects; temporarily extended with
        adata_pred during correction and restored via pop() before returning
    adata_pred: AnnData
        testing dataset AnnData object
    keepdimensionality: `bool`
        documented as choosing between all common genes and a reduced
        dimensionality; NOTE(review): not referenced anywhere in this body —
        confirm whether it is still needed

    returns
    -------
    AnnData
        A concatenated and corrected anndata object of all training datasets.
    AnnData
        An anndata object containing corrected testing dataset.
    """

    # Keep a handle on the original testing object so its obs can be
    # restored after correction replaces the matrix.
    adata_pred_obssave = adata_pred
    nonmerged_adata_train = naive_merge(
        adata_trains)  # to have merged obs ready
    all_adata = naive_merge([nonmerged_adata_train, adata_pred])
    # Append the testing set so Scanorama corrects it jointly with training;
    # it is removed again by the pop() at the end.
    adata_trains.append(adata_pred)
    print('using scanorama rn')
    integrated, corrected = scan.correct_scanpy(adata_trains,
                                                return_dimred=True)
    print('integrating training set')
    # corrected[:-1] are the training sets, corrected[-1] is the test set.
    if len(adata_trains) != 2:
        adata_train = sc.AnnData.concatenate(*corrected[:-1])
    else:
        # Single training set: nothing to concatenate.
        adata_train = corrected[0]
    # Restore obs/var metadata lost during correction.
    adata_train.obs = nonmerged_adata_train.obs
    adata_train.var = all_adata.var
    adata_pred = sc.AnnData(corrected[-1])
    adata_pred.obs = adata_pred_obssave.obs
    adata_pred.var = all_adata.var

    # Undo the earlier append so the caller's list is unchanged.
    adata_trains.pop()
    return adata_train, adata_pred
Esempio n. 10
0
# Plot UMAP and T-SNE before correction
umapplot(adata,
         color_by=[args.celltype, args.batch],
         save_file_prefix=f"scanorama_umap_{args.adata}_before_cor")
# tsneplot(adata, color_by=[args.celltype, args.batch], save_file_prefix=f"tsne_{args.adata}_before_cor")

# Correction: split cells into one AnnData per batch, correct, and re-merge.
print("Starting Scanorama...")
start = time.time()
adata_scanorama = adata.copy()
adata_list = [
    adata_scanorama[adata_scanorama.obs[args.batch] == i]
    for i in adata_scanorama.obs[args.batch].unique()
]
corrected = scanorama.correct_scanpy(adata_list, return_dimred=True)
corrected_merged_dge = corrected[0].concatenate(*corrected[1:])
# NOTE(review): assigning obs wholesale assumes concatenation preserved the
# original cell order — confirm the batches come back in the same order.
corrected_merged_dge.obs = adata_scanorama.obs
print(f"Scanorama has taken {time.time() - start} seconds")

# Plot UMAP after correction
sc.pp.neighbors(corrected_merged_dge, n_neighbors=10, n_pcs=20)
sc.tl.umap(corrected_merged_dge)
sc.pl.umap(corrected_merged_dge, color=[args.celltype, args.batch], show=False)
resname = f"./visualization/scanorama_umap_{args.adata}_after_cor.png"
plt.savefig(resname, dpi=100)

# Save corrected adata into a directory named after the first six
# characters of the dataset name.
if not os.path.exists(f"./{args.adata[:6]}"):
    os.makedirs(f"./{args.adata[:6]}")
corrected_merged_dge.write_h5ad(
Esempio n. 11
0
# Integrate two mouse-brain datasets with Scanorama, compute a UMAP on the
# corrected embedding, and save the merged result.
import numpy as np
from scbean.tools import utils as tl
import scanpy as sc
import pandas as pd
import scanorama
import argparse

base_path = '/Users/zhongyuanke/data/'

# NOTE(review): base_path is defined but file1/file2 are passed as-is to
# read_sc_data — confirm that helper resolves relative paths.
file1 = 'dropviz/mouse_brain_dropviz_filtered.h5ad'
file2 = 'nuclei/adata_nuclei_filtered.h5ad'
scan_path = 'results/scan_mouse.h5ad'

# -------------train---------------------
adata1 = tl.read_sc_data(file1, fmt='h5ad')
adata2 = tl.read_sc_data(file2, fmt='h5ad')
# orig_label =adata_orig.obs['label']
print(adata1)
print(adata2)
datas = [adata1, adata2]
# dimred=16: ask Scanorama for a 16-dimensional integrated embedding.
corrected = scanorama.correct_scanpy(datas, return_dimred=True, dimred=16)
adata_corrected = corrected[0].concatenate(corrected[1])

print(adata_corrected)
# Build the neighbour graph on the Scanorama embedding rather than on PCA.
sc.pp.neighbors(adata_corrected, use_rep='X_scanorama')
sc.tl.umap(adata_corrected)
adata_corrected.write_h5ad(scan_path)
Esempio n. 12
0
# Integrate two single-cell velocyto AnnData objects (ME49 and Pru strains)
# with Scanorama and save the merged result together with the corrected
# embedding and its UMAP projection.
import scanpy.api as sc
from umap import UMAP
import scanorama
import sys

# NOTE(review): `os` (here), `np` and `sp` (further down) are used but not
# imported in this visible snippet — confirm they are imported elsewhere
# in the file.
script_path = os.path.dirname(os.path.realpath(__file__))
output_dir = os.path.join(script_path, '../../Figures') + '/'
adata_scv_pru = sc.read_h5ad(output_dir + '../Data/pru/adata_sc_velocyto.h5ad')
adata_scv_me49 = sc.read_h5ad(output_dir +
                              '../Data/011_me49/adata_sc_velocyto.h5ad')

adatas = [adata_scv_me49.copy(), adata_scv_pru.copy()]
integrated, corrected = scanorama.correct_scanpy(adatas, return_dimred=True)
# Stack the per-batch integrated embeddings and UMAP them jointly.
merged_x = np.concatenate(integrated)
umap_merged_x = UMAP(n_components=2,
                     random_state=4,
                     min_dist=0.3,
                     n_neighbors=50).fit_transform(merged_x)
adatas = corrected[0].concatenate(corrected[1])
# Strip the batch suffix that concatenate appends to obs names.
adatas.obs_names = [x.split('-')[0] for x in adatas.obs_names]
adatas.obsm['X_corrected'] = merged_x
adatas.obsm['X_corrected_umap'] = umap_merged_x
# Keep the original (uncorrected) counts alongside the corrected matrix.
adatas.layers['original_mat'] = sp.sparse.csr_matrix(
    np.concatenate([adata_scv_me49.X.A, adata_scv_pru.X.A]))
# Label cells whose barcode contains '10099011' as ME49, the rest as Pru.
batch = ['ME49' if '10099011' in x else 'Pru' for x in adatas.obs_names]
adatas.obs['batch'] = batch

## Save scanorama results
adatas.write_h5ad(filename=output_dir +
                  '../Data/pru/adata_integrated_0506_me49.h5ad',
                  compression='gzip')
Esempio n. 13
0
                    help="base path")

opt = parser.parse_args()

base_path = opt.base_path
file1 = base_path + 'blood_5w.h5ad'
file2 = base_path + 'bone_5w.h5ad'

# Benchmark Scanorama integration at increasing dataset sizes, collecting
# wall-clock runtimes in time_list.  5w blood + 5w bone = 10w cells total.
time_list = []
adata1 = sc.read_h5ad(base_path + 'blood_5w.h5ad')
adata2 = sc.read_h5ad(base_path + 'bone_5w.h5ad')
print(adata1)
print(adata2)
data_list = [adata1, adata2]
t0 = time.time()
integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True)
t1 = time.time()
# BUG FIX: the original message said "DAVAE", but this script times Scanorama.
print("Total time running Scanorama 10w cells: %s seconds" % (str(t1 - t0)))
time_list.append(t1 - t0)

adata1 = sc.read_h5ad(base_path + 'blood_10w.h5ad')
adata2 = sc.read_h5ad(base_path + 'bone_10w.h5ad')

data_list = [adata1, adata2]
t0 = time.time()
integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True)
t1 = time.time()
# BUG FIX: same wrong tool name in the original message.
print("Total time running Scanorama 20w cells: %s seconds" % (str(t1 - t0)))
time_list.append(t1 - t0)

adata1 = sc.read_h5ad(base_path + 'blood_20w.h5ad')
Esempio n. 14
0
def combineAdataUseScanoramaOld(
    adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0
):
    """
    Integrate multiple AnnData objects with Scanorama (older workflow).

    Unlike the newer variant, this one concatenates first, preprocesses the
    combined object, restricts to highly-variable genes, and only then splits
    back into batches for Scanorama.

    adataLs:
        list of AnnData objects to integrate, e.g. [adata1, adata2]
    batchKey:
        obs column name used to label each batch in the combined object
    batchCateLs:
        name of each batch; must align one-to-one with adataLs
    subSample:
        if True, downsample every dataset to a common cell count
    subSampleCounts:
        cap on the downsampled cell count (0 means no extra cap)
    return:
        the integrated AnnData, with a UMAP computed on the Scanorama
        embedding
    """
    import scanorama

    # Work on copies so the caller's objects are left untouched.
    adataLs = [x.copy() for x in adataLs]
    if subSample:
        # Downsample every dataset to the size of the smallest one,
        # further capped by subSampleCounts when it is non-zero.
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        [sc.pp.subsample(x, n_obs=sampleSize) for x in adataLs]

    # Concatenate everything first so preprocessing and HVG selection are
    # done jointly across batches.
    combineAdata = adataLs[0].concatenate(
        adataLs[1:], batch_key=batchKey, batch_categories=batchCateLs
    )

    sc.pp.normalize_per_cell(combineAdata, counts_per_cell_after=1e4)
    sc.pp.log1p(combineAdata)

    # Batch-aware HVG selection on the combined object.
    sc.pp.highly_variable_genes(
        combineAdata, min_mean=0.0125, max_mean=3, min_disp=1.5, batch_key=batchKey
    )
    sc.pl.highly_variable_genes(combineAdata)

    varGenes = combineAdata.var.highly_variable

    # Keep only the gene names flagged as highly variable.
    varGenes = varGenes[varGenes].keys()

    varGenes = list(varGenes)

    alldata = {}

    # Split the preprocessed object back into one view per batch,
    # restricted to the highly-variable genes.
    for oneBatchName in combineAdata.obs[batchKey].unique():
        alldata[oneBatchName] = combineAdata[
            combineAdata.obs[batchKey] == oneBatchName, varGenes
        ]

    combineAdataLs = list(alldata.values())

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(combineAdataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    combineAdata = sc.concat(combineScanoramaLs)
    #     import pdb; pdb.set_trace()
    #     combineScanoramaAr = np.concatenate(combineScanoramaLs)

    #     combineAdata.obsm["SC"] = combineScanoramaAr

    #     combineAdata.raw = combineAdata
    #     combineAdata = combineAdata[:, varGenes]
    #     sc.pp.scale(combineAdata, max_value=10)
    #     sc.tl.pca(combineAdata, svd_solver="arpack", n_comps=50)
    #     sc.pl.pca(combineAdata)
    # Build the neighbour graph on the Scanorama embedding, not on PCA.
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
Esempio n. 15
0
 def correct(adatas):
     """Batch-correct *adatas* with Scanorama and return the corrected list.

     The integrated low-dimensional embeddings are computed but discarded;
     only the corrected AnnData objects are returned.
     """
     _, corrected_list = scanorama.correct_scanpy(adatas, return_dimred=True)
     return corrected_list
Esempio n. 16
0
 def integrate_and_correct(adatas, assay="counts"):
     """Run Scanorama on *adatas* and concatenate the corrected batches.

     NOTE(review): the `assay` parameter is currently unused in this body;
     it is kept for interface compatibility with callers.
     """
     batches = scanorama.correct_scanpy(adatas)
     first = batches.pop(0)
     merged = first.concatenate(*batches, batch_key="batch")
     return merged
Esempio n. 17
0
def scanorama_bc(adatas, n_comps, save_folder, possible_batch_effects, batch_key='library_id'):
    """Apply Scanorama Batch correction
    Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets

    Scores the uncorrected matrix, runs Scanorama on the highly-variable
    genes split by batch, scores the corrected result per covariate, plots
    UMAPs of both the corrected matrix and the corrected embedding, and
    returns the (mutated) input object.

    Parameters
    ----------
    adatas : annData
        input object; mutated in place (obsm['X_emb'] is added and X is
        converted to a sparse matrix)
    n_comps : int
        number of principal components for scoring the uncorrected data
    save_folder : str
        destination folder for plots and scores
    possible_batch_effects : list, str
        covariates whose variance contribution is scored after correction
    batch_key : str
        obs column identifying the batch (default 'library_id')

    Returns
    -------
    annData
        the input `adatas`, with obsm['X_emb'] set and X sparsified
    """
    # 1. Score the uncorrected matrix
    _score_uncorrectd_data(adatas=adatas, n_comps=n_comps, save_folder=save_folder, batch_key=batch_key,
                           possible_batch_effects=possible_batch_effects)

    # 2. Apply the Scanorama batch correction method
    # 2.1 Split into one AnnData per batch, restricted to HVGs
    split = _split_batches(adatas[:, adatas.var['highly_variable']].copy(), batch=batch_key)

    # 2.2 run scanorama batch correction
    kwargs = {"return_dimred": True}
    emb, corrected = scanorama.correct_scanpy(split, **kwargs)
    # concatenate corrected adatas; embeddings are stacked in batch order
    emb = np.concatenate(emb, axis=0)
    adata_cor = ann.AnnData.concatenate(*corrected, batch_key=batch_key,
                                        batch_categories=adatas.obs[batch_key].cat.categories,).copy()
    # NOTE(review): the embedding is attached to the ORIGINAL object while the
    # corrected matrix lives in adata_cor — confirm the row orders agree.
    adatas.obsm['X_emb'] = emb

    # 2.3 Score the corrected matrix
    # 2.3.2 Determine No. PCs from the corrected embedding
    pca = pc_determination.pcs_combs(adatas.obsm['X_emb'], save_folder, raw=False, type_dataset="No_HVG_corrected",
                                     use_highly_variable=False, copy=True, return_info=True)

    # 2.4 Calculate variance after batch correction - might be that the variance increased within a covariate
    dict_r2var = dict()
    adata_cor.obs[batch_key] = adatas.obs[batch_key].values
    # Score variance contribution by batch
    score_variance(adata=adata_cor, obs=batch_key, pca=pca, r2var=dict_r2var)

    for poss_be in possible_batch_effects:
        # Copy each covariate onto the corrected object before scoring it.
        adata_cor.obs[poss_be] = adatas.obs[poss_be].values

        # Score variance contribution by other covariate
        score_variance(adata=adata_cor, obs=poss_be, pca=pca, r2var=dict_r2var)

    print(dict_r2var)

    # Compute Visualisation of corrected matrix
    # NOTE(review): interactive input() makes this function unusable in
    # non-interactive pipelines — consider a parameter instead.
    try:
        n_comps = int(input("Please provide the No. principal components (default 50): "))
    except ValueError:
        # Fall back to the documented default when the input is not an int.
        n_comps = int(50)
    sc.pp.pca(adata_cor, n_comps=n_comps, use_highly_variable=False, svd_solver='arpack')
    sc.pp.neighbors(adata_cor)
    sc.tl.umap(adata_cor)

    # Plot corrected-matrix UMAPs, one per covariate
    for poss_be in possible_batch_effects:
        plt_pp_plots.plot_batch_correction(adata_cor, save_folder, batch_key="bc_matrix", possible_batch_effect=poss_be)

    # Compute Visualisation of corrected embedding
    sc.pp.neighbors(adatas, use_rep='X_emb')
    sc.tl.umap(adatas)

    for poss_be in possible_batch_effects:
        plt_pp_plots.plot_batch_correction(adatas, save_folder, batch_key="bc_embedding", possible_batch_effect=poss_be)

    # Sparsify X to cut memory before returning.
    adatas.X = sparse.csr_matrix(adatas.X)

    return adatas
Esempio n. 18
0
opt = parser.parse_args()

base_path = opt.base_path
# Dataset size (e.g. '5w') is selected via the --type CLI argument.
file1 = base_path + 'blood_' + opt.type + '.h5ad'
file2 = base_path + 'bone_' + opt.type + '.h5ad'

adata1 = sc.read_h5ad(file1)
adata2 = sc.read_h5ad(file2)
# adata = anndata.AnnData(X=adata.X.A,obs=adata.obs, var=adata.var)
print(adata1)
print(adata2)
t0 = time.time()
data_list = [adata1, adata2]

# approx=False disables approximate nearest neighbours (exact, slower mode).
integrated, corrected = scanorama.correct_scanpy(data_list,
                                                 return_dimred=True,
                                                 approx=False)
# integrated, corrected = scanorama.correct_scanpy(data_list, return_dimred=True)

t1 = time.time()
#
# Commented-out psutil diagnostics (labels in Chinese: memory used /
# total memory / memory percent / CPU count):
# info = psutil.virtual_memory()
# print('内存使用:', psutil.Process(os.getpid()).memory_info().rss/1024/1024/1024, 'GB')
# print('总内存:', info.total/1024/1024/1024, 'GB')
# print('内存占比:', info.percent)
# print('cpu个数:', psutil.cpu_count())
t = (t1 - t0) / 60
print("Total time running: %s min" % (str(t)))

# result = [psutil.Process(os.getpid()).memory_info().rss/1024/1024/1024, t1-t0]
# result = np.array(result)