def read_and_qc(sample_name, path):
    r"""Load one 10X Visium sample into an AnnData object and add QC metrics.

    Adapt this function if your workflow requires it.

    :param sample_name: Name of the sample; it is converted to ``str`` and
        appended directly to ``path`` to locate the sample directory
    :param path: path prefix of the data (no separator is inserted)
    :return: AnnData whose variables are indexed by ENSEMBL id (symbols kept
        in ``var['SYMBOL']``) and whose observations are named
        ``<sample>_<barcode>``
    """
    visium_dir = path + str(sample_name)
    adata = sc.read_visium(
        visium_dir,
        count_file='filtered_feature_bc_matrix.h5',
        load_images=True,
    )
    adata.obs['sample'] = sample_name

    # Keep the gene symbols in a column and re-index the variables by ENSEMBL id.
    adata.var['SYMBOL'] = adata.var_names
    adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
    adata.var_names = adata.var['ENSEMBL']
    adata.var.drop(columns='ENSEMBL', inplace=True)

    # Calculate QC metrics
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.var['mt'] = [
        symbol.startswith('mt-') for symbol in adata.var['SYMBOL']
    ]
    mito_mask = adata.var['mt'].tolist()
    mito_counts = adata[:, mito_mask].X.sum(1).A.squeeze()
    adata.obs['mt_frac'] = mito_counts / adata.obs['total_counts']

    # Prefix every spot barcode with its sample name.
    adata.obs["sample"] = list(map(str, adata.obs['sample']))
    sample_prefix = adata.obs["sample"] + '_'
    adata.obs_names = sample_prefix + adata.obs_names
    adata.obs.index.name = 'spot_id'

    return adata
def test_visium_default(image_comparer):
    """Plot the Visium fixture with default settings and compare to the reference image."""
    compare_to_reference = image_comparer(ROOT, FIGS, tol=15)

    fixture_dir = HERE / '_data' / 'visium_data' / '1.0.0'
    adata = sc.read_visium(fixture_dir)
    adata.obs = adata.obs.astype({'array_row': 'str'})

    # default values
    sc.pl.spatial(adata, show=False)

    compare_to_reference('master_spatial_visium_default')
def test_visium_empty_img_key(image_comparer):
    """Check that ``img_key=None`` and a plain ``spatial`` embedding plot match their references."""
    compare_to_reference = image_comparer(ROOT, FIGS, tol=15)

    fixture_dir = HERE / '_data' / 'visium_data' / '1.0.0'
    adata = sc.read_visium(fixture_dir)
    adata.obs = adata.obs.astype({'array_row': 'str'})

    # Spatial plot without a background image.
    sc.pl.spatial(adata, img_key=None, color="array_row")
    compare_to_reference('master_spatial_visium_empty_image')

    # The same coordinates drawn through the generic embedding plot.
    sc.pl.embedding(adata, basis="spatial", color="array_row")
    compare_to_reference('master_spatial_visium_embedding')
def test_visium_circles(image_comparer):
    """Plot two ``array_row`` groups on a cropped, semi-transparent view and compare to the reference."""
    compare_to_reference = image_comparer(ROOT, FIGS, tol=15)

    fixture_dir = HERE / '_data' / 'visium_data' / '1.0.0'
    adata = sc.read_visium(fixture_dir)
    adata.obs = adata.obs.astype({'array_row': 'str'})

    plot_options = dict(
        color="array_row",
        groups=["24", "33"],
        crop_coord=(100, 400, 400, 100),
        alpha=0.5,
        size=1.3,
    )
    sc.pl.spatial(adata, **plot_options)

    compare_to_reference('master_spatial_visium')
def test_spatial_external_img(image_comparer):
    """Pass the hires image and its scale factor explicitly to ``sc.pl.spatial`` and compare to the reference."""
    compare_to_reference = image_comparer(ROOT, FIGS, tol=15)

    fixture_dir = HERE / '_data' / 'visium_data' / '1.0.0'
    adata = sc.read_visium(fixture_dir)
    adata.obs = adata.obs.astype({'array_row': 'str'})

    # external image: take image and scale factor from the "custom" library entry
    library = adata.uns["spatial"]["custom"]
    img = library["images"]["hires"]
    scalef = library["scalefactors"]["tissue_hires_scalef"]

    sc.pl.spatial(
        adata,
        color="array_row",
        scale_factor=scalef,
        img=img,
        basis="spatial",
        show=False,
    )

    compare_to_reference('master_spatial_external_img')
def test_spatial_general(image_comparer):
    """Plot spatial coordinates without image metadata: uncoloured, categorical and continuous."""
    compare_to_reference = image_comparer(ROOT, FIGS, tol=15)

    fixture_dir = HERE / '_data' / 'visium_data' / '1.0.0'
    adata = sc.read_visium(fixture_dir)
    adata.obs = adata.obs.astype({'array_row': 'str'})

    # general coordinates: spatial data don't have imgs, so remove entry from uns
    spatial_metadata = adata.uns.pop("spatial")

    # Required argument for now
    first_library = list(spatial_metadata.values())[0]
    spot_size = first_library["scalefactors"]["spot_diameter_fullres"]

    # (extra plotting keywords, reference image name)
    cases = (
        ({}, 'master_spatial_general_nocol'),
        ({'color': "array_row"}, 'master_spatial_general_cat'),  # category
        ({'color': "array_col"}, 'master_spatial_general_cont'),  # continuous
    )
    for extra_kwargs, reference_name in cases:
        sc.pl.spatial(adata, show=False, spot_size=spot_size, **extra_kwargs)
        compare_to_reference(reference_name)
def preprocess_spdata_single(data_folder, sample_name):
    """Read one Visium sample from ``data_folder/sample_name`` and annotate it with QC metrics.

    Variables are re-indexed by ENSEMBL id (gene symbols are kept in
    ``var["SYMBOL"]``), the mitochondrial count fraction is stored in
    ``obs["mt_frac"]`` and every spot is renamed to ``<sample>_<barcode>``.

    :param data_folder: directory containing one sub-directory per sample
    :param sample_name: name of the sample sub-directory
    :return: the annotated AnnData object
    """
    sample_dir = os.path.join(data_folder, sample_name)
    adata = sc.read_visium(
        sample_dir,
        count_file="filtered_feature_bc_matrix.h5",
        load_images=True,
    )
    adata.obs["sample"] = sample_name

    # Swap the variable index from gene symbols to ENSEMBL ids.
    adata.var["SYMBOL"] = adata.var_names
    adata.var.rename(columns={"gene_ids": "ENSEMBL"}, inplace=True)
    adata.var_names = adata.var["ENSEMBL"]
    adata.var.drop(columns="ENSEMBL", inplace=True)

    # Calculate QC metrics
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.var["mt"] = [
        symbol.startswith("mt-") for symbol in adata.var["SYMBOL"]
    ]
    mito_mask = adata.var["mt"].tolist()
    mito_counts = adata[:, mito_mask].X.sum(1).A.squeeze()
    adata.obs["mt_frac"] = mito_counts / adata.obs["total_counts"]

    # add sample name to obs names
    adata.obs["sample"] = list(map(str, adata.obs["sample"]))
    sample_prefix = adata.obs["sample"] + "_"
    adata.obs_names = sample_prefix + adata.obs_names
    adata.obs.index.name = "spot_id"

    return adata
def main():
    """Command-line entry point: run cell2location on one or all Visium slides.

    Steps:

    1. Parse the command line and validate the CUDA device id and slide.
    2. Check that every requested ``<sp_data_path>/JBO0<n>`` directory holds
       ``filtered_feature_bc_matrix.h5``.
    3. Set the CUDA / Theano environment variables, then import the heavy
       libraries (the order matters: Theano reads its flags at import time).
    4. Read the spatial data, compute QC metrics, filter spots and genes and
       move mitochondrial genes into ``obsm['mt']``.
    5. Read the reference signatures from ``sc.h5ad`` in the regression model
       directory and scale them by the mean sample scaling factor.
    6. Call ``cell2location.run_cell2location`` and export the results to
       ``<result_dir>/std_model/``.

    Invalid ``cuda_device`` or ``--slide`` values are reported through
    ``ArgumentParser.error``; a slide directory lacking the count matrix
    raises ``FileNotFoundError``.
    """
    prs = arp.ArgumentParser()
    prs.add_argument('sp_data_path', type=str, help='path to spatial data')
    prs.add_argument('result_dir',
                     type=str,
                     help='directory to regression model and results')
    prs.add_argument('cuda_device',
                     type=str,
                     help="index of cuda device ID, from 0-7")
    prs.add_argument('-a',
                     '--annotation_column',
                     default='celltype',
                     type=str,
                     help='column name for covariate')
    prs.add_argument('-r',
                     '--regression_model_path',
                     default=None,
                     type=str,
                     help='path to regression model')
    prs.add_argument('-s',
                     '--slide',
                     default="1",
                     type=str,
                     help='select slide 1-4, or all')
    args = prs.parse_args()

    cuda_device = args.cuda_device
    sp_data_path = args.sp_data_path
    results_folder = args.result_dir
    covariate_col_names = args.annotation_column
    slide = args.slide

    # Explicit checks instead of `assert`, which disappears under `python -O`.
    if cuda_device not in {"0", "1", "2", "3", "4", "5", "6", "7"}:
        prs.error("invalid device id")
    if slide not in {"1", "2", "3", "4", "all"}:
        prs.error("slide does not exist")

    count_file = 'filtered_feature_bc_matrix.h5'
    if slide == "all":
        slide_names = ['JBO0' + str(i) for i in range(1, 5)]
        missing_msg = "one or more file path does not contain h5 feature matrix"
    else:
        slide_names = ["JBO0" + slide]
        missing_msg = "file path does not contain h5 feature matrix"
    if not all(count_file in os.listdir(os.path.join(sp_data_path, name))
               for name in slide_names):
        raise FileNotFoundError(missing_msg)

    # os.path.join makes the paths independent of whether `result_dir` /
    # `regression_model_path` were given with a trailing separator.
    if args.regression_model_path is None:
        regression_root = os.path.join(results_folder, "regression_model")
        # The first directory entry is used; os.listdir order is arbitrary.
        regression_model_output = os.listdir(regression_root)[0]
        reg_path = os.path.join(regression_root, regression_model_output)
    else:
        reg_path = args.regression_model_path

    ##### MAIN PART #####
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device

    # To use cuDNN: prepend the CUDA headers to CPATH. os.environ performs no
    # shell expansion, so the previous value has to be appended explicitly.
    cuda_include = "/usr/local/cuda/include"
    previous_cpath = os.environ.get("CPATH")
    if previous_cpath:
        os.environ["CPATH"] = cuda_include + ":" + previous_cpath
    else:
        os.environ["CPATH"] = cuda_include

    import scanpy as sc

    data_type = 'float32'
    # this line forces theano to use the GPU and should go before importing cell2location
    os.environ[
        "THEANO_FLAGS"] = 'device=cuda,floatX=' + data_type + ',force_device=True'

    import cell2location
    import matplotlib as mpl

    mpl.use('Agg')

    # silence scanpy that prints a lot of warnings
    import warnings
    warnings.filterwarnings('ignore')

    # Trailing "" keeps the trailing separator on the export path.
    std_model_dir = os.path.join(results_folder, "std_model", "")
    os.makedirs(std_model_dir, exist_ok=True)

    ## READ IN SPATIAL DATA ##
    if slide == "all":
        # We will merge all slides together in one adata object
        adata_list, sample_name = [], []
        for name in slide_names:
            temp_adata = sc.read_visium(os.path.join(sp_data_path, name))
            print("Read in file from " + sp_data_path + "/" + name)
            temp_adata.var_names_make_unique()
            temp_adata.var["mt"] = temp_adata.var_names.str.startswith("mt-")
            sc.pp.calculate_qc_metrics(temp_adata,
                                       qc_vars=["mt"],
                                       inplace=True)
            temp_adata.obs['sample'] = name
            sample_name.append(name)
            adata_list.append(temp_adata)
        adata = adata_list[0].concatenate(adata_list[1:],
                                          batch_key="sample",
                                          uns_merge="unique",
                                          batch_categories=sample_name,
                                          index_unique=None)
    else:
        adata = sc.read_visium(os.path.join(sp_data_path, "JBO0" + slide))
        print("Read in file from " + sp_data_path)
        adata.var_names_make_unique()
        adata.obs['sample'] = "JBO0" + slide
        adata.var['mt'] = adata.var_names.str.startswith("mt-")
        sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

    # Barcodes are concatenated with index_unique=None above, so de-duplicate.
    adata.obs_names_make_unique()

    # Calculate QC metrics and filter
    print("Before filtering: {} spots and {} genes".format(*adata.shape))
    adata.var['SYMBOL'] = adata.var_names
    sc.pp.filter_cells(adata, min_counts=11000)
    sc.pp.filter_cells(adata, max_counts=50000)
    # Copy so the in-place steps below do not operate on a view.
    adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
    sc.pp.filter_genes(adata, min_cells=10)

    # mitochondria-encoded (MT) genes should be removed for spatial mapping
    mt_mask = adata.var['mt'].values
    adata.obsm['mt'] = adata[:, mt_mask].X.toarray()
    adata = adata[:, ~mt_mask]
    print("After filtering: {} spots and {} genes".format(*adata.shape))

    adata_vis = adata.copy()
    adata_vis.raw = adata_vis

    ## READ IN REFERENCE DATA
    adata_raw = sc.read(os.path.join(reg_path, 'sc.h5ad'))

    # Export cell type expression signatures:
    categories = adata_raw.obs[covariate_col_names].unique()
    inf_aver = adata_raw.raw.var.copy()
    inf_aver.index = adata_raw.raw.var['SYMBOL']
    inf_aver = inf_aver.loc[:, [
        f'means_cov_effect_{covariate_col_names}_{i}' for i in categories
    ]]
    # Name each column by its bare category label. The columns were selected
    # in `categories` order, so the labels can be assigned directly; no regex
    # is involved, so labels containing metacharacters are safe.
    inf_aver.columns = list(categories)
    inf_aver = inf_aver.iloc[:, inf_aver.columns.argsort()]

    # scale up by average sample scaling factor
    inf_aver = inf_aver * adata_raw.uns['regression_mod']['post_sample_means'][
        'sample_scaling'].mean()

    ## RUN CELL2LOCATION ##
    cell2location.run_cell2location(
        # Single cell reference signatures as pd.DataFrame
        # (could also be data as anndata object for estimating signatures
        # analytically - `sc_data=adata_snrna_raw`)
        sc_data=inf_aver,
        # Spatial data as anndata object
        sp_data=adata_vis,
        verbose=True,
        # the column in sc_data.obs that gives cluster identity of each cell
        summ_sc_data_args={'cluster_col': covariate_col_names},
        train_args={
            # By default uses raw slots in both of the input datasets.
            'use_raw': True,
            # Increase the number of iterations if needed
            'n_iter': 15000,
            # When analysing data that contains multiple samples,
            # cell2location will select a model version which pools information across samples
            # For details see https://cell2location.readthedocs.io/en/latest/cell2location.models.html#module-cell2location.models.CoLocationModelNB4E6V2
            # Column in sp_data.obs with Sample ID
            'sample_name_col': 'sample'
        },
        # Number of posterior samples to use for estimating parameters,
        # reduce if not enough GPU memory
        posterior_args={'n_samples': 1000},
        export_args={
            'path': std_model_dir,  # path where to save results
            'run_name_suffix': ''  # optional suffix to modify the name of the run
        },
        model_kwargs={
            # Prior on the number of cells, cell types and co-located combinations
            'cell_number_prior': {
                # Use visual inspection of the tissue image to determine
                # the average number of cells per spot,
                # an approximate count is good enough:
                'cells_per_spot': 8,
                # Prior on the number of cell types (or factors) in each spot
                'factors_per_spot': 7,
                # Prior on the number of correlated cell type combinations in each spot
                'combs_per_spot': 2.5
            },
            # Prior on change in sensitivity between technologies
            'gene_level_prior': {
                # Prior on average change in expression level from scRNA-seq to spatial technology,
                # this reflects your belief about the sensitivity of the technology in your experiment
                'mean': 1 / 2,
                # Prior on how much individual genes differ from that average,
                # a good choice of this value should be lower than the mean
                'sd': 1 / 4
            }
        })
seed(2021) matplotlib.use('TkAgg') base_path = '/Users/zhongyuanke/data/' anterior_out_path = 'dann_vae/spatial/rna_anterior_davae_01.h5ad' posterior_out_path = 'dann_vae/spatial/rna_posterior_davae_01.h5ad' file_rna = base_path + 'spatial/mouse_brain/adata_processed_sc.h5ad' rna_anterior_orig = base_path + 'dann_vae/spatial/rna_anterior_orig.h5ad' file1_spatial = base_path + 'spatial/mouse_brain/10x_mouse_brain_Anterior/' file2_spatial = base_path + 'spatial/mouse_brain/10x_mouse_brain_Posterior/' file1 = base_path + 'spatial/mouse_brain/10x_mouse_brain_Anterior/V1_Mouse_Brain_Sagittal_Anterior_filtered_feature_bc_matrix.h5' file2 = base_path + 'spatial/mouse_brain/10x_mouse_brain_Posterior/V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix.h5' figure_umap = base_path + 'dann_vae/spatial/umap.png' adata_spatial_anterior = sc.read_visium(file1_spatial, count_file=file1) adata_spatial_posterior = sc.read_visium(file2_spatial, count_file=file2) adata_spatial_anterior.var_names_make_unique() adata_spatial_posterior.var_names_make_unique() adata_rna = sc.read_h5ad(file_rna) # sc.pp.filter_genes(adata_rna, min_cells=500) # sc.pp.highly_variable_genes(adata_rna, n_top_genes=5000) # features = adata_rna.var_names[adata_rna.var['highly_variable']] # adata_rna = adata_rna[:,features] # adata_rna.write_h5ad(base_path+'spatial/mouse_brain/cortex_for_seurat.h5ad') print(adata_rna) print(adata_spatial_anterior) print(adata_spatial_posterior) adata_spatial_anterior = adata_spatial_anterior[ adata_spatial_anterior.obsm["spatial"][:, 1] < 6000, :] adata_spatial_posterior = adata_spatial_posterior[
def test_read_visium_counts():
    """``read_visium`` must return the same data with and without an explicit ``genome``."""
    visium_pth = ROOT / '../visium_data/1.0.0'

    with_genome = sc.read_visium(visium_pth, genome='GRCh38')
    without_genome = sc.read_visium(visium_pth)

    assert_anndata_equal(with_genome, without_genome)
def Read10X(
    path: Union[str, Path],
    genome: Optional[str] = None,
    count_file: str = "filtered_feature_bc_matrix.h5",
    library_id: Optional[str] = None,
    load_images: Optional[bool] = True,
    quality: _QUALITY = "hires",
    image_path: Optional[Union[str, Path]] = None,
) -> AnnData:
    """\
    Read Visium data from 10X (wrap read_visium from scanpy)

    In addition to reading regular 10x output,
    this looks for the `spatial` folder and loads images,
    coordinates and scale factors.
    Based on the `Space Ranger output docs`_.

    .. _Space Ranger output docs: https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview

    Parameters
    ----------
    path
        Path to directory for visium datafiles.
    genome
        Filter expression to genes within this genome.
    count_file
        Which file in the passed directory to use as the count file. Typically would be one of:
        'filtered_feature_bc_matrix.h5' or 'raw_feature_bc_matrix.h5'.
    library_id
        Identifier for the visium library. Can be modified when concatenating multiple adata objects.
        When `None`, the first key of `adata.uns['spatial']` is used.
    load_images
        Load image or not.
    quality
        Image quality whose pixel coordinates are stored in
        `anndata.obs['imagecol']` and `anndata.obs['imagerow']`.
        With `'fulres'` the spot coordinates are used unscaled and the image
        is read from `image_path`; otherwise the coordinates are multiplied by
        the `tissue_<quality>_scalef` scale factor.
    image_path
        Path to image. Only needed (and then required) when loading the full resolution image.

    Returns
    -------
    Annotated data matrix, where observations/cells are named by their
    barcode and variables/genes by gene name. Stores the following information:

    :attr:`~anndata.AnnData.X`
        The data matrix is stored
    :attr:`~anndata.AnnData.obs_names`
        Cell names
    :attr:`~anndata.AnnData.var_names`
        Gene names
    :attr:`~anndata.AnnData.var`\\ `['gene_ids']`
        Gene IDs
    :attr:`~anndata.AnnData.var`\\ `['feature_types']`
        Feature types
    :attr:`~anndata.AnnData.uns`\\ `['spatial']`
        Dict of spaceranger output files with 'library_id' as key
    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['images']`
        Dict of images (`'fulres'`, `'hires'` and `'lowres'`)
    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['scalefactors']`
        Scale factors for the spots
    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['metadata']`
        Files metadata: 'chemistry_description', 'software_version'
    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['use_quality']`
        The `quality` value passed to this function
    :attr:`~anndata.AnnData.obsm`\\ `['spatial']`
        Spatial spot coordinates, usable as `basis` by :func:`~scanpy.pl.embedding`.

    Raises
    ------
    ValueError
        If `quality` is `'fulres'` and no `image_path` is given.
    """
    # Fail fast: without this check the missing image is only noticed after
    # the whole dataset has been read, with an opaque error from `plt.imread`.
    if quality == "fulres" and image_path is None:
        raise ValueError(
            "`image_path` is required when `quality` is 'fulres'"
        )

    from scanpy import read_visium

    adata = read_visium(
        path,
        genome=genome,
        count_file=count_file,
        library_id=library_id,
        load_images=load_images,
    )
    adata.var_names_make_unique()

    if library_id is None:
        # Fall back to the first library registered by read_visium.
        library_id = list(adata.uns["spatial"].keys())[0]

    library = adata.uns["spatial"][library_id]
    if quality == "fulres":
        # Full resolution: spot coordinates are used unscaled.
        image_coor = adata.obsm["spatial"]
        img = plt.imread(image_path, 0)
        library["images"]["fulres"] = img
    else:
        scale = library["scalefactors"]["tissue_" + quality + "_scalef"]
        image_coor = adata.obsm["spatial"] * scale

    adata.obs["imagecol"] = image_coor[:, 0]
    adata.obs["imagerow"] = image_coor[:, 1]
    library["use_quality"] = quality

    return adata
def read_each(i):
    """Read the Visium dataset at ``i`` and negate the Y spatial coordinate.

    Gene names are made unique. The second column of ``obsm['spatial']`` is
    negated so the slide shows correctly in cellxgene VIP.
    """
    adata = sc.read_visium(i)
    adata.var_names_make_unique()

    # flip Y axis to show correctly in cellxgene VIP
    flipped_y = -adata.obsm['spatial'][:, 1]
    adata.obsm['spatial'][:, 1] = flipped_y

    return adata
adata_spatial.obs['celltype'] = pred_type # adata_davae.obs['cell type'] = all_type adata_spatial.write_h5ad(base_path + 'dann_vae/spatial/'+type+'_label_02.h5ad') base_path = '/Users/zhongyuanke/data/' file1 = base_path + 'spatial/mouse_brain/10x_mouse_brain_Anterior/' \ 'V1_Mouse_Brain_Sagittal_Anterior_filtered_feature_bc_matrix.h5' file2 = base_path + 'spatial/mouse_brain/10x_mouse_brain_Posterior/' \ 'V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix.h5' file1_spatial = base_path+'spatial/mouse_brain/10x_mouse_brain_Anterior/' file2_spatial = base_path+'spatial/mouse_brain/10x_mouse_brain_Posterior/' rna_path = base_path+'spatial/mouse_brain/adata_processed_sc.h5ad' adata1 = sc.read_visium(file1_spatial, count_file=file1) adata2 = sc.read_visium(file2_spatial, count_file=file2) adata_rna = sc.read_h5ad(rna_path) adata1.var_names_make_unique() adata2.var_names_make_unique() adata1 = adata1[ adata1.obsm["spatial"][:, 1] < 6000, : ] adata2 = adata2[ (adata2.obsm["spatial"][:, 1] < 4000) & (adata2.obsm["spatial"][:, 0] < 6000), :, ] deep_label_transfer(adata2, adata_rna, type='posterior')