def read_file(filename, transpose=False):
    """Read an expression matrix from an H5AD/CSV/TXT file or a 10x-MTX directory."""
    adata = None
    if os.path.isdir(filename):
        adata = sc.read_10x_mtx(filename)
    elif os.path.isfile(filename):
        name, filetype = os.path.splitext(filename)
        if filetype == ".txt":
            adata = sc.read_text(filename)
        elif filetype == ".csv":
            adata = sc.read_csv(filename)
        elif filetype == ".h5ad":
            adata = sc.read(filename)
        else:
            print(
                "ERROR: the format must be [H5AD|CSV|TXT] for file or 10x-MTX for directory."
            )
            sys.exit()
    else:
        sys.exit("ERROR: no such file or directory.")
    if transpose:
        adata = adata.transpose()
    # Densify a sparse matrix so downstream code can rely on a numpy array.
    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata
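# Hedged usage sketch (not in the original source): the paths below are
# hypothetical, and read_file() assumes `os`, `sys`, `numpy as np`, `anndata`
# and `scanpy as sc` are imported at module level.
adata_h5ad = read_file("pbmc_counts.h5ad")
adata_10x = read_file("filtered_feature_bc_matrix/", transpose=False)  # 10x-MTX directory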
def reading(path):
    log.info("Reading single cell expression matrix.")
    if path.endswith(".h5"):
        adata = sc.read_h5ad(path)
    else:
        adata = sc.read_text(path)
    adata.var_names_make_unique()
    return adata
def process_mereu(root_dir):
    """
    In this case, because names are informative, we only need to download the data,
    read the csv files and output the adatas.
    """
    tsv_dir = root_dir + "/tsv/"
    df_cell_types_human = pd.read_csv(root_dir + "/cell_types/human.csv",
                                      index_col="colnames")
    df_cell_types_mouse = pd.read_csv(root_dir + "/cell_types/mouse.csv",
                                      index_col="colnames")
    list_techniques = [
        "CELseq2",
        "Dropseq",
        "QUARTZseq",
        "SMARTseq2",
        "SingleNuclei",
        "ddSEQ",
        "inDrop",
        "10X",
    ]
    file_list = os.listdir(tsv_dir)
    for technique in list_techniques:
        for org in ["mouse", "human"]:  # TODO: add mouse when I have the df
            print(technique, org)
            file_select = [
                f for f in file_list if (technique in f) and (org in f)
            ][0]
            adata = sc.read_text(tsv_dir + file_select).transpose()
            adata.var_names_make_unique()
            if org == "human":
                cells_select = np.intersect1d(df_cell_types_human.index.values,
                                              adata.obs_names.values)
                cell_types = (
                    df_cell_types_human["cell_types"].loc[cells_select].values)
            else:
                cells_select = np.intersect1d(df_cell_types_mouse.index.values,
                                              adata.obs_names.values)
                cell_types = (
                    df_cell_types_mouse["cell_types"].loc[cells_select].values)
            len_before, len_after = len(adata.obs_names), len(cells_select)
            print(
                f"{len_before} before removal, {len_after} after cell removal."
            )
            adata = adata[cells_select]
            adata.obs["cell_types"] = cell_types
            sc.pp.filter_genes(adata, min_cells=5)
            adata = ensembl2symbol(adata, root_dir[:-1], org, ".")
            adata.write_h5ad(root_dir + f"{technique}_{org}.h5ad")
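# Hedged usage sketch (not in the original source): the directory name is
# hypothetical. process_mereu() builds paths as root_dir + "/tsv/" and strips
# the last character via root_dir[:-1], so a trailing slash is assumed; it also
# depends on the ensembl2symbol() helper defined elsewhere in the project.
process_mereu("mereu_benchmark/")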
def get_single_batch(row_tpl, col_names, experiments_data_dir):
    # row_tpl is an (index, row) tuple as produced by DataFrame.iterrows().
    row = row_tpl[1]
    cur_data = sc.read_text(
        Path(experiments_data_dir,
             row[meta_data_columns_names.BATCH_ID] + ".txt"))
    cur_data = cur_data.T
    for col_name in col_names:
        cur_data.obs[col_name] = row[col_name]
    logging.info(
        f"Reading batch id - {row[meta_data_columns_names.BATCH_ID]}")
    return cur_data
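# Hedged usage sketch (not in the original source): `metadata_df` and the column
# names are hypothetical; meta_data_columns_names.BATCH_ID is the same module
# constant that get_single_batch() already relies on.
batches = [
    get_single_batch(row_tpl, ["condition", "donor"], "experiments_data")
    for row_tpl in metadata_df.iterrows()
]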
def cluster(inputMat, modelName):
    print('Working on {} cells and {} genes'.format(*inputMat.shape))
    dataPath = '/home/ahmadazim/data/modelImputations' + modelName + '.txt'
    # Output data as txt file
    np.savetxt(dataPath, inputMat)
    # Import data (export and then import to keep record of data)
    data = sc.read_text(dataPath)
    print("Data imported.")
    data.var_names_make_unique()
    data.var['mt'] = data.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(data, qc_vars=['mt'], percent_top=None,
                               log1p=False, inplace=True)
    sc.pl.violin(data, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                 jitter=0.4, multi_panel=True)
    maxGene = input(
        "Filter out all cells with n_genes_by_counts greater than: ")
    maxMT = input(
        "Filter out all cells with pct_counts_mt greater than (input \"NA\" to ignore): "
    )
    data = data[data.obs.n_genes_by_counts < int(maxGene), :]
    if maxMT != "NA":
        # pct_counts_mt is a percentage, so allow fractional thresholds
        data = data[data.obs.pct_counts_mt < float(maxMT), :]
    sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
    data = data[:, data.var.highly_variable]
    sc.pp.regress_out(data, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(data, max_value=10)
    print("QC steps done.")
    sc.tl.pca(data, svd_solver='arpack')
    sc.pp.neighbors(data, n_neighbors=10, n_pcs=40)
    sc.tl.umap(data)
    sc.tl.leiden(data)
    print("Plotting PCA and UMAP...")
    sc.pl.pca(data, color='leiden', save=modelName + '.png')
    sc.pl.umap(data, color='leiden', save=modelName + '.png')
def txt_to_hfad(dge_in, dge_out):
    # Read the DGE text matrix and write a transposed AnnData to .h5ad:
    # the input's columns become observations and its rows become variables.
    k = sc.read_text(dge_in)
    n_obs = len(k.var)
    n_var = len(k.obs)
    k_t = anndata.AnnData(X=None, shape=(n_obs, n_var))
    k_t.X = k.X.transpose()
    k_t.obs = k.var
    k_t.var = k.obs
    k_t.write(dge_out, compression='gzip')
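# Hedged usage sketch (not in the original source): the file names are
# hypothetical placeholders for a genes-by-cells DGE text matrix.
txt_to_hfad("sample1_dge.txt", "sample1_dge.h5ad")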
def ReadOldST(
    count_matrix_file: Union[str, Path] = None,
    spatial_file: Union[str, Path] = None,
    image_file: Union[str, Path] = None,
    library_id: str = "OldST",
    scale: float = 1.0,
    quality: str = "hires",
    spot_diameter_fullres: float = 50,
) -> AnnData:
    """\
    Read Old Spatial Transcriptomics data

    Parameters
    ----------
    count_matrix_file
        Path to the count matrix file.
    spatial_file
        Path to the spatial location file.
    image_file
        Path to the tissue image file.
    library_id
        Identifier for the Visium library. Can be modified when concatenating multiple adata objects.
    scale
        Scale factor to apply.
    quality
        Image quality that stlearn converts to and uses; coordinates are stored in
        anndata.obs['imagecol' & 'imagerow'].
    spot_diameter_fullres
        Diameter of a spot in full resolution.

    Returns
    -------
    AnnData
    """
    adata = scanpy.read_text(count_matrix_file)
    adata = stlearn.add.parsing(adata, coordinates_file=spatial_file)
    stlearn.add.image(
        adata,
        library_id=library_id,
        quality=quality,
        imgpath=image_file,
        scale=scale,
        spot_diameter_fullres=spot_diameter_fullres,
    )
    return adata
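# Hedged usage sketch (not in the original source): the three file paths are
# hypothetical placeholders for an old-style ST dataset.
adata = ReadOldST(
    count_matrix_file="st_counts.tsv",
    spatial_file="spot_coordinates.tsv",
    image_file="tissue_he_image.jpg",
    library_id="OldST",
    quality="hires",
)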
import sys, os
import numpy as np
import scanpy as sc
from scanpy.tools._utils import get_init_pos_from_paga as get_paga

wd = sys.argv[1]
adata = sc.read_text(filename="{}/NormExpr.txt".format(wd))
sc.pp.neighbors(adata, use_rep='X', n_neighbors=30)
sc.tl.leiden(adata)
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata)
sc.tl.umap(adata, init_pos=get_paga(adata), n_components=2)
np.savetxt(X=adata.obsm['X_umap'],
           fname='{}/UMAP_Paga.txt'.format(wd),
           delimiter='\t')
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

#%% Load prenormed data and metadata
input_file = 'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_norm.txt'
metadata = pd.read_csv(
    'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_metadata.txt',
    sep='\t',
    index_col=0)
adata = sc.read_text(input_file).T
adata.obs = metadata
all_adata = adata.copy()
VARIANT = 'all'


#%%
def exploratory_plots(adata):
    num_non_int = (adata.to_df().applymap(
        float.is_integer) == False).sum().sum()
    print('Num non-int: ', num_non_int)

    plt.figure()
    sc.pp.filter_cells(adata, min_genes=0)
    plt.hist(adata.obs.n_genes, bins=500)
    plt.title('Genes per cell')
def upload(pathname):
    import anndata
    filename, file_extension = os.path.splitext(pathname)
    if file_extension == ".mat":
        x = loadmat(pathname)
        keys = []
        for key in x.keys():
            keys.append(key)
        # obs is the cell
        # var is gene
        # pick the largest array, skipping the first 3 keys
        # (__header__, __version__, __globals__ from loadmat)
        largest = 3
        largest_size = 0
        for i in range(len(keys) - 3):
            if len(x[keys[i + 3]].shape) == 2:
                size = (x[keys[i + 3]].shape[0] * x[keys[i + 3]].shape[1])
            else:
                size = x[keys[i + 3]].shape[0]
            if size >= largest_size:
                largest = i + 3
                largest_size = size
        obs_d, var_d = {}, {}
        for i in range(len(keys) - 3):
            if i != largest - 3:
                if (x[keys[i + 3]].flatten()).shape[0] == (
                        x[keys[largest]]).shape[0]:
                    obs_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                elif (x[keys[i + 3]].flatten()).shape[0] == (
                        x[keys[largest]]).shape[1]:
                    var_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                # else:
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)
        data = anndata.AnnData(X=x[keys[largest]].todense(),
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".npz":
        x = np.load(pathname)
        # pick the largest array in the archive
        largest = 0
        largest_size = 0
        for i in range(len(x.files)):
            if len(x[x.files[i]].shape) == 2:
                size = (x[x.files[i]].shape[0] * x[x.files[i]].shape[1])
            else:
                size = x[x.files[i]].shape[0]
            if size >= largest_size:
                largest = i
                largest_size = size
        obs_d, var_d = {}, {}
        for i in range(len(x.files)):
            if i != largest:
                if len(x[x.files[i]].flatten()) == len(x[x.files[largest]]):
                    obs_d[x.files[i]] = x[x.files[i]].flatten()
                elif len(x[x.files[i]].flatten()) == len(
                        x[x.files[largest]][0]):
                    var_d[x.files[i]] = x[x.files[i]].flatten()
                # else:
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)
        data = anndata.AnnData(X=x[x.files[largest]],
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".mtx":
        data = sc.read_10x_mtx(os.path.dirname(pathname))
        data.X = data.X.todense()
    elif file_extension == ".csv":
        data = sc.read_csv(pathname)
    elif file_extension == ".xlsx":
        data = sc.read_excel(pathname)
    elif file_extension == ".txt":
        data = sc.read_text(pathname)
    else:
        data = sc.read(pathname)
    print(pathname, " uploaded !")
    return data
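# Hedged usage sketch (not in the original source): the file names are
# hypothetical; upload() dispatches on the file extension and falls back to sc.read().
mat_data = upload("expression_matrix.mat")
txt_data = upload("expression_matrix.txt")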
from matplotlib import pyplot as plt
import scanpy as sc
from scipy.sparse import csr_matrix
from soptsc import *
from _probability import *
import networkx as nx
import collections

# First initialise some settings for scanpy
sc.settings.verbosity = 3  # Possible values: (0) errors, (1) warnings, (2) info, (3) hints
sc.settings.set_figure_params(dpi=80, facecolor='white')

# First load the data (we have to take the transpose, because we need the cells to be the rows and genes to be the columns)
# joostdata = sc.read_text('/Users/axelalmet/Documents/scRNASeqData/Joost2016/GSE67602_Joost_et_al_expression.txt').transpose() # Directory with the text file
joostdata = sc.read_text('/Users/axelalmet/Documents/MATLAB/SoptSC/Data/JoostData.txt').transpose()  # Directory with the text file
joostdata.var_names_make_unique()  # If var_names = 'gene_ids', then this step isn't necessary

sc.pp.log1p(joostdata, base=10)  # For some reason Shuxiong does this

### Test that the SoptSC object initialises correctly
joost_soptsc = SoptSC(joostdata)

# Test that we can store variables correctly
pathway_names = ['Tgfb', 'Wnt', 'Bmp']  # Names of the signalling pathways
ligand_receptor_pairs = [[('Tgfb1', 'Tgfbr1'), ('Tgfb1', 'Tgfbr2'), ('Tgfb2', 'Tgfbr1'), ('Tgfb2', 'Tgfbr2')],
                         [('Wnt3', 'Fzd1'), ('Wnt4', 'Fzd1'), ('Wnt5a', 'Fzd1'), ('Wnt6', 'Fzd1'), ('Wnt10a', 'Fzd1')],
                         [('Bmp1', 'Bmpr2'), ('Bmp2', 'Bmpr2'), ('Bmp4', 'Bmpr2'), ('Bmp7', 'Bmpr2')]]  # Names of the ligand-receptor pairs
upregulated_genes = [['Zeb2', 'Smad2', 'Wnt4', 'Wnt11', 'Bmp7', 'Sox9', 'Notch1'],
                     ['Ctnnb1', 'Lgr5', 'Runx2', 'Apc', 'Mmp7', 'Dkk1', 'Ccnd1'],
                     ['Crebbp', 'Fos', 'Id1', 'Jun', 'Runx1', 'Smad1', 'Smad5', 'Sox4', 'Cdh1']]  # The upregulated genes
# scanpy HNSCC.py
import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3
sc.logging.print_versions()
results_file = '/home/ressf/Documenti/RessBachelorsThesisCode/Downstream_analysis/HNSCC/results_scanpy.h5ad'

adata = sc.read_text(
    '/home/ressf/Documenti/RessBachelorsThesisCode/Downstream_analysis/HNSCC/hnscc_clean_trasp.txt',
    delimiter='\t',
    dtype='float32')
adata.var_names_make_unique()
adata

# preprocessing
sc.pl.highest_expr_genes(
    adata,
    n_top=20,
)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata,
                           qc_vars=['mt'],
                           percent_top=None,
                           log1p=False,
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
from plotly.graph_objs import XAxis, YAxis, ZAxis, Scene
from sklearn.decomposition import FastICA as ICA
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.manifold import SpectralEmbedding as LaplacianEigenMaps
from sklearn.manifold import Isomap

# In[2]:

# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)

# In[3]:

adata = sc.read_text("C:/Users/saite/Desktop/Datasets/dataset1.txt")
# adata = sc.read_csv("C:/Users/saite/Desktop/Datasets/wang.csv")
# =============================================================================
# reading PBMC dataset
# adata = sc.read_10x_mtx(
#     'C:/Users/saite/Desktop/Datasets/PBMC/filtered_gene_bc_matrices/hg19',  # the directory with the `.mtx` file
#     var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
#     cache=True)
# =============================================================================
# adata = adata.transpose()
adata.var_names_make_unique()
# adata.obs_names_make_unique()

# In[4]:

print(adata)
sc.pl.highest_expr_genes(adata, n_top=20)
# Computes, for each gene, the fraction of counts assigned to that gene within a cell.
import sys, os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scvelo as scv
import scanpy as sc
import pandas as pd
import loompy

scv.settings.set_figure_params('scvelo')

proj_path = "/Users/kriemo/Projects/publication_repos/lung-scrna/results/revision_2/"
input_path = os.path.join(proj_path, "revision", "geo")

adata = sc.read_text(os.path.join(input_path, "count_matrix.tsv.gz"),
                     delimiter="\t")
adata = adata.T

mdata = pd.read_csv(os.path.join(proj_path, "revision/geo/cell_metadata.tsv.gz"),
                    sep="\t")
adata.obs["cluster"] = np.array(mdata["cluster"])
adata.obs["cell_type"] = np.array(mdata["cell_type"])

# add tSNE projections
tsne_mat = np.column_stack(
    (np.array(mdata["tSNE_1"]), np.array(mdata["tSNE_2"])))
adata.obsm['X_tsne'] = tsne_mat

good_clusters = [
# combine and calculate Alignment score
# all needed data can be obtained from the corresponding GEO database

## load mp data
mpAdata = sc.read_10x_h5("mp_filtered_feature_bc_matrix.h5")
tempTools.plotCellScatter(mpAdata)
mpAdata = mpAdata[mpAdata.obs.eval("500 <= n_genes <= 5000")]

## load science data
scienceAdata = sc.read_10x_h5("science_filtered_gene_bc_matrices.h5")
tempTools.plotCellScatter(scienceAdata)
scienceAdata = scienceAdata[:, ~scienceAdata.var.index.duplicated()]
scienceAdata.var.index = scienceAdata.var.gene_ids

## load dc data
dcAdata = sc.read_text("dc_Root_single_cell_wt_datamatrix.csv", ",")
dcAdata = dcAdata.T
tempTools.plotCellScatter(dcAdata)

## load pp data
ppAdata = sc.read_text("pp_5way_merge_raw.tsv", "\t")
ppAdata = ppAdata.T
ppuseBarcodeLs = list(
    ppAdata.obs.index.str.split("_").map(
        lambda x: x[2] + "-1-" + str(int(x[1][-1]) - 1)
    )
)
ppRawAdatas = [
    sc.read_10x_h5(x)
    for x in [
        "pp_1_filtered_feature_bc_matrix.h5",
def read_sc_data(input_file,
                 fmt='h5ad',
                 backed=None,
                 transpose=False,
                 sparse=False,
                 delimiter=" ",
                 unique_name=True,
                 batch_name=None,
                 var_names="gene_symbols"):
    """\
    Read single cell dataset

    Parameters
    ----------
    input_file : string
        The path of the file to be read.
    fmt : string, optional (default: 'h5ad')
        The file type of the file to be read.
    backed : Union[Literal['r', 'r+'], bool, None] (default: None)
        If 'r', load AnnData in backed mode instead of fully loading it into memory
        (memory mode). If you want to modify backed attributes of the AnnData object,
        you need to choose 'r+'.
    transpose : bool, optional (default: False)
        Whether to transpose the read data.
    sparse : bool, optional (default: False)
        Whether the data in the dataset is stored in sparse matrix format.
    delimiter : str, optional (default: ' ')
        Delimiter that separates data within the text file. If None, will split at an
        arbitrary number of white spaces, which is different from enforcing splitting
        at a single white space ' '.
    unique_name : bool, optional (default: True)
        If True, the AnnData object executes the var_names_make_unique() and
        obs_names_make_unique() functions.
    batch_name : string, optional (default: None)
        Batch name of the current batch data.
    var_names : Literal['gene_symbols', 'gene_ids'] (default: 'gene_symbols')
        The variables index when the file type is 'mtx'.

    Returns
    -------
    :class:`~anndata.AnnData`
        adata
    """
    if fmt == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif fmt == '10x_mtx':
        adata = sc.read_10x_mtx(input_file, var_names=var_names)
    elif fmt == "mtx":
        adata = sc.read_mtx(input_file)
    elif fmt == 'h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif fmt == "csv":
        adata = sc.read_csv(input_file)
    elif fmt == "txt":
        adata = sc.read_text(input_file, delimiter=delimiter)
    elif fmt == "tsv":
        adata = sc.read_text(input_file, delimiter="\t")
    else:
        raise ValueError(
            "`fmt` needs to be one of '10x_h5', '10x_mtx', 'mtx', 'h5ad', 'csv', 'txt' or 'tsv'.")

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    if unique_name:
        adata.var_names_make_unique()
        adata.obs_names_make_unique()
    if batch_name is not None:
        adata.obs["_batch"] = batch_name
    return adata
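# Hedged usage sketch (not in the original source): the input paths are
# hypothetical, and read_sc_data() assumes `scanpy as sc` and
# `scipy.sparse.csr_matrix` are imported in the module.
batch1 = read_sc_data("batch1_counts.tsv", fmt="tsv", transpose=True,
                      sparse=True, batch_name="batch1")
batch2 = read_sc_data("batch2_filtered_feature_bc_matrix", fmt="10x_mtx",
                      batch_name="batch2")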