def annotate_cell_cycle_scores_human(
        adata_results_file,
        cell_cycle_file='/ahg/regevdata/users/oursu/code/general_data/cellcycle/regev_lab_cell_cycle_genes.txt'):
    cell_cycle_genes = [x.strip() for x in open(cell_cycle_file)]
    # The Regev lab list contains 43 S-phase genes followed by the G2/M genes.
    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]

    adata_cellcycle = sc.read(adata_results_file + '.basic.h5ad')
    sc.pp.log1p(adata_cellcycle)
    sc.pp.scale(adata_cellcycle)
    sc.tl.score_genes_cell_cycle(adata_cellcycle, s_genes=s_genes,
                                 g2m_genes=g2m_genes)

    # Now, assign the cell cycle scores from adata_cellcycle to adata_annotated.
    adata_annotated = sc.read(adata_results_file + '.basic.h5ad')
    s_scores = adata_cellcycle.obs['S_score']
    g2m_scores = adata_cellcycle.obs['G2M_score']
    adata_annotated.obs['S_score_added'] = s_scores.loc[adata_annotated.obs_names]
    adata_annotated.obs['G2M_score_added'] = g2m_scores.loc[adata_annotated.obs_names]
    adata_annotated.write(adata_results_file + '.basic.cc.h5ad')
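# Usage sketch for annotate_cell_cycle_scores_human -- the prefix below is
# hypothetical; the function expects '<prefix>.basic.h5ad' to exist and writes
# the scored object to '<prefix>.basic.cc.h5ad'.
annotate_cell_cycle_scores_human(
    'results/sample1',
    cell_cycle_file='regev_lab_cell_cycle_genes.txt')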
def __init__(self, ah5_path, scanpy_pca):
    self.prefix = ah5_path.split("/")[0]
    self.scanpy_pca = scanpy_pca
    self.adata_dict = {}
    # Create output directories
    for output_dir in glob.glob(ah5_path + "/*"):
        # Made change for in vitro data
        if output_dir.split("/")[-1] == 'preprocessing_summary':
            continue
        try:
            if not os.path.exists(output_dir + "/principle_component_matrices"):
                os.makedirs(output_dir + "/principle_component_matrices")
            if not os.path.exists(output_dir + "/principle_component_analysis_figures"):
                os.makedirs(output_dir + "/principle_component_analysis_figures")
        except OSError:
            print("Error creating directory")
    for processed_file in glob.glob(ah5_path + "*/gene_matrices/*.h5ad"):
        # Made changes for in vitro data
        if len(processed_file.split('/')[-1].split('_')) <= 2:
            print(processed_file)
            self.adata_dict[processed_file.split('/')[-1].split('.')[0]] = sc.read(processed_file)
def generate_cluster_expression_output_file(sample_cluster_dict,
                                            raw_log_scanpy_obj, output_dir):
    """
    :param sample_cluster_dict: dict mapping sample keys to clustered AnnData objects
    :param raw_log_scanpy_obj: path to the raw, log-transformed AnnData (.h5ad) file
    :param output_dir: directory the per-cluster expression tables are written to
    :return: None
    """
    for sample_key, adata in sample_cluster_dict.items():
        sample_cluster_dict[sample_key].raw = sc.read(raw_log_scanpy_obj)
        for cluster in list(set(adata.obs.louvain)):
            cluster_specific_cells = sample_cluster_dict[sample_key].obs.loc[(
                sample_cluster_dict[sample_key].obs.louvain == cluster
            )].index.tolist()
            sample_cluster_subset = sample_cluster_dict[sample_key][
                cluster_specific_cells, :]
            try:
                # This works for sparse matrices.
                subset_df = pd.DataFrame(
                    data=sample_cluster_subset.raw.X.toarray(),
                    index=sample_cluster_subset.obs.index.tolist(),
                    columns=sample_cluster_subset.raw.var.index.tolist())
            except AttributeError:
                # AttributeError occurs for dense matrices (no toarray()).
                subset_df = pd.DataFrame(
                    data=sample_cluster_subset.raw.X,
                    index=sample_cluster_subset.obs.index.tolist(),
                    columns=sample_cluster_subset.raw.var.index.tolist())
            subset_df.to_csv(output_dir + sample_key + '/' + sample_key +
                             '_cluster_' + cluster + '.tsv',
                             sep='\t', index=True)
def read_raw_file(self):
    """
    Reads the raw data file, converts it to a dense matrix and stores it
    in the `sc_raw` attribute.

    Returns
    -------
    None
    """
    print("reading single cell data from {}".format(self.raw_file))
    file_format = self.raw_file.split('.')[-1]
    if file_format == 'h5':
        andata = sc.read_10x_h5(self.raw_file)
    elif file_format == 'h5ad':
        andata = sc.read(self.raw_file)
    else:
        raise ValueError('Reading [ %s ] failed, the inferred file '
                         'format [ %s ] is not supported. Please convert '
                         'your file to either h5 or h5ad format.'
                         % (self.raw_file, file_format))
    # Appends -1, -2, ... to the names of genes that already exist.
    andata.var_names_make_unique()
    if sp_sparse.issparse(andata.X):
        andata.X = andata.X.toarray()
    self.sc_raw = andata
def load_10x_scanpy(path, batch_label):
    sc01 = sc.read('{}/matrix.mtx'.format(path), cache=True).T
    sc01.var_names = pd.read_table('{}/genes.tsv'.format(path), header=None)[1]
    sc01.obs_names = pd.read_table('{}/barcodes.tsv'.format(path), header=None)[0]
    sc01.obs_names = sc01.obs_names.str.replace('-1', '')
    sc01.var_names_make_unique()

    sc.pp.filter_cells(sc01, min_genes=200)
    sc.pp.filter_genes(sc01, min_cells=3)

    sc01.obs['n_UMI'] = np.sum(sc01.X, axis=1).A1
    mito_genes = sc01.var_names[sc01.var_names.str.match(r'^mt-')]
    sc01.obs['percent_mito'] = np.sum(sc01[:, mito_genes].X, axis=1).A1 / sc01.obs['n_UMI']
    ribo_genes = sc01.var_names[sc01.var_names.str.match(
        r'^(Rpl|Rps|Mrpl|Mrps)')]
    sc01.obs['percent_ribo'] = np.sum(sc01[:, ribo_genes].X, axis=1).A1 / sc01.obs['n_UMI']

    assgn = pd.read_csv('{}/{}_assgn.csv'.format(
        os.path.join(CUR_DIR, '..', '01-cluster-sc01-sc02'),
        batch_label,
    ), index_col=0)
    assgn.columns = ['cluster']
    sc01.obs['cluster'] = assgn.cluster[sc01.obs.index]
    return sc01
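# Usage sketch for load_10x_scanpy -- 'data/sc01' and the batch label are
# hypothetical; the path must contain matrix.mtx, genes.tsv and barcodes.tsv,
# and a matching '<batch_label>_assgn.csv' must exist under
# 01-cluster-sc01-sc02 relative to CUR_DIR.
sc01 = load_10x_scanpy('data/sc01', 'sc01')
print(sc01.obs[['n_UMI', 'percent_mito', 'percent_ribo', 'cluster']].head())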
def read_dataset(adata, transpose=False, test_split=False, copy=False):
    if isinstance(adata, sc.AnnData):
        if copy:
            adata = adata.copy()
    elif isinstance(adata, str):
        adata = sc.read(adata, first_column_names=True)
    else:
        raise NotImplementedError

    # Check that observations are unnormalized using the first 10 rows of X.
    X_subset = adata.X[:10]
    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if transpose:
        adata = adata.transpose()

    if test_split:
        train_idx, test_idx = train_test_split(np.arange(adata.n_obs),
                                               test_size=0.1,
                                               random_state=42)
        spl = pd.Series(['train'] * adata.n_obs)
        spl.iloc[test_idx] = 'test'
        adata.obs['dca_split'] = spl.values
    else:
        adata.obs['dca_split'] = 'train'
    adata.obs['dca_split'] = adata.obs['dca_split'].astype('category')

    print('dca: Successfully preprocessed {} genes and {} cells.'.format(
        adata.n_vars, adata.n_obs))
    return adata
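# Usage sketch for read_dataset -- 'counts.h5ad' is a hypothetical file of raw
# (integer) counts; test_split=True marks a random 10% of cells as 'test' in
# adata.obs['dca_split'].
adata = read_dataset('counts.h5ad', test_split=True)
print(adata.obs['dca_split'].value_counts())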
def main():
    # Check that there is exactly one command-line argument provided.
    if len(sys.argv) != 2:
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    anndata = sc.read(sys.argv[1])

    # Run PCA, compute pairwise distances and k-nearest-neighbors.
    # The trick to replacing the scanpy implementation with our own is to
    # update the anndata object with our intermediate values:
    #   - replace the scanpy PCA by setting adata.obsm['X_pca'] to our PCA() output
    #   - replace the scanpy k-nearest-neighbors by setting
    #     adata.uns['neighbors']['connectivities'] and
    #     adata.uns['neighbors']['distances'] to our knn() output
    knng = KnnG(anndata, n_neighbors=12, runPCA=True, nPC=50)

    # umap() reduces the results to 2 dimensions so that we can plot the data.
    sc.tl.umap(anndata)

    # 1.c. [5 pts] Turn in a UMAP plot of your 12-NN graph calculated from the
    # combined chemistry PBMC dataset, colored by batch (the chemistry used).
    sc.pl.umap(anndata, color=['Method'])

    # 1.d. [5 pts] Turn in another UMAP plot of your 12-NN graph calculated from
    # the combined chemistry PBMC dataset, but colored by cell type.
    sc.pl.umap(anndata, color=['Cell type'])
def testAdata(self):
    '''
    The real deal
    '''
    self.logger.info("BEGIN")
    anndata = sc.read("../PBMC.merged.h5ad")
    xxx = anndata.uns['neighbors']['connectivities']
    # anndata.uns['neighbors']['connectivities'] is a csr_matrix
    # with shape (15476, 15476).
    #
    # adata.obs['louvain'] is a Series indexed by cell barcode:
    #   data_3p-AAACCTGAGCATCATC-0    9
    #   data_3p-AAACCTGAGCTAGTGG-0    5
    # with shape (15476,).

    # KnnG takes about 3 or 4 minutes.
    # Run our implementation of nearest neighbors and update anndata.
    # TODO: try running without KnnG; adata may already have values, which would save time.
    KnnG(anndata, n_neighbors=12, runPCA=True, nPC=50)

    self.logger.info("begin Louvain.runWithAdata")
    start = timer()
    root = Louvain.runWithAdata(anndata)
    end = timer()
    self.logger.info("Louvain.runWithAdata execution time:{}"
                     .format(timedelta(seconds=end - start)))
    self.logger.info("END\n")
def Smillie2019_processed():
    """Processed data from Smillie et al., Intra- and Inter-cellular Rewiring
    of the Human Colon during Ulcerative Colitis. Cell. 2019.

    The data consists of processed single cell expression data from colon
    mucosa of 7 ulcerative colitis (UC) patients and 10 healthy controls,
    with paired samples (inflamed and non-inflamed for UC, location-matched
    for healthy): 34 samples in total. Epithelial (EPI) and lamina propria
    (LP) fractions were enriched in a two-step digestion process. Data was
    filtered, batch corrected using BBKNN, and cell types were annotated.

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.

    Example
    -------
    >>> import besca as bc
    >>> adata = bc.datasets.Smillie2019_processed()
    >>> adata
    """
    filename = pkg_resources.resource_filename(
        'besca', 'datasets/data/Smillie2019_processed.h5ad')
    adata = read(filename, cache=True)
    return adata
def read10xData(path, min_genes):
    """
    Reads, processes and returns single cell data.

    Parameters
    ----------
    path : str
        Directory path, the location of the single cell data.
    min_genes : int
        The minimum number of genes a cell must express to be included
        in the analysis.

    Returns
    -------
    result : AnnData
        Single cell data.
    """
    result = sc.read(path + 'matrix.mtx').transpose()  # cache=True
    result.var_names = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
    result.obs_names = np.genfromtxt(path + 'barcodes.tsv', dtype=str)
    result.var_names_make_unique()
    result.obs['n_counts'] = np.sum(result.X, axis=1).A1
    sc.pp.filter_cells(result, min_genes=min_genes)
    return result
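# Usage sketch for read10xData -- 'data/filtered_matrices/' is hypothetical and
# must contain matrix.mtx, genes.tsv and barcodes.tsv; note the trailing slash,
# since the function concatenates file names onto the path directly.
scData = read10xData('data/filtered_matrices/', min_genes=200)
print(scData)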
def generate_sample_expression_output_file(sample_cluster_dict,
                                           raw_log_scanpy_obj, output_dir):
    """
    :param sample_cluster_dict: dict mapping sample keys to clustered AnnData objects
    :param raw_log_scanpy_obj: path to the raw, log-transformed AnnData (.h5ad) file
    :param output_dir: directory the per-sample expression tables are written to
    :return: None
    """
    for sample_key, adata in sample_cluster_dict.items():
        sample_cluster_dict[sample_key].raw = sc.read(raw_log_scanpy_obj)
        # sample_expression_df = pd.DataFrame(adata.X, index=adata.obs.index.tolist(), columns=adata.var.index.tolist())
        try:
            # This works for sparse matrices.
            sample_expression_df = pd.DataFrame(
                data=sample_cluster_dict[sample_key].raw.X.toarray(),
                index=sample_cluster_dict[sample_key].obs.index.tolist(),
                columns=sample_cluster_dict[sample_key].raw.var.index.tolist())
        except AttributeError:
            # AttributeError occurs for dense matrices (no toarray()).
            sample_expression_df = pd.DataFrame(
                data=sample_cluster_dict[sample_key].raw.X,
                index=sample_cluster_dict[sample_key].obs.index.tolist(),
                columns=sample_cluster_dict[sample_key].raw.var.index.tolist())
        sample_expression_df.to_csv(output_dir + sample_key + '_log_expression.tsv',
                                    sep='\t', index=True)
def __init__(self, pca_h5ad, n_pcs, n_neighbors):
    self.adata_dict = {}
    self.prefix = pca_h5ad
    for processed_file in glob.glob(
            pca_h5ad + "*/principle_component_matrices/*.h5ad"):
        self.adata_dict[processed_file.split('/')[-3]] = sc.read(processed_file)
    for output_dir in glob.glob(pca_h5ad + "*/"):
        # Globbing with a trailing slash yields paths ending in '/', so strip
        # it before comparing the directory name.
        if output_dir.rstrip("/").split("/")[-1] == 'preprocessing_summary':
            continue
        try:
            if not os.path.exists(output_dir + '/cluster_matrices'):
                os.makedirs(output_dir + '/cluster_matrices')
            if not os.path.exists(output_dir + "/cluster_analysis"):
                os.makedirs(output_dir + "/cluster_analysis")
            # if not os.path.exists(output_dir + "/cluster_analysis/tSNE"):
            #     os.makedirs(output_dir + "/cluster_analysis/tSNE")
            #
            # if not os.path.exists(output_dir + "/cluster_analysis/umap"):
            #     os.makedirs(output_dir + "/cluster_analysis/umap")
            #
            # if not os.path.exists(output_dir + "/cluster_analysis/louvain"):
            #     os.makedirs(output_dir + "/cluster_analysis/louvain")
        except OSError:
            print("Error creating directory")
    self.n_pcs = n_pcs
    self.n_neighbors = n_neighbors
def read_h5ad(args):
    # Read the input h5ad file.
    dataset = sc.read(args.input)
    print("File read!")
    compute_entropy(dataset)
def getdata(dataset):
    basedir = os.path.abspath(os.path.join(__file__, ".."))
    if dataset == "green":
        adata = sc.read(basedir + "/data/green/green.h5ad")
        process_clusts(adata, "CellType")
    elif dataset == "paul":
        adata = sc.read(basedir + "/data/paul/paul.h5ad")
        process_clusts(adata, "paul15_clusters")
    elif dataset == "zeisel":
        adata = sc.read(basedir + "/data/zeisel/zeisel.h5ad")
        process_clusts(adata, "group")
    elif dataset == "zheng":
        adata = sc.read(basedir + "/data/zheng/fresh_68k_bulk_labels.h5ad")
        process_clusts(adata)
    else:
        raise ValueError("No such dataset")
    return adata
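# Usage sketch for getdata -- assumes the packaged .h5ad files exist under
# data/ next to this module; 'paul' loads the Paul et al. myeloid dataset with
# clusters taken from the 'paul15_clusters' annotation.
adata = getdata('paul')
print(adata)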
def check_dl(filename, url):
    try:
        adata = read(filename, backup_url=url, cache=True)
    except Exception:
        raise URLError(
            f'\n\n\n {filename} could not be downloaded from {url}; \n '
            f'Please download it manually and store it in your besca '
            f'installation: besca/datasets/data/')
    return adata
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    train_path = f"../data/train_{data_name}.h5ad"
    if data_name == "pbmc":
        ctrl_key = "control"
        stim_key = "stimulated"
        cell_type_key = "cell_type"
    elif data_name == "hpoly":
        ctrl_key = "Control"
        stim_key = "Hpoly.Day10"
        cell_type_key = "cell_label"
    elif data_name == "salmonella":
        ctrl_key = "Control"
        stim_key = "Salmonella"
        cell_type_key = "cell_label"
    data = sc.read(train_path)
    print("data has been loaded!")
    train = data[~((data.obs["condition"] == stim_key) &
                   (data.obs[cell_type_key] == cell_type))]

    pca = PCA(n_components=100)
    pca.fit(train.X.A)

    # Use ctrl_key/stim_key here rather than the pbmc-specific literals
    # "control"/"stimulated", so the hpoly and salmonella datasets also work.
    train_real_cd = train[train.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = train[train.obs["condition"] == stim_key, :]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    import scipy.sparse as sparse
    if sparse.issparse(train_real_cd.X):
        train_real_cd.X = train_real_cd.X.A
        train_real_stimulated.X = train_real_stimulated.X.A
    train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)
    train_real_cd_PCA = pca.transform(train_real_cd.X)

    adata_list = scgen.util.extractor(data, cell_type,
                                      {"ctrl": ctrl_key, "stim": stim_key})
    if sparse.issparse(adata_list[1].X):
        adata_list[1].X = adata_list[1].X.A
        adata_list[2].X = adata_list[2].X.A
    ctrl_CD4T_PCA = pca.transform(adata_list[1].X)
    predicted_cells = predict(pca, train_real_cd_PCA,
                              train_real_stimulated_PCA, ctrl_CD4T_PCA,
                              p_type)

    all_Data = sc.AnnData(
        np.concatenate([adata_list[1].X, adata_list[2].X, predicted_cells]))
    all_Data.obs["condition"] = ["ctrl"] * len(adata_list[1].X) + \
                                ["real_stim"] * len(adata_list[2].X) + \
                                ["pred_stim"] * len(predicted_cells)
    all_Data.var_names = adata_list[3].var_names

    if p_type == "unbiased":
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_Data)
    else:
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad", all_Data)
def __init__(self, cluster_h5ad, marker_gene_file, cell_type):
    self.prefix = cluster_h5ad
    self.cell_type = cell_type
    marker_subdirs = ["tSNE", "louvain", "umap", "heatmap", "dotplot",
                      "cluster_gene_rankings"]
    cell_type_subdirs = ["tSNE", "louvain", "umap"]
    for clusters in glob.glob(self.prefix + "/*"):
        if clusters.split("/")[-1] == 'preprocessing_summary':
            continue
        try:
            # Create marker gene analysis directories (os.makedirs also
            # creates the intermediate cluster_analysis directories).
            for sub in marker_subdirs:
                path = clusters + "/cluster_analysis/marker_gene_analysis/" + sub
                if not os.path.exists(path):
                    os.makedirs(path)
            # Create cell type analysis directories.
            for sub in cell_type_subdirs:
                path = clusters + "/cluster_analysis/cell_type_analysis/" + sub
                if not os.path.exists(path):
                    os.makedirs(path)
        except OSError:
            print("Error creating directory.")

    # Create dictionary mapping each sample to its cluster matrices.
    self.cluster_matrices_dict = {}
    for cluster_matrices in glob.glob(self.prefix + "/*/cluster_matrices/*"):
        self.cluster_matrices_dict[cluster_matrices.split('/')[-3]] = sc.read(cluster_matrices)

    # Create marker gene list from the provided txt file.
    self.marker_gene_file = marker_gene_file
    if self.marker_gene_file:
        self.marker_gene_list = [line.strip() for line in marker_gene_file.readlines()]
def adata_neighbors():
    adata = sc.read('./data/pbmc3k_raw.h5ad',
                    backup_url='http://falexwolf.de/data/pbmc3k_raw.h5ad')
    sc.pp.filter_genes(adata, min_cells=1)
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    return adata
def reconstruct():
    train_path = "../data/train_pbmc.h5ad"
    data = sc.read(train_path)
    ctrl_key = "control"
    stim_key = "stimulated"
    all_data = anndata.AnnData()
    print(data.obs["cell_type"].unique().tolist())
    for idx, cell_type in enumerate(data.obs["cell_type"].unique().tolist()):
        pca = PCA(n_components=100)
        train = data[~((data.obs["condition"] == stim_key) &
                       (data.obs["cell_type"] == cell_type))]
        pca.fit(train.X.A)
        print(cell_type, end="\t")

        train_real_stimulated = data[data.obs["condition"] == stim_key, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)

        train_real_cd = data[data.obs["condition"] == ctrl_key, :]
        train_real_cd = scgen.util.balancer(train_real_cd)
        train_real_cd_PCA = pca.transform(train_real_cd.X)

        cell_type_adata = data[data.obs["cell_type"] == cell_type]
        cell_type_ctrl = cell_type_adata[cell_type_adata.obs["condition"] == ctrl_key]
        cell_type_stim = cell_type_adata[cell_type_adata.obs["condition"] == stim_key]
        if sparse.issparse(cell_type_ctrl.X):
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X.A)
        else:
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X)
        predicted_cells = predict(pca, train_real_cd_PCA,
                                  train_real_stimulated_PCA,
                                  cell_type_ctrl_PCA)

        if sparse.issparse(cell_type_ctrl.X):
            all_Data = sc.AnnData(
                np.concatenate([cell_type_ctrl.X.A, cell_type_stim.X.A,
                                predicted_cells]))
        else:
            all_Data = sc.AnnData(
                np.concatenate([cell_type_ctrl.X, cell_type_stim.X,
                                predicted_cells]))
        all_Data.obs["condition"] = \
            [f"{cell_type}_ctrl"] * cell_type_ctrl.shape[0] + \
            [f"{cell_type}_real_stim"] * cell_type_stim.shape[0] + \
            [f"{cell_type}_pred_stim"] * len(predicted_cells)
        all_Data.obs["cell_type"] = [f"{cell_type}"] * (
            cell_type_ctrl.shape[0] + cell_type_stim.shape[0] +
            len(predicted_cells))
        all_Data.var_names = cell_type_adata.var_names

        if idx == 0:
            all_data = all_Data
        else:
            all_data = all_data.concatenate(all_Data)
        print(cell_type)
    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
def read_h5ad(x):
    # Read the input h5ad file.
    dataset = sc.read(x)
    kwargs = {}
    kwargs["batch_vector"] = dataset.obs["Batch"]
    kwargs["cell_type_vector"] = dataset.obs["cell_type1"]
    do_the_filtering(dataset, **kwargs)
def load_10x_12k_mix_mouse():
    filename_data = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/matrix.mtx'
    filename_genes = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/genes.tsv'
    filename_barcodes = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/barcodes.tsv'
    data = sc.read(filename_data, cache=True).transpose()
    data.var_names = np.genfromtxt(filename_genes, dtype=str)[:, 1]
    # obs_names replaces the deprecated smp_names attribute.
    data.obs_names = np.genfromtxt(filename_barcodes, dtype=str)
    return data
def paul_test(n_top_gene=100):
    adata = sc.read("data/paul15/paul15.h5ad")
    sc.pp.filter_cells(adata, min_genes=10)
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.filter_genes(adata, min_cells=20)
    sc.pp.filter_genes_dispersion(adata, n_top_genes=1000)
    sc.pp.log1p(adata)
    sc.pp.scale(adata, zero_center=True, max_value=False)
    return adata
def read_dataset(input_file, transpose=False):
    """
    Construct an AnnData object.
    """
    if os.path.isfile(input_file):
        print("The value of os.path.isfile(input_file) is",
              os.path.isfile(input_file))
        if str(input_file).endswith('h5ad'):
            adata = sc.read(input_file)
        elif sum([str(input_file).endswith(str(i))
                  for i in ["tsv", 'TSV', 'tab', 'data']]):
            adata = sc.read_text(input_file, sep="\t", first_column_names=True)
            if transpose:
                adata = adata.T
        elif sum([str(input_file).endswith(str(i)) for i in ['csv', "CSV"]]):
            adata = sc.read_text(input_file, sep=",", first_column_names=True)
            if transpose:
                adata = adata.T
        else:
            # ValueError: the file must be one of *.h5ad, *.tsv, *.TSV, *.tab, *.data, *.csv, *.CSV
            print("The file must be one of *.h5ad, *.tsv, *.TSV, *.tab, "
                  "*.data, *.csv, *.CSV")
    else:
        # Read a 10x-style folder.
        mtx = sc.read_mtx(os.path.join(input_file, "matrix.mtx"))
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'barcodes.tsv')))
        cellinfo = pd.read_csv(os.path.join(input_file, "barcodes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[1] else 0)
        if 'cellname' not in cellinfo.columns:
            cellinfo['cellname'] = cellinfo.iloc[:, 0]
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'genes.tsv')))
        geneinfo = pd.read_csv(os.path.join(input_file, "genes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[0] else 0)
        if 'genename' not in geneinfo.columns:
            # For 10x data, the second column is the gene name and the first column is the gene id.
            geneinfo['genename'] = geneinfo.iloc[:, 1]
        # Create the AnnData object.
        adata = sc.AnnData(mtx.X.T, obs=cellinfo, var=geneinfo)
        adata.obs_names = adata.obs["cellname"]
        adata.var_names = adata.var["genename"]
        adata.obs_names_make_unique(join="-")
        adata.var_names_make_unique(join="-")
    # Record the creation time.
    now = datetime.datetime.now()
    adata.uns["ProjectName"] = "DESC created in " + str(
        now.strftime("%Y-%m-%d %H:%M"))
    print("Created adata successfully! The adata information is", adata)
    return adata
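# Usage sketch for read_dataset -- both inputs are hypothetical. A file path is
# read directly (h5ad/tsv/csv); a directory path is treated as a 10x-style
# folder containing matrix.mtx, barcodes.tsv and genes.tsv.
adata = read_dataset('counts.csv', transpose=True)
adata = read_dataset('filtered_gene_bc_matrices/hg19')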
def getAnnData(matrix, genelist, barcodes):
    sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.settings.autoshow = False
    print('Reading matrix...')
    adata = sc.read(matrix, cache=False).T
    print(adata)
    print('Reading gene list...')
    genes = pd.read_csv(genelist, header=None, sep='\t')
    geneNames = anndata.utils.make_index_unique(pd.Index(genes[1]))
    adata.var_names = geneNames
    adata.var['gene_ids'] = genes[0].values
    adata.obs_names = pd.read_csv(barcodes, header=None)[0]
    adata.var_names_make_unique()
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
    # For each cell, compute the fraction of counts in mito genes vs. all genes.
    # The `.A1` is only necessary as X is sparse: it flattens to a dense array after summing.
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    # Add the total counts per cell as observation annotation to adata.
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    adata = adata[adata.obs['n_genes'] < 2500, :]
    adata = adata[adata.obs['percent_mito'] < 0.05, :]
    adata.raw = sc.pp.log1p(adata, copy=True)
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    filter_result = sc.pp.filter_genes_dispersion(adata.X, min_mean=0.0125,
                                                  max_mean=3, min_disp=0.5)
    adata = adata[:, filter_result.gene_subset]
    sc.pp.log1p(adata)
    sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.louvain(adata)
    sc.pl.umap(adata, color=['louvain'], show=False)
    sc.tl.rank_genes_groups(adata, 'louvain', method='logreg')
    return adata
def main():
    ad_path, cs_name, output, gmt_file = parse_args()
    gene_sets = read_gmt(gmt_file)
    ad = sc.read(ad_path)
    percentage_markers_expressed_in_cluster(ad, cs_name, gene_sets).to_csv(
        output, header=True)
def normalize(adata, copy=True, highly_genes=None, filter_min_counts=True,
              size_factors=True, normalize_input=True, logtrans_input=True):
    """
    Normalizes input data and retains only the most variable genes
    (indicated by the highly_genes parameter).

    Args:
        adata (AnnData or str): annotated data matrix, or a path readable by sc.read.
        copy (bool, optional): work on a copy of adata. Defaults to True.
        highly_genes (int, optional): number of highly variable genes to
            retain. Defaults to None (keep all genes).
        filter_min_counts (bool, optional): filter out genes and cells with
            zero counts. Defaults to True.
        size_factors (bool, optional): compute per-cell size factors and
            normalize counts per cell. Defaults to True.
        normalize_input (bool, optional): scale genes to zero mean and unit
            variance. Defaults to True.
        logtrans_input (bool, optional): log1p-transform the data. Defaults to True.

    Raises:
        NotImplementedError: if adata is neither an AnnData object nor a path.

    Returns:
        AnnData: the normalized data.
    """
    if isinstance(adata, sc.AnnData):
        if copy:
            adata = adata.copy()
    elif isinstance(adata, str):
        adata = sc.read(adata)
    else:
        raise NotImplementedError

    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    assert 'n_count' not in adata.obs, norm_error

    if adata.X.size < 50e6:  # check if adata.X is integer only if array is small
        if sp.sparse.issparse(adata.X):
            assert (adata.X.astype(int) != adata.X).nnz == 0, norm_error
        else:
            assert np.all(adata.X.astype(int) == adata.X), norm_error

    if filter_min_counts:
        sc.pp.filter_genes(adata, min_counts=1)  # 3
        sc.pp.filter_cells(adata, min_counts=1)
    if size_factors or normalize_input or logtrans_input:
        adata.raw = adata.copy()
    else:
        adata.raw = adata
    if size_factors:
        sc.pp.normalize_per_cell(adata)
        adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts)
    else:
        adata.obs['size_factors'] = 1.0
    if logtrans_input:
        sc.pp.log1p(adata)
    if highly_genes is not None:
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3,
                                    min_disp=0.5, n_top_genes=highly_genes,
                                    subset=True)
    if normalize_input:
        sc.pp.scale(adata)
    return adata
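# Usage sketch for normalize -- 'counts.h5ad' is hypothetical and must contain
# raw integer counts; this keeps the 2000 most variable genes and leaves the
# untransformed counts in adata.raw.
adata = normalize('counts.h5ad', highly_genes=2000)
print(adata.obs['size_factors'].head())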
def load_10x_1_3mil_subsample(opt=10):
    if opt == 10:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high10_gene.h5ad'
    elif opt == 5:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high5_gene.h5ad'
    elif opt == 1:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high1_gene.h5ad'
    elif opt == 0.5:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high0.5_gene.h5ad'
    else:
        # Avoid an UnboundLocalError on filename_data for unsupported values.
        raise ValueError('opt must be one of 10, 5, 1 or 0.5')
    data = sc.read(filename_data)
    return data
def readData(self, countsFile=""):
    if countsFile == "":
        countsFile = self.CountsFile
    if countsFile == "":
        print("please input counts file path")
        return ""
    self.CountsFile = countsFile
    datapath = self.CountsFile
    if os.path.isdir(datapath):
        # Decompress any gzipped files in the directory.
        files = os.listdir(datapath)
        for i in files:
            if i.endswith(".gz"):
                print(i)
                target = datapath + "/*.gz"
                print(target)
                command = subprocess.Popen("gunzip " + target, shell=True,
                                           stdin=PIPE, stdout=PIPE,
                                           stderr=STDOUT)
                output = command.stdout.read()
                break
        # Rename features.tsv (Cell Ranger v3 naming) to genes.tsv (v2 naming).
        files = os.listdir(datapath)
        for i in files:
            if i == "features.tsv":
                os.rename(datapath + "/features.tsv", datapath + "/genes.tsv")
                break
        files = list(os.listdir(datapath))
        # The original checked 'barcodes.tsv' twice; a complete 10x directory
        # needs matrix.mtx as well.
        if ('barcodes.tsv' in files) and ('matrix.mtx' in files) and ("genes.tsv" in files):
            adata = sc.read_10x_mtx(datapath, var_names='gene_symbols')
            self.data = adata
            self.preprocess()
        else:
            print("input data is not correct")
            return ""
    elif os.path.isfile(datapath):
        if datapath.endswith(".h5ad"):
            adata = sc.read(datapath)
        else:
            adata = sc.read_csv(datapath)
            adata = adata.T
        self.data = adata
        self.preprocess()
    else:
        print("file or dir does not exist")
        return ""
def load_10x_ercc_1k():
    """
    https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/ercc
    """
    filename_data = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/matrix.mtx'
    filename_genes = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/genes.tsv'
    filename_barcodes = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/barcodes.tsv'
    data = sc.read(filename_data, cache=True).transpose()
    data.var_names = np.genfromtxt(filename_genes, dtype=str)[:, 1]
    data.obs_names = np.genfromtxt(filename_barcodes, dtype=str)
    return data
def preprocessing(data_folder, min_genes=200, min_cells=3, max_genes=7000,
                  mito_cutoff=False, normalize=True):
    """
    Combined function for preprocessing using Scanpy. For more complete
    documentation on preprocessing, please see the Scanpy documentation.

    Input:
        data_folder = Path to the data files
        min_genes = Minimum number of genes a cell must express to be kept
                    (default 200)
        min_cells = Minimum number of cells a gene must be detected in to be
                    kept (default 3)
        max_genes = Maximum number of genes a cell may express to be kept
                    (default 7000)
        mito_cutoff = Maximum fraction of counts assigned to mitochondrial
                      genes for a cell to be kept (default False, i.e. no
                      filtering)
        normalize = Normalize the AnnData object (default True)

    Returns an AnnData object built from matrix - genes - barcodes. Full
    documentation on AnnData can be found on GitHub.
    """
    # Read data and create the initial AnnData object.
    path = '{}/'.format(data_folder)
    adata = sc.read(path + 'matrix.mtx', cache=True).T  # transpose the data
    adata.var_names = pd.read_csv(path + 'genes.tsv', header=None, sep='\t')[1]
    adata.obs_names = pd.read_csv(path + 'barcodes.tsv', header=None)[0]
    adata.var_names_make_unique()

    # Filter with min_genes per cell, max_genes per cell, min_cells per gene.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata = adata[adata.obs['n_genes'] < max_genes, :]

    # Add the total counts per cell as observation annotation to adata.
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

    # Compute the mitochondrial count fraction and filter on it if a cutoff is
    # given (percent_mito is a fraction, so pass e.g. 0.05 rather than 5).
    mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    if mito_cutoff:
        adata = adata[adata.obs['percent_mito'] < float(mito_cutoff), :]

    # Normalize data if requested.
    if normalize:
        sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    return adata
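# Usage sketch for preprocessing -- 'data/pbmc' is a hypothetical folder with
# matrix.mtx, genes.tsv and barcodes.tsv; mito_cutoff is a fraction, so 0.05
# drops cells with more than 5% mitochondrial counts.
adata = preprocessing('data/pbmc', min_genes=200, min_cells=3,
                      max_genes=7000, mito_cutoff=0.05)
print(adata)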