def read_10x_data(input_file, format_type='10x_h5', backed=None, transpose=False, sparse=False):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == '10x_csv':
        adata = sc.read_csv(input_file)
    elif format_type == '10x_txt':
        adata = sc.read_csv(input_file, delimiter='\t')
    else:
        raise ValueError("`format_type` must be one of '10x_h5', '10x_mtx', '10x_h5ad', '10x_csv' or '10x_txt'")
    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
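# A minimal usage sketch for read_10x_data above. The imports and the example
# file path are illustrative assumptions, not part of the original snippet.
import scanpy as sc
from scipy.sparse import csr_matrix

adata = read_10x_data('filtered_feature_bc_matrix.h5',  # hypothetical path
                      format_type='10x_h5', sparse=True)
print(adata.shape)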
def scanpy_deal():
    if opt.need_transpose:
        src_data = sc.read(opt.matrix_file, first_column_names=True)
    else:
        src_data = sc.read_csv(opt.matrix_file, first_column_names=True)
    print('X:', src_data.X, '\ncells:', src_data.obs, '\ngenes:', src_data.var)
    print('cell name:', src_data.obs_names, '\ngene name:', src_data.var_names)
def load_file(path):
    """
    Load a single-cell dataset from file.

    Parameters
    ----------
    path
        Path where the file is stored.

    Returns
    -------
    AnnData
    """
    if os.path.exists(DATA_PATH + path + '.h5ad'):
        adata = sc.read_h5ad(DATA_PATH + path + '.h5ad')
    elif os.path.isdir(path):  # mtx format
        adata = read_mtx(path)
    elif os.path.isfile(path):
        if path.endswith(('.csv', '.csv.gz')):
            adata = sc.read_csv(path).T
        elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
            df = pd.read_csv(path, sep='\t', index_col=0).T
            adata = AnnData(df.values,
                            dict(obs_names=df.index.values),
                            dict(var_names=df.columns.values))
        elif path.endswith('.h5ad'):
            adata = sc.read_h5ad(path)
    else:
        raise ValueError("File {} does not exist".format(path))
    if not issparse(adata.X):
        adata.X = scipy.sparse.csr_matrix(adata.X)
    adata.var_names_make_unique()
    return adata
def read_file(filename, transpose=False):
    adata = None
    if os.path.exists(filename):
        if os.path.isdir(filename):
            adata = sc.read_10x_mtx(filename)
        elif os.path.isfile(filename):
            name, filetype = os.path.splitext(filename)
            if filetype == ".txt":
                adata = sc.read_text(filename)
            elif filetype == ".csv":
                adata = sc.read_csv(filename)
            elif filetype == ".h5ad":
                adata = sc.read(filename)
            else:
                print("ERROR: the format must be [H5AD|CSV|TXT] for a file or 10x-MTX for a directory.")
                sys.exit()
        if transpose:
            adata = adata.transpose()
    else:
        sys.exit("ERROR: no such file or directory.")
    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata
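# A minimal usage sketch for read_file above; the path is hypothetical and the
# os/sys/numpy/anndata/scanpy imports the function relies on are assumed in scope.
adata = read_file('counts.csv', transpose=True)  # transpose if genes are rows in the CSV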
def preliminaryAnalysis():
    # f1. Read data
    idata = scanpy.read_csv('/Volumes/omics4tb2/alomana/projects/mscni/data/scanpy/count.file.all.day.clean.csv')
    adata = idata.transpose()

    # f2. Preprocessing
    scanpy.pp.filter_cells(adata, min_genes=200)
    scanpy.pp.filter_genes(adata, min_cells=3)
    adata.obs['n_counts'] = adata.X.sum(axis=1)
    scanpy.pp.normalize_per_cell(adata, counts_per_cell_after=1e5)
    scanpy.pp.log1p(adata)
    adata.raw = adata
    scanpy.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=6, min_disp=0.25)  # 2,851
    adata = adata[:, adata.var['highly_variable']]
    scanpy.pp.regress_out(adata, ['n_counts'])
    scanpy.pp.scale(adata, max_value=10)
    scanpy.tl.pca(adata, svd_solver='arpack')  ### there seems to be a bug here
    return adata
def cell_grouping(condition):
    adata = sc.read_csv('scRecover+scImpute_' + condition + '_condition.csv').transpose()
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.leiden(adata)
    sc.pl.umap(adata, color='leiden')

    # Label cells by comparing marker expression to its average
    raw = pd.DataFrame(data=adata.X, columns=adata.var_names)
    avg_gata = np.average(raw['Gata2'].to_numpy())
    avg_sox = np.average(raw['Sox2'].to_numpy())
    avg_zic = np.average(raw['Zic3'].to_numpy())
    labels = []
    for i in range(0, len(raw)):
        if raw['Gata2'][i] > avg_gata:
            labels.append('2c')
        elif raw['Sox2'][i] > avg_sox:
            labels.append('naive')
        elif raw['Zic3'][i] > avg_zic:
            labels.append('primed')
        else:
            labels.append('unknown')
    raw = raw.set_index(adata.obs_names)  # set_index returns a copy, so assign it back
    adata.obs.leiden = labels
    sc.pl.umap(adata, color='leiden')

    adata_2c = adata[adata.obs.leiden == '2c']
    adata_naive = adata[adata.obs.leiden == 'naive']
    adata_primed = adata[adata.obs.leiden == 'primed']
    raw_2c = pd.DataFrame(adata_2c.X, columns=adata_2c.var_names)
    raw_naive = pd.DataFrame(adata_naive.X, columns=adata_naive.var_names)
    raw_primed = pd.DataFrame(adata_primed.X, columns=adata_primed.var_names)
    raw_2c = raw_2c.transpose()
    raw_naive = raw_naive.transpose()
    raw_primed = raw_primed.transpose()
    raw_2c.to_csv(condition + '_2c.csv')
    raw_naive.to_csv(condition + '_naive.csv')
    raw_primed.to_csv(condition + '_primed.csv')
def read_counts_and_phases(count_or_rpkm, use_spike_ins, biotype_to_use, use_isoforms=False):
    '''
    Read data into scanpy; read phases and FACS intensities.
    - count_or_rpkm: must be "Counts" or "Tpms"
    '''
    read_file = (f"input/RNAData/{count_or_rpkm}{'_Isoforms' if use_isoforms else ''}.csv"
                 + (".ercc.csv" if use_spike_ins else ""))
    if biotype_to_use is not None and len(biotype_to_use) > 0:
        print(f"filtering for biotype: {biotype_to_use}")
        biotype_file = f"{read_file}.{biotype_to_use}.csv"
        if not os.path.exists(biotype_file):
            gene_info = pd.read_csv(
                f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
                index_col=False, header=None,
                names=["gene_id", "name", "biotype", "description"])
            biotyped = gene_info[gene_info["biotype"] == biotype_to_use]["gene_id"]
            pd.read_csv(read_file)[biotyped].to_csv(biotype_file, index=False)
        read_file = biotype_file

    adata = sc.read_csv(read_file)
    print(f"data shape: {adata.X.shape}")
    # adata.raw = adata

    phases = pd.read_csv("input/ProteinData/WellPlatePhasesLogNormIntensities.csv").sort_values(by="Well_Plate")

    # Assign phases and log intensities; require log intensity
    adata.obs["Well_Plate"] = np.array(phases["Well_Plate"])
    adata.obs["plate"] = np.array([wp.split("_")[1] for wp in adata.obs["Well_Plate"]])
    adata.obs["phase"] = np.array(phases["Stage"])
    adata.obs["Green530"] = np.array(phases["Green530"])
    adata.obs["Red585"] = np.array(phases["Red585"])
    adata = adata[pd.notnull(adata.obs["Green530"]) & pd.notnull(adata.obs["Red585"])]  # removes dark mitotic cells

    # Read in fucci pseudotime from previous analysis
    if os.path.isfile("output/fucci_time.csv"):
        adata.obs["fucci_time"] = np.array(pd.read_csv("output/fucci_time.csv")["fucci_time"])

    # Get info about the genes
    gene_info = pd.read_csv(
        f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
        header=None, names=["name", "biotype", "description"], index_col=0)
    adata.var["name"] = gene_info["name"]
    adata.var["biotype"] = gene_info["biotype"]
    adata.var["description"] = gene_info["description"]
    return adata, phases
def read_as_anndata(list_of_list: List[List[float]],
                    roundoff_decimal: int = 5,
                    filename: str = None) -> ad.AnnData:
    temp_folder: str = '__temp__'
    complete_file_path: str = os.path.join(temp_folder, filename)
    list_of_list = [[u.roundoff(value, roundoff_decimal) for value in row]
                    for row in list_of_list]
    u.create_path_if_not_exists(temp_folder)
    csv.writecsv(filename, list_of_list, directory=temp_folder)
    return sc.read_csv(complete_file_path)
def read_10x_data(input_file, format_type='10x_h5', backed=None):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == '10x_csv':
        adata = sc.read_csv(input_file)
    else:
        raise ValueError("`format_type` must be one of '10x_h5', '10x_mtx', '10x_h5ad' or '10x_csv'")
    adata.var_names_make_unique()
    return adata
def readData(self,countsFile=""): if countsFile=="": countsFile = self.CountsFile; if countsFile=="": print("please input counts file path"); return "" self.CountsFile=countsFile; datapath = self.CountsFile; if os.path.isdir(datapath): files = os.listdir(datapath) for i in files: if i.endswith(".gz"): print(i) target = datapath+"/*.gz"; print(target) command = subprocess.Popen("gunzip "+target, shell=True, stdin=PIPE, stdout=PIPE,stderr=STDOUT) output =command.stdout.read(); break; files=os.listdir(datapath); for i in files: if i =="features.tsv": os.rename(datapath+"/features.tsv",datapath+"/genes.tsv"); break; files = list(os.listdir(datapath)); if ('barcodes.tsv' in files) and ('barcodes.tsv' in files) and ("genes.tsv" in files): adata = sc.read_10x_mtx(datapath, var_names='gene_symbols'); self.data=adata; self.preprocess(); else: print("input data is not correct") return "" elif os.path.isfile(datapath): if datapath.endswith(".h5ad"): adata=sc.read_h5ad(datapath); else: adata = sc.read_csv(datapath) adata = adata.T; self.data=adata; #self.preprocess(); else: print("file or dir not exists") return ""
def normalizeTissue(file, dataDirectory, log):
    path = "%s/transpose/%s" % (dataDirectory, file)
    log.write(path + "\n")
    tissue_transpose = sc.read_csv(path, first_column_names=True)
    log.write("Gene count (pre-filter): %s\n" % len(tissue_transpose.var_names))
    sc.pp.log1p(tissue_transpose)
    sc.pp.highly_variable_genes(tissue_transpose, flavor='seurat')
    highly_variable = tissue_transpose.var['highly_variable']
    filter_result = highly_variable[highly_variable == True].keys()
    tissue_transpose = tissue_transpose[:, filter_result]
    log.write("Gene count (post-filter): %s\n" % len(tissue_transpose.var_names))
    sc.pp.normalize_per_cell(tissue_transpose, counts_per_cell_after=1)
    sc.pp.scale(tissue_transpose)
    tissue_norm = pd.DataFrame(data=tissue_transpose.X,
                               index=tissue_transpose.obs_names,
                               columns=tissue_transpose.var_names)
    tissue_norm.index.name = 'cell'
    normFile = addPostfix(file, 'norm')
    normPath = "%s/norm/%s" % (dataDirectory, normFile)
    tissue_norm.to_csv(normPath, index=True)
    return normFile
def main(args):
    # print(args)
    n_slices = int(len(args.filename) / 2)

    # Error check arguments
    if args.mode != 'pairwise' and args.mode != 'center':
        raise ValueError("Please select either 'pairwise' or 'center' mode.")
    if args.alpha < 0 or args.alpha > 1:
        raise ValueError("alpha specified outside [0, 1]")
    if args.initial_slice < 1 or args.initial_slice > n_slices:
        raise ValueError("Initial slice specified outside [1, n]")
    if len(args.lmbda) == 0:
        lmbda = n_slices * [1. / n_slices]
    elif len(args.lmbda) != n_slices:
        raise ValueError("Length of lambda does not equal number of files")
    else:
        if not all(i >= 0 for i in args.lmbda):
            raise ValueError("lambda includes negative weights")
        else:
            print("Normalizing lambda weights into probability vector.")
            lmbda = args.lmbda
            lmbda = [float(i) / sum(lmbda) for i in lmbda]

    # Create slices
    slices = []
    for i in range(n_slices):
        s = sc.read_csv(args.filename[2 * i])
        s.obsm['spatial'] = np.genfromtxt(args.filename[2 * i + 1], delimiter=',')
        slices.append(s)

    if len(args.weights) == 0:
        for i in range(n_slices):
            slices[i].obsm['weights'] = np.ones((slices[i].shape[0],)) / slices[i].shape[0]
    elif len(args.weights) != n_slices:
        raise ValueError("Number of slices {0} != number of weight files {1}".format(
            n_slices, len(args.weights)))
    else:
        for i in range(n_slices):
            slices[i].obsm['weights'] = np.genfromtxt(args.weights[i], delimiter=',')
            slices[i].obsm['weights'] = slices[i].obsm['weights'] / np.sum(slices[i].obsm['weights'])

    if len(args.start) == 0:
        pis_init = (n_slices - 1) * [None] if args.mode == 'pairwise' else None
    elif (args.mode == 'pairwise' and len(args.start) != n_slices - 1) or \
         (args.mode == 'center' and len(args.start) != n_slices):
        raise ValueError("Number of slices {0} != number of start pi files {1}".format(
            n_slices, len(args.start)))
    else:
        pis_init = [pd.read_csv(args.start[i], index_col=0).to_numpy()
                    for i in range(len(args.start))]

    # Create output folder
    output_path = os.path.join(args.direc, "paste_output")
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    if args.mode == 'pairwise':
        print("Computing pairwise alignment.")
        # Compute pairwise alignment
        pis = []
        for i in range(n_slices - 1):
            pi = pairwise_align(slices[i], slices[i + 1], args.alpha,
                                dissimilarity=args.cost,
                                a_distribution=slices[i].obsm['weights'],
                                b_distribution=slices[i + 1].obsm['weights'],
                                G_init=pis_init[i])
            pis.append(pi)
            pi = pd.DataFrame(pi, index=slices[i].obs.index, columns=slices[i + 1].obs.index)
            output_filename = "paste_output/slice" + str(i + 1) + "_slice" + str(i + 2) + "_pairwise.csv"
            pi.to_csv(os.path.join(args.direc, output_filename))
        if args.coordinates:
            new_slices = stack_slices_pairwise(slices, pis)
            for i in range(n_slices):
                output_filename = "paste_output/slice" + str(i + 1) + "_new_coordinates.csv"
                np.savetxt(os.path.join(args.direc, output_filename),
                           new_slices[i].obsm['spatial'], delimiter=",")
    elif args.mode == 'center':
        print("Computing center alignment.")
        initial_slice = slices[args.initial_slice - 1].copy()
        # Compute center alignment
        center_slice, pis = center_align(
            initial_slice, slices, lmbda, args.alpha, args.n_components, args.threshold,
            dissimilarity=args.cost,
            distributions=[slices[i].obsm['weights'] for i in range(n_slices)],
            pis_init=pis_init)
        W = pd.DataFrame(center_slice.uns['paste_W'], index=center_slice.obs.index)
        H = pd.DataFrame(center_slice.uns['paste_H'], columns=center_slice.var.index)
        W.to_csv(os.path.join(args.direc, "paste_output/W_center"))
        H.to_csv(os.path.join(args.direc, "paste_output/H_center"))
        for i in range(len(pis)):
            output_filename = "paste_output/slice_center_slice" + str(i + 1) + "_pairwise.csv"
            pi = pd.DataFrame(pis[i], index=center_slice.obs.index, columns=slices[i].obs.index)
            pi.to_csv(os.path.join(args.direc, output_filename))
        if args.coordinates:
            center, new_slices = stack_slices_center(center_slice, slices, pis)
            for i in range(n_slices):
                output_filename = "paste_output/slice" + str(i + 1) + "_new_coordinates.csv"
                np.savetxt(os.path.join(args.direc, output_filename),
                           new_slices[i].obsm['spatial'], delimiter=",")
            np.savetxt(os.path.join(args.direc, "paste_output/center_new_coordinates.csv"),
                       center.obsm['spatial'], delimiter=",")
    return
os.makedirs(sys.argv[1])  # create the output directory
os.chdir(sys.argv[1])
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
results_file = './write/organ9_concatenate.h5ad'  # the file that will store the analysis results
sc.settings.autosave = True  # save figures, do not show them
sc.settings.set_figure_params(dpi=300, frameon=False)  # low dpi (dots per inch) yields small inline figures

### input gene expression matrix of each organ
# Kidney
data1 = sc.read_csv('Kidney_rawcount.txt', delimiter='\t', first_column_names=None, dtype='float32')
adata1 = data1.T
adata1
adata1.X.shape
# Liver
data2 = sc.read_csv('Liver_rawcount.txt', delimiter='\t', first_column_names=None, dtype='float32')
adata2 = data2.T
adata2
adata2.X.shape
# Lung
data3 = sc.read_csv('Lung_rawcount.txt', delimiter='\t',
celltype_x = celltype_x.values
print(celltype_x)
seurat_celltype_path = base_path + 'dann_vae/atac/seurat_pred_type.csv'
celltype_seurat = pd.read_csv(seurat_celltype_path, index_col=0)
celltype_seurat = list(celltype_seurat.values.flatten())
print(celltype_seurat)

encoder = LabelEncoder()
orig_label = encoder.fit_transform(celltype_x)
orig_label.dtype = 'int64'

batch_size = 100
epochs = 25
adata1 = sc.read_csv(file1)
adata2 = sc.read_csv(file2)
adata_davae = sc.read_h5ad(davae_path)
data = adata_davae.X
# data = adata_davae.obsm['davae']
len1 = adata1.shape[0]
len2 = adata2.shape[0]
test_set = data[0:len1, ]
train_set = data[len1:len1 + len2, ]
label = to_categorical(orig_label)
class_num = label.shape[1]
net_x = CLASSIFIER(input_size=train_set.shape[1], class_num=class_num)
@author: antho
"""
import os
import pandas as pd
import numpy as np
import scvelo as scv
import scanpy as sc
import shutil
import matplotlib.pyplot as plt

# Make PDF text readable
plt.rcParams['pdf.fonttype'], plt.rcParams['ps.fonttype'], plt.rcParams['savefig.dpi'] = 42, 42, 300
plt.rcParams['figure.figsize'] = (10, 10)

adata = sc.read_csv("input/RNAData/Tpms.csv.protein_coding.csv")
adata.obs_names = pd.read_csv("input/RNAData/Tpms.obs_names.csv")["well_plate"]
phases = pd.read_csv("input/ProteinData/WellPlatePhasesLogNormIntensities.csv").sort_values(by="Well_Plate")

# Assign phases and log intensities; require log intensity
adata.obs["phase"] = np.array(phases["Stage"])
adata.obs["Green530"] = np.array(phases["Green530"])
adata.obs["Red585"] = np.array(phases["Red585"])
adata = adata[pd.notnull(adata.obs["Green530"]) & pd.notnull(adata.obs["Red585"])]  # removes dark mitotic cells
adata.obs["fucci_time"] = np.array(pd.read_csv("output/fucci_time.csv")["fucci_time"])

# Get info about the genes
gene_info = pd.read_csv("input/RNAData/IdsToNames.csv.gz",
    if len(np.unique(adata.var.index)) < len(adata.var.index) and args.make_var_index_unique:
        adata.var_names_make_unique()
        print("Making AnnData var index unique...")
    # Sort var index
    adata = adata[:, np.sort(adata.var.index)]
    print("Writing 10x data to h5ad...")
    adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))
elif INPUT_FORMAT in ['tsv', 'csv'] and OUTPUT_FORMAT == 'h5ad':
    if INPUT_FORMAT == 'tsv':
        delim = '\t'
    elif INPUT_FORMAT == 'csv':
        delim = ','
    # Expects csv/tsv to have features as rows and observations as columns
    adata = sc.read_csv(FILE_PATH_IN, delimiter=delim, first_column_names=True).T
    # Convert to sparse matrix
    adata.X = csr_matrix(adata.X)
    adata = add_sample_id(adata=adata, args=args)
    # If tag_cell_with_sample_id is given, add the sample ID as suffix
    if args.tag_cell_with_sample_id:
        adata = tag_cell(adata=adata, tag=args.sample_id,
                         remove_10x_gem_well=args.remove_10x_gem_well)
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. "
            "To do so update the following param 'makeVarIndexUnique = true' "
            "(under params.sc.sc_file_converter) in your config."
        )
import argparse
import scanpy as sc
import numpy as np

sc.settings.autosave = True

parser = argparse.ArgumentParser()
parser.add_argument('-i', dest='input', help='counts csv file')
args = parser.parse_args()

# count_csv = 'rsc/tasic_scRNAseq/full_scRNAseq/GSE71585_RefSeq_counts.csv'
count_csv = args.input

# read in counts csv
adata = sc.read_csv(count_csv, delimiter=',', first_column_names=True, dtype='float32')

# need to transpose
tdata = sc.AnnData.transpose(adata)
print(tdata)

# Basic pre-processing
# filter out cells that have fewer than 200 genes expressed
sc.pp.filter_cells(tdata, min_genes=200)
print(tdata.obs['n_genes'].min())
# filter out genes expressed in fewer than 3 cells
sc.pp.filter_genes(tdata, min_cells=3)
print(tdata.var['n_cells'].min())
# %%
import os
import scanpy as sc
from scipy import sparse

# %%
adataD0 = sc.read_csv('./data/Klein/GSM1599494_ES_d0_main.csv.bz2')
adataD2 = sc.read_csv('./data/Klein/GSM1599497_ES_d2_LIFminus.csv.bz2')
adataD4 = sc.read_csv('./data/Klein/GSM1599498_ES_d4_LIFminus.csv.bz2')
adataD7 = sc.read_csv('./data/Klein/GSM1599499_ES_d7_LIFminus.csv.bz2')

# %%
adata = sc.AnnData.concatenate(adataD0.T, adataD2.T, adataD4.T, adataD7.T,
                               batch_key='cluster',
                               batch_categories=['d0', 'd2', 'd4', 'd7'])
adata.X = sparse.csr_matrix(adata.X)

# %%
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
adata = adata[adata.obs.total_counts < 75000, :]
# sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
# sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], jitter=False, multi_panel=True)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.raw = adata
# Load up the Whitfield dataset & incorporate ideal vector data from Figure 1 into the metadata
whit1 = ['whitfield_dataPlusScores_6_30_2020_', '_1134.csv', '1134']
mod1 = 'quantile'
experiments = {
    'TT1': [0, 12],
    'TT2': [12, 38],
    'TT3': [38, 86],
    'TN': [86, 105],
    'SHAKE': [105, 114]
}
whitfield = {}
print(whit1, mod1)
for exp1 in experiments:
    whitfield[exp1] = sc.read_csv('data/Whitfield/data/' + whit1[0] + exp1 + whit1[1],
                                  first_column_names=True).T
    var_names = [str(g2e.loc[float(i), 'Gene stable ID'])
                 for i in whitfield[exp1].var_names if float(i) in g2e.index]
    whitfield[exp1] = whitfield[exp1][:, [True if float(i) in g2e.index else False
                                          for i in whitfield[exp1].var_names]]
    whitfield[exp1].var_names = pd.Index(var_names)
    if mod1 == 'quantile':
        tmp = whitfield[exp1].X
        whitfield[exp1].X = quantile_transform(tmp, axis=1)
    whitfield[exp1].var_names = [
        i.rstrip('.0') for i in whitfield[exp1].var_names
import sys
import numpy as np
import pandas as pd
import scanpy as sc  # 1.4.3
import anndata
import bbknn  # 1.3.4

#####################
print('Start')
fi = open(BATCH)
batch = []
for line in fi:
    seq = line.rstrip().split(',')
    batch = batch + seq
fi.close()
batch = batch[1:]

used_pca = sc.read_csv(PCA)
adata = anndata.AnnData(X=used_pca.X, obs=batch)
PCNUM = used_pca.X.shape[1]
sc.tl.pca(adata, n_comps=PCNUM)
adata.obsm['X_pca'] = used_pca.X
bbknn.bbknn(adata, batch_key=0, neighbors_within_batch=NB, n_pcs=PCNUM, n_trees=NT)
sc.tl.umap(adata)
umap = adata.obsm['X_umap']

fo = open(OUTPUT, 'w')
for one in umap:
    fo.write(str(one[0]) + '\t' + str(one[1]) + '\n')
from fnmatch import fnmatch

# Get all evaluation files:
root = '../data/KptnMouse/RNAscope'
pattern = "Objects_Population - Nuclei.txt"
allFiles = []
slideNames = []
for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            allFiles.append(os.path.join(path, name))
            slideNames.append(str.split(allFiles[-1], '/')[4])

slide = 0
# Import data:
kptn_data_all = sc.read_csv(allFiles[slide], sep='\t', skiprows=8, header=1)
kptn_data = np.asarray(kptn_data_all[[
    'Position X [µm]', 'Position Y [µm]',
    'Nuclei - Intensity Nucleus Alexa 568 Mean',
    'Nuclei - Intensity Nucleus Atto 490LS Mean',
    'Nuclei - Intensity Nucleus Alexa 488 Mean',
    'Nuclei - Intensity Nucleus Alexa 647 Mean',
    'Nuclei - Intensity Nucleus Atto 425 Mean'
]])
channelOrder = ('568', '490LS', '488', '647', '425')
celltypeOrder = ('Astrocyte', 'Oligodendrocyte', 'GABAergicNeuron', 'OPC', 'Neuron')

# Filter out 1% smallest and 5% of largest nuclei as segmentation errors:
volumes = np.asarray(kptn_data_all['Nuclei - Nucleus Volume [µm³]'])
def upload(pathname):
    import anndata
    filename, file_extension = os.path.splitext(pathname)
    if file_extension == ".mat":
        x = loadmat(pathname)
        keys = []
        for key in x.keys():
            keys.append(key)
        # obs is the cell; var is the gene
        # pick the largest array as the expression matrix
        largest = 3
        largest_size = 0
        for i in range(len(keys) - 3):
            if len(x[keys[i + 3]].shape) == 2:
                size = x[keys[i + 3]].shape[0] * x[keys[i + 3]].shape[1]
            else:
                size = x[keys[i + 3]].shape[0]
            if size >= largest_size:
                largest = i + 3
                largest_size = size
        obs_d, var_d = {}, {}
        for i in range(len(keys) - 3):
            if i != largest - 3:
                if (x[keys[i + 3]].flatten()).shape[0] == (x[keys[largest]]).shape[0]:
                    obs_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                elif (x[keys[i + 3]].flatten()).shape[0] == (x[keys[largest]]).shape[1]:
                    var_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                # else: skip arrays matching neither dimension
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)
        data = anndata.AnnData(X=x[keys[largest]].todense(),
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".npz":
        x = np.load(pathname)
        # pick the largest array in the archive
        largest = 0
        largest_size = 0
        for i in range(len(x.files)):
            if len(x[x.files[i]].shape) == 2:
                size = x[x.files[i]].shape[0] * x[x.files[i]].shape[1]
            else:
                size = x[x.files[i]].shape[0]
            if size >= largest_size:
                largest = i
                largest_size = size
        obs_d, var_d = {}, {}
        for i in range(len(x.files)):
            if i != largest:
                if len(x[x.files[i]].flatten()) == len(x[x.files[largest]]):
                    obs_d[x.files[i]] = x[x.files[i]].flatten()
                elif len(x[x.files[i]].flatten()) == len(x[x.files[largest]][0]):
                    var_d[x.files[i]] = x[x.files[i]].flatten()
                # else: skip arrays matching neither dimension
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)
        data = anndata.AnnData(X=x[x.files[largest]],
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".mtx":
        data = sc.read_10x_mtx(os.path.dirname(pathname))
        data.X = data.X.todense()
    elif file_extension == ".csv":
        data = sc.read_csv(pathname)
    elif file_extension == ".xlsx":
        data = sc.read_excel(pathname)
    elif file_extension == ".txt":
        data = sc.read_text(pathname)
    else:
        data = sc.read(pathname)
    print(pathname, " uploaded !")
    return data
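# A minimal usage sketch for upload() above; the path is hypothetical and the
# os, numpy as np, pandas as pd, scanpy as sc and loadmat imports are assumed.
data = upload('expression_matrix.csv')
print(data)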
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import os

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
results_folder = 'write'

adata = sc.read_csv("dataset_cleaned.csv", first_column_names=True)

nn_number_list = [10, 12]  # a list of numbers of nearest neighbors to consider for clustering
resolution_list = [0.001, 0.010, 0.015, 0.02]  # a list of resolutions to consider for cluster annotation


def main():
    # common pre-processing steps
    sc.pp.scale(adata)
    sc.tl.pca(adata, svd_solver='auto')
    # the next steps vary per hyper-parameter
    for n_neighbors in nn_number_list:
        # clustering
        nn_key_added = str(n_neighbors) + '_nn'
        umap_obsm_key = 'X_umap_' + nn_key_added
        sc.pp.neighbors(adata, method='umap', n_neighbors=n_neighbors, n_pcs=20,
adata_spatial_posterior = sc.datasets.visium_sge(sample_id="V1_Mouse_Brain_Sagittal_Posterior")

# Normalize and log1p
for adata in [adata_spatial_anterior, adata_spatial_posterior]:
    sc.pp.normalize_total(adata, inplace=True)
    # sc.pp.log1p(adata)
    # sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True)

##################
# scRNA-seq data GSE115746
adata_cortex = sc.read_csv('../data/GSE115746_cells_exon_counts.csv').T
adata_cortex_meta = pd.read_csv('../data/GSE115746_complete_metadata_28706-cells.csv', index_col=0)
adata_cortex_meta_ = adata_cortex_meta.loc[adata_cortex.obs.index, ]
adata_cortex.obs = adata_cortex_meta_
adata_cortex.var_names_make_unique()
adata_cortex.var['mt'] = adata_cortex.var_names.str.startswith('Mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_cortex, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
#!/usr/bin/env python3
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import scvelo as scv
import csv

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)

## read in counts csv
adata = sc.read_csv('GSE71585_RefSeq_counts.csv', delimiter=',', first_column_names=True, dtype='float32')

## check dims
print(adata)

## need to transpose
tdata = sc.AnnData.transpose(adata)
print(tdata)

# Basic pre-processing
sc.pp.filter_cells(tdata, min_genes=200)
sc.pp.filter_genes(tdata, min_cells=3)

## check dims
print(tdata)

mito_genes = tdata.var_names.str.startswith('mt-')
# for each cell compute fraction of counts in mito genes vs. all genes
tdata.obs['percent_mito'] = np.sum(
import mmap
import glob
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"
matplotlib.rcParams['font.size'] = 14

# read the data
np.random.seed(1)
import scanpy as sc
wt1 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/WT_EM_1.csv", first_column_names=True)
wt2 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/WT_EM_2.csv", first_column_names=True)
wt1.var['batch'] = "wt1"
wt2.var['batch'] = "wt2"
wt1 = wt1.T
wt2 = wt2.T
dkd1 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/DKD_EM_1.csv", first_column_names=True)
dkd2 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/DKD_EM_2.csv", first_column_names=True)
dkd1.var['batch'] = "dkd1"
dkd2.var['batch'] = "dkd2"
dkd1 = dkd1.T
dkd2 = dkd2.T
import numpy, pandas, datetime
import matplotlib, matplotlib.pyplot
import scanpy
scanpy.settings.verbosity = 5

# # 1. Reading data

# In[2]:

print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
idata = scanpy.read_csv('/Volumes/omics4tb2/alomana/projects/mscni/data/scanpy/count.file.all.day.clean.csv')
adata = idata.transpose()
print(adata)
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# # 2. Preprocessing

# In[3]:

scanpy.pl.highest_expr_genes(adata, n_top=20)

# In[4]:
cluster_count = 2
base_path = '/Users/zhongyuanke/data/'
orig_path = 'pbmc/zheng/mcl_pre.h5ad'
desc_path = 'desc/desc_jurkat.h5ad'
davae_path = 'dann_vae/pbmc/293t_save04_label.h5ad'
seurat_path = 'seurat_result/mcl.h5ad'
scan_path = 'scanorama/scan_mcl.h5ad'
scgen_path = 'scgen/scgen_mcl.h5ad'
harmony_path = 'harmony_result/mcl.csv'

adata_davae = sc.read_h5ad(base_path + davae_path)
adata_scan = sc.read_h5ad(base_path + scan_path)
adata_orig = sc.read_h5ad(base_path + orig_path)
adata_seurat = sc.read_h5ad(base_path + seurat_path)
adata_scgen = sc.read_h5ad(base_path + scgen_path)
adata_harmony = sc.read_csv(base_path + harmony_path)
adata_desc = sc.read_h5ad(base_path + desc_path)

sc.pp.neighbors(adata_seurat, use_rep='X_pca')
sc.tl.umap(adata_seurat)
# print(adata_scgen)
# sc.pp.neighbors(adata_orig)
# sc.tl.umap(adata_orig)
# sc.pp.neighbors(adata_davae)
# sc.tl.umap(adata_davae)
# sc.pp.neighbors(adata_scan, use_rep='X_scanorama')
# sc.tl.umap(adata_scan)
# sc.pp.neighbors(adata_seurat)
# sc.tl.umap(adata_seurat)
# sc.pp.neighbors(adata_scgen, use_rep='corrected_latent')
# sc.tl.umap(adata_scgen)
def read_counts_and_phases(count_or_rpkm, use_spike_ins, biotype_to_use, u_plates,
                           use_isoforms=False, load_velocities=False):
    '''
    Read data into scanpy; read phases and FACS intensities.
    - count_or_rpkm: must be "Counts" or "Tpms"
    '''
    read_file = (f"input/RNAData/{count_or_rpkm}{'_Isoforms' if use_isoforms else ''}.csv"
                 + (".ercc.csv" if use_spike_ins else ""))
    if biotype_to_use is not None and len(biotype_to_use) > 0:
        print(f"filtering for biotype: {biotype_to_use}")
        biotype_file = f"{read_file}.{biotype_to_use}.csv"
        if not os.path.exists(biotype_file):
            gene_info = pd.read_csv(
                f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
                index_col=False, header=None,
                names=["gene_id", "name", "biotype", "description"])
            biotyped = gene_info[gene_info["biotype"] == biotype_to_use]["gene_id"]
            pd.read_csv(read_file)[biotyped].to_csv(biotype_file, index=False)
        read_file = biotype_file

    adata = sc.read_csv(read_file)
    print(f"data shape: {adata.X.shape}")

    if load_velocities:
        adata.obs_names = pd.read_csv("input/RNAData/Tpms.obs_names.csv")["well_plate"]

    intensities, phases = [], []
    for plate in u_plates:
        file = f"input/RNAData/180911_Fucci_single cell seq_ss2-18-{plate}_index sort export.csv"
        plateIntensities = pd.read_csv(file, skiprows=2)
        newColumns = list(plateIntensities.columns)
        newColumns[5] = "MeanGreen530"
        newColumns[6] = "MeanRed585"
        plateIntensities.columns = newColumns
        plateIntensities["Plate"] = [plate] * len(plateIntensities)
        plateIntensities["Well_Plate"] = [f"{w}_{plate}" for w in plateIntensities["Well"]]
        intensitiesSubFrame = plateIntensities[plateIntensities["Population"] == "All Events"]
        if len(intensities) == 0:
            intensities = intensitiesSubFrame
        else:
            intensities = intensities.append(intensitiesSubFrame, ignore_index=True)
        isPhaseRow = ~plateIntensities["Population"].isin(["All Events", "Cells", "Singlets"])
        phasesSubFrame = plateIntensities[isPhaseRow & (plateIntensities["% Total"] == "100.00%")]
        if len(phases) == 0:
            phases = phasesSubFrame
        else:
            phases = phases.append(phasesSubFrame, ignore_index=True)
    wp_idx = list(phases.columns).index("Well_Plate")
    pop_idx = list(phases.columns).index("Population")
    phases_lookup = dict([(row[1][wp_idx], row[1][pop_idx]) for row in phases.iterrows()])

    # Assign phases and log intensities; require log intensity
    intensities = intensities.sort_values(by="Well_Plate")
    adata.obs["Well_Plate"] = np.array(intensities["Well_Plate"])
    adata.obs["plate"] = np.array(intensities["Plate"])
    adata.obs["phase"] = np.array([phases_lookup[wp] if wp in phases_lookup else "N/A"
                                   for wp in intensities["Well_Plate"]])
    adata.obs["MeanGreen530"] = np.array(intensities["MeanGreen530"])
    adata.obs["MeanRed585"] = np.array(intensities["MeanRed585"])
    adata = adata[pd.notnull(adata.obs["MeanGreen530"]) & pd.notnull(adata.obs["MeanRed585"])]  # removes 6 dark, likely mitotic cells

    # Read in fucci pseudotime from previous analysis
    if os.path.isfile("output/fucci_time.csv"):
        adata.obs["fucci_time"] = np.array(pd.read_csv("output/fucci_time.csv")["fucci_time"])

    # Get info about the genes
    gene_info = pd.read_csv(
        f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
        header=None, names=["name", "biotype", "description"], index_col=0)
    adata.var["name"] = gene_info["name"]
    adata.var["biotype"] = gene_info["biotype"]
    adata.var["description"] = gene_info["description"]

    if load_velocities:
        ldata = scv.read("input/RNAData/a.loom", cache=True)
        ldata.obs_names = pd.read_csv("input/RNAData/a.obs_names.csv")["well_plate"]
        ldata.var["GeneName"] = ldata.var_names
        ldata.var_names = ldata.var["Accession"]
        adata = scv.utils.merge(adata, ldata, copy=True)
    return adata, phases
#!/usr/bin/env python
##################################################
# File Name: test.py
# Author: Rui
# mail: [email protected]
# Created Time: Thu 11 Jul 2019 11:49:02 AM EDT
################################################
import scanpy as sc
import numpy as np
import giniclust3 as gc
import anndata

#### Load and filter dataset ####
adataRaw = sc.read_csv("./data/GSM1599495_ES_d0_biorep_techrep1.csv", first_column_names=True)
sc.pp.filter_cells(adataRaw, min_genes=3)    ##### remove genes expressed in fewer than N cells
sc.pp.filter_genes(adataRaw, min_cells=200)  ##### remove cells expressing fewer than M genes
adataSC = anndata.AnnData(X=adataRaw.X.T, obs=adataRaw.var, var=adataRaw.obs)
sc.pp.normalize_per_cell(adataSC, counts_per_cell_after=1e4)

#### GiniIndexClust and FanoFactorClust ####
gc.gini.calGini(adataSC)
adataGini = gc.gini.clusterGini(adataSC, neighbors=3)
gc.fano.calFano(adataSC)
adataFano = gc.fano.clusterFano(adataSC)

#### ConsensusClust ####