def process_image(tiff_dir, mask_file, d):
    """Aggregate an image data set, normalise it and write it under ``d``.

    The dictionary returned by ``aggregate_image(tiff_dir, mask_file)`` is
    split into positions (``'X'``) and expression (``'exp'``). Expression is
    passed through ``filter_out``, cells whose total count is 3 or lower are
    dropped, and the remaining expression is stabilised and has
    ``np.log(total_count)`` regressed out with NaiveDE.

    Parameters
    ----------
    tiff_dir, mask_file:
        Forwarded unchanged to ``aggregate_image``.
    d:
        Output location prefix; ``'/expressions.txt'`` and
        ``'/positions.txt'`` are appended to it.

    Returns
    -------
    None. Two files are written: the residual expression (space separated,
    with header) and the positions (comma separated, no header).
    """
    # Read and aggregate data
    data = aggregate_image(tiff_dir, mask_file)
    X, exp = data['X'], data['exp']
    exp = filter_out(exp)

    # Get total counts per cell
    tot = pd.DataFrame(exp.sum(1))
    tot.columns = ['total_count']

    # Keep only cells whose total count is strictly greater than 3.
    # The mask is built once, as a 1-D positional array, so the three tables
    # are filtered identically without relying on 2-D boolean indexing.
    keep = tot['total_count'].values > 3
    X = X[keep]
    exp = exp[keep]
    tot = tot[keep]

    # Convert data to log-scale, and account for depth
    dfm = NaiveDE.stabilize(exp.T).T
    res = NaiveDE.regress_out(tot, dfm.T, 'np.log(total_count)').T

    res.to_csv(d+'/expressions.txt', sep=' ', header=True, index=False)
    X.to_csv(d+'/positions.txt', sep=',', header=False, index=False)
def main():
    """Run SpatialDE and its model search on the Layer2 BC count matrix.

    Reads ``data/Layer2_BC_count_matrix-1.tsv``, drops genes with a summed
    count below 3 and samples with a total count of 5 or less, normalises
    with NaiveDE and writes the sample info, the SpatialDE results and the
    model-search results to CSV files in the working directory.

    Returns
    -------
    The table returned by ``SpatialDE.run``.
    """
    counts = pd.read_table('data/Layer2_BC_count_matrix-1.tsv', index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    sample_info = get_coords(counts.index)
    sample_info['total_counts'] = counts.sum(1)
    # Remove empty features
    sample_info = sample_info.query('total_counts > 5')
    counts = counts.loc[sample_info.index]

    coords = sample_info[['x', 'y']]

    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_counts)').T

    # Add total_count as pseudogene for reference
    residuals['log_total_count'] = np.log(sample_info['total_counts'])

    results = SpatialDE.run(coords, residuals)

    sample_info.to_csv('BC_sample_info.csv')
    results.to_csv('BC_final_results.csv')

    significant = results[results.qval < 0.05].copy()
    model_search = SpatialDE.model_search(coords, residuals, significant)
    model_search.to_csv('BC_MS_results.csv')

    return results
def process_mer_fish(exp_file, annotation_file, d):
    """Aggregate an expression table, normalise it and write it under ``d``.

    Parameters
    ----------
    exp_file:
        CSV file read with its first column as index.
    annotation_file:
        CSV file whose ``'gene'`` column is passed to ``aggregate_data``.
    d:
        Output location prefix; ``'/expressions.txt'`` and
        ``'/positions.txt'`` are appended to it.

    Returns
    -------
    None. The residual expression (with a ``log_total_count`` column) and
    the positions are written to disk.
    """
    # Read data
    raw = pd.read_csv(exp_file, index_col=0)
    genes = pd.read_csv(annotation_file)['gene']

    # Aggregate data
    aggregated = aggregate_data(raw, genes)
    X = aggregated['X']
    exp = aggregated['exp']

    # Filter practically unobserved genes
    observed = exp.sum(0) >= 3
    exp = exp.T[observed].T

    # Get total counts per cell
    tot = pd.DataFrame(exp.sum(1))
    tot.columns = ['total_count']

    # Convert data to log-scale, and account for depth
    stabilized = NaiveDE.stabilize(exp.T).T
    res = NaiveDE.regress_out(tot, stabilized.T, 'np.log(total_count)').T

    # Add total_count as pseudogene for reference
    res['log_total_count'] = np.log(tot['total_count'])

    expression_path = d+'/expressions.txt'
    positions_path = d+'/positions.txt'
    res.to_csv(expression_path, sep=' ', header=True, index=False)
    X.to_csv(positions_path, sep=',', header=False, index=False)
def main():
    """Run SpatialDE over the ``hpf`` column of the frog sample table.

    Expression is read from ``data/GSE65785_clutchApolyA_relative_TPM.csv``
    with samples as columns, restricted to the samples listed in
    ``Frog_sample_info.csv``, filtered to genes whose row sum is at least 3,
    stabilised, and has ``np.log(ERCC) + np.log(num_genes)`` regressed out.
    The SpatialDE results and model-search results are written to CSV.

    Returns
    -------
    The table returned by ``SpatialDE.run``.
    """
    # Get time points for each sample
    sample_info = pd.read_csv('Frog_sample_info.csv', index_col=0)

    # Load expression, keeping only the annotated samples (columns)
    expression = pd.read_csv(
        'data/GSE65785_clutchApolyA_relative_TPM.csv', index_col=0)
    expression = expression[sample_info.index]

    # Filter practically unobserved genes
    gene_totals = expression.sum(1)
    expression = expression[gene_totals >= 3]

    timepoints = sample_info[['hpf']]

    # Convert expression data to log scale, with genes in columns
    stabilized = NaiveDE.stabilize(expression)
    residuals = NaiveDE.regress_out(
        sample_info,
        stabilized,
        'np.log(ERCC) + np.log(num_genes)',
        rcond=1e-4,
    ).T

    # Add technical factors as pseudogenes for reference
    residuals['log_num_genes'] = np.log(sample_info['num_genes'])
    residuals['log_ERCC'] = np.log(sample_info['ERCC'])

    # Perform Spatial DE test with default settings
    results = SpatialDE.run(timepoints, residuals)

    # Save results in files for interactive plotting and interpretation
    results.to_csv('Frog_final_results.csv')

    significant = results[results.qval < 0.05].copy()
    model_search = SpatialDE.model_search(timepoints, residuals, significant)
    model_search.to_csv('Frog_MS_results.csv')

    return results
def main():
    """Run SpatialDE and its model search on ``exp_mat_43.csv``.

    Column labels of the expression table are converted to ``int`` and the
    table is restricted to the columns listed in the index of
    ``sample_info_43.csv``. After NaiveDE normalisation the SpatialDE
    results and model-search results are written to CSV.

    Returns
    -------
    The table returned by ``SpatialDE.run``.
    """
    expression = pd.read_csv('exp_mat_43.csv', index_col=0)
    expression.columns = expression.columns.map(int)

    # Get coordinates for each sample
    sample_info = pd.read_csv('sample_info_43.csv', index_col=0)
    expression = expression[sample_info.index]

    coords = sample_info[['x', 'y']]

    # Convert data to log-scale, and account for depth
    # NOTE(review): samples are columns of the table here, so stabilize is
    # given its transpose -- confirm this orientation is intended.
    stabilized = NaiveDE.stabilize(expression.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized, 'np.log(total_count)').T

    # Add total_count as pseudogene for reference
    residuals['log_total_count'] = np.log(sample_info['total_count'])

    # Perform Spatial DE test with default settings
    results = SpatialDE.run(coords, residuals)

    # Save results in files for interactive plotting and interpretation
    results.to_csv('final_results_43.csv')

    significant = results[results.qval < 0.05].copy()
    model_search = SpatialDE.model_search(coords, residuals, significant)
    model_search.to_csv('MS_results_43.csv')

    return results
def main():
    """Run SpatialDE and its model search on ``10t.csv``.

    Genes with a summed count below 3 and samples with a total count of 10
    or less are removed. Coordinates come from ``get_coords`` applied to the
    row labels. Sample info, SpatialDE results and model-search results are
    written to ``10t_*.csv`` files.

    Returns
    -------
    The table returned by ``SpatialDE.run``.
    """
    counts = pd.read_csv('10t.csv', index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    # Get coordinates for each sample
    sample_info = get_coords(counts.index)
    sample_info['total_counts'] = counts.sum(1)
    # Remove empty features
    sample_info = sample_info.query('total_counts > 10')
    counts = counts.loc[sample_info.index]

    coords = sample_info[['x', 'y']]

    # Convert data to log-scale, and account for depth
    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_counts)').T

    # Add total_count as pseudogene for reference
    residuals['log_total_count'] = np.log(sample_info['total_counts'])

    # Perform Spatial DE test with default settings
    results = SpatialDE.run(coords, residuals)

    # Save results and annotation for interactive plotting and interpretation
    sample_info.to_csv('10t_sample_info.csv')
    results.to_csv('10t_final_results.csv')

    significant = results[results.qval < 0.05].copy()
    model_search = SpatialDE.model_search(coords, residuals, significant)
    model_search.to_csv('10t_MS_results.csv')

    return results
def main():
    """Run SpatialDE on the rep6 "middle" data with ``pi0`` fixed to 1.

    Genes with a summed count below 3 are dropped and rows are restricted to
    the samples in ``data/rep6/middle_sample_info.csv``. After the SpatialDE
    test the ``qval`` column is recomputed from ``pval`` with ``pi0=1.0``.
    Sample info, results and model-search results are written to
    ``middle_*.csv`` files.

    Returns
    -------
    The results table, including the recomputed ``qval`` column.
    """
    counts = pd.read_csv('data/rep6/middle_exp_mat.csv', index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    # Get coordinates for each sample
    sample_info = pd.read_csv('data/rep6/middle_sample_info.csv', index_col=0)
    counts = counts.loc[sample_info.index]

    coords = sample_info[['abs_X', 'abs_Y']]

    # Convert data to log-scale, and account for depth
    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_count)').T

    # Add total_count as pseudogene for reference
    residuals['log_total_count'] = np.log(sample_info['total_count'])

    # Perform Spatial DE test with default settings
    results = SpatialDE.run(coords, residuals)

    # Assign pi_0 = 1 in multiple testing correction
    results['qval'] = SpatialDE.util.qvalue(results['pval'], pi0=1.0)

    # Save results and annotation for interactive plotting and interpretation
    sample_info.to_csv('middle_sample_info.csv')
    results.to_csv('middle_final_results.csv')

    significant = results[results.qval < 0.05].copy()
    model_search = SpatialDE.model_search(coords, residuals, significant)
    model_search.to_csv('middle_MS_results.csv')

    return results
def main(expression_csv, coordinate_csv, results_csv, model_selection_csv):
    ''' Perform the SpatialDE test on data in input files.

    <expression csv> : A CSV file with expression values. Columns are genes,
    and rows are samples.

    <coordinates csv> : A CSV file with sample coordinates. Each row is a
    sample, and the columns with coordinates must be named 'x' and 'y'. For
    other formats (e.g. 1d or 3d queries), it is recommended to write a
    custom Python script to do the analysis.

    <output file> : P-values and other relevant values for each gene will be
    stored in this file, in CSV format.

    <model selection csv> : If given (truthy), model search is run on genes
    with qval < 0.05 and its results are stored in this file, in CSV format.

    Returns the results table alone when no model selection file is given,
    otherwise the tuple (results, model search results).
    '''
    counts = pd.read_csv(expression_csv, index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    sample_info = pd.read_csv(coordinate_csv, index_col=0)
    sample_info['total_counts'] = counts.sum(1)
    # Remove empty features
    sample_info = sample_info.query('total_counts > 5')
    counts = counts.loc[sample_info.index]

    coords = sample_info[['x', 'y']]

    # Convert data to log-scale, and account for depth
    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_counts)').T

    # Perform Spatial DE test with default settings
    results = SpatialDE.run(coords, residuals)

    # Save results in a file for interactive plotting and interpretation
    results.to_csv(results_csv)

    if model_selection_csv:
        significant = results[results.qval < 0.05].copy()
        ms_results = SpatialDE.model_search(coords, residuals, significant)
        ms_results.to_csv(model_selection_csv)
        return results, ms_results

    return results
def spatialde_test(adata, coord_columns=['x', 'y'], regress_formula='np.log(total_counts)'):
    ''' Run the SpatialDE test on an AnnData object

    Parameters
    ----------

    adata: An AnnData object with counts in the .X field. The layers
        'stabilized' and 'residual' are added to it as a side effect.

    coord_columns: A list with the columns of adata.obs which represent
        spatial coordinates. Default ['x', 'y'].

    regress_formula: A patsy formula for linearly regressing out fixed
        effects from columns in adata.obs before fitting the SpatialDE
        models. Default is 'np.log(total_counts)'.

    Returns
    -------

    results: A table of spatial statistics for each gene, with zero p-values
        clipped to half the smallest positive p-value and 'qval' computed
        with pi0 = 1.
    '''
    logging.info('Performing VST for NB counts')
    adata.layers['stabilized'] = NaiveDE.stabilize(adata.X.T).T

    logging.info('Regressing out fixed effects')
    adata.layers['residual'] = NaiveDE.regress_out(
        adata.obs, adata.layers['stabilized'].T, regress_formula).T

    # Copy into a fresh list so the shared default argument is never handed
    # to pandas directly and any sequence of column names is accepted.
    X = adata.obs[list(coord_columns)].values
    expr_mat = pd.DataFrame.from_records(adata.layers['residual'],
                                         columns=adata.var.index,
                                         index=adata.obs.index)

    results = run(X, expr_mat)

    # Clip 0 pvalues. Series.clip_lower was removed in pandas 1.0;
    # clip(lower=...) is the equivalent call and works on older versions too.
    min_pval = results.query('pval > 0')['pval'].min() / 2
    results['pval'] = results['pval'].clip(lower=min_pval)

    # Correct for multiple testing
    results['qval'] = qvalue(results['pval'], pi0=1.)

    return results
def Spatial_DE_AEH(filterd_exprs, coordinates, results, pattern_num, l=1.05, verbosity=1):
    """Run automatic expression histology on significant genes.

    Parameters
    ----------
    filterd_exprs:
        Expression table; its row sums are used as ``total_counts``.
    coordinates:
        Table of sample coordinates; ``coordinates.values`` is passed to
        ``SpatialDE.spatial_patterns``. It is not modified.
    results:
        SpatialDE results table with ``pval``, ``qval`` and ``g`` columns.
        It is not modified; clipping is done on a copy.
    pattern_num:
        Number of patterns; converted with ``int``.
    l:
        Forwarded to ``SpatialDE.spatial_patterns`` as ``l``.
    verbosity:
        Forwarded to ``SpatialDE.spatial_patterns``.

    Returns
    -------
    dict with keys ``"histology_results"`` and ``"patterns"`` holding the two
    values returned by ``SpatialDE.spatial_patterns``.
    """
    ## Automatic expression histology
    coordinates_cp = coordinates.copy()
    coordinates_cp['total_counts'] = filterd_exprs.sum(1)

    dfm = NaiveDE.stabilize(filterd_exprs.T).T
    res = NaiveDE.regress_out(coordinates_cp, dfm.T, 'np.log(total_counts)').T

    # Work on a copy so the caller's results table is left untouched, the
    # same way the coordinates are protected above.
    results = results.copy()

    # Replace exact zeros by half of the smallest positive value
    min_pval = results.query('pval > 0')['pval'].min() / 2
    results['pval'] = results['pval'].clip(lower=min_pval)
    min_qval = results.query('qval > 0')['qval'].min() / 2
    results['qval'] = results['qval'].clip(lower=min_qval)

    sres = results.query('qval < 0.05 & g != "log_total_count"').copy()

    X = coordinates.values
    histology_results, patterns = SpatialDE.spatial_patterns(
        X, res, sres, int(pattern_num), l=l, verbosity=verbosity)

    pattern_dic = {"histology_results": histology_results,
                   "patterns": patterns}
    return pattern_dic
def main():
    """Time ``GPclust.MOHGP`` optimisation for increasing numbers of columns.

    Residual expression is computed from ``data/Rep11_MOB_0.csv``. For each
    size in a fixed list, five random column subsets of that size are drawn,
    a model is fitted to each, and the elapsed wall-clock time is recorded.
    The timings are printed as they are measured and finally written to
    ``AEH_times.csv``.
    """
    sample_info = pd.read_csv('MOB_sample_info.csv', index_col=0)

    counts = pd.read_csv('data/Rep11_MOB_0.csv', index_col=0)
    counts = counts.loc[sample_info.index]

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_counts)').T

    X = sample_info[['x', 'y']].values

    timings = pd.DataFrame(columns=['N', 'time'])
    subset_sizes = [50, 100, 200, 300, 500, 750, 1000, 2000]
    repeats = 5

    row = 0
    for size in subset_sizes:
        for _ in range(repeats):
            Y = residuals.sample(size, axis=1).values.T

            started = time()

            model = GPclust.MOHGP(
                X=X,
                Y=Y,
                kernF=kern.RBF(2) + kern.Bias(2),
                kernY=kern.RBF(1) + kern.White(1),
                K=5,
                prior_Z='DP',
            )
            model.hyperparam_opt_args['messages'] = False
            model.optimize(step_length=0.1, verbose=False, maxiter=2000)

            timings.loc[row] = [size, time() - started]
            print(timings.loc[row])
            row += 1

    timings.to_csv('AEH_times.csv')
def main(out_file):
    """Run SpatialDE on a random 80% subset of the Layer2 BC samples.

    Genes with a summed count below 3 and samples with a total count of 5 or
    less are removed before a random 80% of the remaining samples is drawn
    with ``DataFrame.sample(frac=0.8)``.

    Parameters
    ----------
    out_file:
        Path the SpatialDE results are written to as CSV.

    Returns
    -------
    The table returned by ``SpatialDE.run``.
    """
    counts = pd.read_table(
        '../../BreastCancer/data/Layer2_BC_count_matrix-1.tsv', index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(0)
    counts = counts.T[gene_totals >= 3].T

    sample_info = get_coords(counts.index)
    sample_info['total_counts'] = counts.sum(1)
    # Remove empty features
    sample_info = sample_info.query('total_counts > 5')

    # Bootstrap sampling 80% of data
    sample_info = sample_info.sample(frac=0.8)
    counts = counts.loc[sample_info.index]

    coords = sample_info[['x', 'y']]

    stabilized = NaiveDE.stabilize(counts.T).T
    residuals = NaiveDE.regress_out(
        sample_info, stabilized.T, 'np.log(total_counts)').T

    results = SpatialDE.run(coords, residuals)
    results.to_csv(out_file)

    return results
def Spatial_DE(filterd_exprs, coordinates):
    """Run SpatialDE and, if any gene is significant, the model search.

    Parameters
    ----------
    filterd_exprs:
        Expression table with one row per cell.
    coordinates:
        Coordinate table with one row per cell; it is not modified.

    Returns
    -------
    dict with key ``"results"`` and, when at least one gene has
    ``qval < 0.05``, key ``"ms_results"``.

    Exits the interpreter through ``sys.exit`` when the two tables have a
    different number of rows.
    """
    if filterd_exprs.shape[0] != coordinates.shape[0]:
        sys.exit("The number of cells in expression file and location file don't match\n")

    ## results and ms_results
    info = coordinates.copy()
    info['total_counts'] = filterd_exprs.sum(1)

    stabilized = NaiveDE.stabilize(filterd_exprs.T).T
    residuals = NaiveDE.regress_out(
        info, stabilized.T, 'np.log(total_counts)').T
    residuals['log_total_count'] = np.log(info['total_counts'])

    results = SpatialDE.run(coordinates, residuals)
    significant = results[results.qval < 0.05].copy()

    if significant.shape[0] > 0:
        ms_results = SpatialDE.model_search(coordinates, residuals, significant)
        return {"results": results, "ms_results": ms_results}

    print("No spatially variable genes found! \n")
    return {"results": results}
import glob

# Every "*_counts.csv" file has a sibling "*_meta.csv" input and gets a
# "*_spatialDE_results.csv" output; the last 10 characters ("counts.csv")
# of each path are swapped for the other suffixes.
samp_counts = glob.glob(
    '/fastscratch/myscratch/shicks1/HumanPilot/sample_data/by_sample_id/*_counts.csv'
)
samp_meta = np.array([path[:-10] + 'meta.csv' for path in samp_counts])
samp_output = np.array(
    [path[:-10] + 'spatialDE_results.csv' for path in samp_counts])

df = pd.DataFrame({
    'counts': samp_counts,
    'meta': samp_meta,
    'out': samp_output
})

for _, row in df.iterrows():
    # Load counts
    counts = pd.read_csv(row['counts'], index_col=0)

    # Filter practically unobserved genes
    gene_totals = counts.sum(axis=0)
    counts = counts.T[gene_totals >= 5].T

    # Load meta data with spatial coordinates
    sample_info = pd.read_csv(row['meta'], index_col=0)

    # Stabilise, then regress out np.log(sum) taken from the meta data
    norm_expr = NaiveDE.stabilize(counts.T).T
    resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T,
                                     'np.log(sum)').T

    X = sample_info[['imagerow', 'imagecol']]

    # Print the wall-clock time before and after the SpatialDE run
    print(datetime.now().strftime("%H:%M:%S"))
    results = SpatialDE.run(X, resid_expr)
    print(datetime.now().strftime("%H:%M:%S"))

    # Save spatial results
    results.to_csv(row['out'])
    print("Finished =", row['counts'])
import pandas as pd
import NaiveDE
import SpatialDE

# SpatialDE run and model search for one MERFISH section; both result
# tables are written space separated, without the index.
counts = pd.read_csv(
    './processed_data/MERFISH_Animal18_Bregma0.11_countdata.csv',
    index_col=0)

# Keep genes with a summed count of at least 3
gene_totals = counts.sum(0)
counts = counts.T[gene_totals >= 3].T

sample_info = pd.read_csv(
    './processed_data/MERFISH_Animal18_Bregma0.11_info.csv', index_col=0)
sample_info['total_counts'] = counts.sum(1)
counts = counts.loc[sample_info.index]

# Stabilise and regress out np.log(total_counts)
norm_expr = NaiveDE.stabilize(counts.T).T
resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T,
                                 'np.log(total_counts)').T
sample_resid_expr = resid_expr

X = sample_info[['x', 'y']]

results = SpatialDE.run(X, sample_resid_expr)
results.to_csv(
    './output/MERFISH_Animal18_Bregma0.11_spe.csv',
    sep=' ', index=False, header=True)

# Model search on genes with qval < 0.05
de_results = results[results.qval < 0.05].copy()
ms_results = SpatialDE.model_search(X, resid_expr, de_results)
ms_results.to_csv(
    './output/MERFISH_Animal18_Bregma0.11_ms_spe.csv',
    sep=' ', index=False, header=True)
import time

import numpy as np
import pandas as pd  # used throughout this script but absent from its import header
import NaiveDE
import SpatialDE

# Run SpatialDE on every simulated count table (6 noise levels x 10
# replicates) and write one results CSV per table.
info = pd.read_csv("../processed_data/Rep11_MOB_info_scgco.csv", index_col=0)
exp_diff = 1

for noise in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
    for irep in range(10):
        ff = "../processed_data/sim_MOB_expdiff" + str(
            exp_diff) + "_noise" + str(noise) + "_counts" + str(irep) + ".csv"
        print(ff)

        df = pd.read_csv(ff, index_col=0)
        # Keep genes with a summed count of at least 3
        df = df.T[df.sum(0) >= 3].T

        # The same sample table is reused for every simulation; copy it so
        # each iteration starts from the unmodified original.
        sample_info = info.copy()
        X = sample_info[['x', 'y']]

        dfm = NaiveDE.stabilize(df.T).T
        res = NaiveDE.regress_out(sample_info, dfm.T,
                                  'np.log(total_counts)').T
        res['log_total_count'] = np.log(sample_info['total_counts'])

        results = SpatialDE.run(X, res)

        ff = "../spatialde_results/sim_MOB_expdiff" + str(
            exp_diff) + "_noise" + str(noise) + "_counts" + str(
                irep) + "_spe.csv"
        results.to_csv(ff)
def main():
    """Run automatic expression histology on the ``10t`` data and plot it.

    The number of patterns is read from ``sys.argv[1]``. Expression is
    normalised from ``10t.csv`` and previously saved SpatialDE results are
    read from ``10t_final_results.csv``. The histology and pattern tables
    are written to CSV, one scatter plot per pattern is saved as PDF, and
    the genes of each pattern are printed.

    Returns
    -------
    The histology results table returned by ``SpatialDE.spatial_patterns``.
    """
    df = pd.read_csv('10t.csv', index_col=0)
    # Keep genes with a summed count of at least 3
    df = df.T[df.sum(0) >= 3].T

    sample_info = get_coords(df.index)
    sample_info['total_counts'] = df.sum(1)
    # Remove empty features
    sample_info = sample_info.query('total_counts > 10')
    df = df.loc[sample_info.index]

    dfm = NaiveDE.stabilize(df.T).T
    res = NaiveDE.regress_out(sample_info, dfm.T, 'np.log(total_counts)').T
    res['log_total_count'] = np.log(sample_info['total_counts'])

    results = pd.read_csv('10t_final_results.csv', index_col=0)

    # Replace exact zeros by half of the smallest positive value.
    # Series.clip_lower was removed in pandas 1.0; clip(lower=...) is the
    # equivalent call.
    results['pval'] = results['pval'].clip(
        lower=results.query('pval > 0')['pval'].min() / 2)
    results['qval'] = results['qval'].clip(
        lower=results.query('qval > 0')['qval'].min() / 2)

    ymy = int(sys.argv[1])

    sres = results.query('qval < 0.05 & g != "log_total_count"').copy()

    X = sample_info[['x', 'y']].values
    histology_results, patterns = SpatialDE.spatial_patterns(X, res, sres,
                                                             ymy, 11,
                                                             verbosity=1)

    histology_results.to_csv('10t_AEH_results.{}.csv'.format(ymy))
    patterns.add_prefix('pattern_').to_csv(
        '10t_pattern_results.{}.csv'.format(ymy))

    for i, Ci in enumerate(
            histology_results.sort_values('pattern').pattern.unique()):
        fig = plt.figure(figsize=(5, 5))
        plt.scatter(sample_info['x'],
                    sample_info['y'],
                    c=patterns[Ci],
                    s=10,
                    cmap=plt.get_cmap("YlOrBr"),
                    edgecolor="none",
                    marker='s')
        plt.axis([0, 50, 0, 50])
        plt.xlim(0, 50)
        plt.ylim(0, 50)
        plt.xticks([0, 10, 20, 30, 40, 50])
        plt.yticks([0, 10, 20, 30, 40, 50])
        plt.axis('equal')
        plt.gca().invert_yaxis()

        # Count the genes of the pattern actually drawn (Ci). The running
        # index i only coincides with it when pattern ids are contiguous
        # from zero.
        n_genes = histology_results.query('pattern == @Ci').shape[0]
        plt.title('Pattern {} - {} genes'.format(i, n_genes), size=20)
        plt.tight_layout()
        plt.savefig("10t.{}.{}.pdf".format(ymy, i), bbox_inches='tight')

        # Release the figure; otherwise every pattern keeps one open.
        plt.close(fig)

    for i in histology_results.sort_values('pattern').pattern.unique():
        print('Pattern {}'.format(i))
        print(', '.join(
            histology_results.query('pattern == @i').sort_values('membership')
            ['g'].tolist()))
        print()

    return histology_results
import pandas as pd
import numpy as np
import NaiveDE
import SpatialDE
from somde import SomNode

# Load one Slide-seq puck: a count table indexed by its second column and a
# table with the x/y position of each column of the count table.
dataname = '../slideseq_data/Puck_180819_11_'
count_path = dataname+'count.csv'
index_path = dataname+'idx.csv'

df = pd.read_csv(count_path, sep=',', index_col=1)
corinfo = pd.read_csv(index_path, sep=',', index_col=0)
del df['ENSEMBL']
print(df.shape)

corinfo["total_count"] = df.sum(0)

# stabilize and regress_out take gene-by-cell input, whereas run takes
# cell-by-gene input, hence the transpose.
dfm = NaiveDE.stabilize(df)
res = NaiveDE.regress_out(corinfo, dfm, 'np.log(total_count)').T

X = corinfo[['x', 'y']].values.astype(np.float32)

# Build the SOM on the coordinates and process the counts through it.
# NOTE(review): run() is called before norm() here -- confirm SomNode does
# not need the normalisation step first.
som4 = SomNode(X, 20)
ndf, ninfo = som4.mtx(df)
r1, numberq = som4.run()
nres = som4.norm()
som4.view()