def testRNASeq(self):
    """Exercise ``filter_RNASeq`` on data drawn from ``rnaseq_simulation``.

    Checks performed:
      * with default arguments, every filtered value is strictly positive;
      * the entries of the simulated matrix selected by ``d`` average to zero;
      * with ``transform_log1p=False`` the filtered matrix holds fewer exact
        zeros than the simulated matrix.
    """
    import pandas as pd
    from topslam.filtering import filter_RNASeq
    from topslam.simulation.simulate_trajectory import rnaseq_simulation

    n_dims = 30
    # Only the sampling callable is needed; the other return values are unused.
    _xsim, draw_sample, _t, _c, _labels, _seed = rnaseq_simulation(
        n_dims, 4, 2, 1234, split_prob=.5)

    np.random.seed(42)
    simulated, selector = draw_sample()

    # Default filtering: -E < 0 everywhere, i.e. all values are positive.
    filtered = filter_RNASeq(pd.DataFrame(np.exp(simulated)))
    np.testing.assert_array_less(-filtered.values, 0)

    # Same input, without the log1p transform.
    filtered = filter_RNASeq(pd.DataFrame(np.exp(simulated)),
                             transform_log1p=False)
    np.testing.assert_allclose(-simulated[selector].mean(), 0)

    zeros_after = (filtered.values == 0).sum()
    zeros_before = (simulated == 0).sum()
    np.testing.assert_array_less(zeros_after, zeros_before)
def testRNASeq(self):
    """Run ``filter_RNASeq`` on a simulated expression matrix and sanity-check it.

    The simulation comes from ``rnaseq_simulation``; the matrix is
    exponentiated before filtering. Asserts that the default filter output is
    strictly positive, that the entries of the simulated matrix indexed by
    ``d`` have zero mean, and that filtering with ``transform_log1p=False``
    leaves fewer exact zeros than the simulated matrix contains.
    """
    import pandas as pd
    from topslam.filtering import filter_RNASeq
    from topslam.simulation.simulate_trajectory import rnaseq_simulation

    dims = 30
    simulation = rnaseq_simulation(dims, 4, 2, 1234, split_prob=.5)
    # Unpack all six results so a change in arity still fails loudly.
    _xsim, sampler, _t, _c, _labels, _seed = simulation

    np.random.seed(42)
    Y_sim, mask = sampler()
    counts = pd.DataFrame(np.exp(Y_sim))

    # Default settings: all filtered values must be > 0.
    E_default = filter_RNASeq(counts)
    np.testing.assert_array_less(-E_default.values, 0)

    # Build a fresh frame for the second call, as the input is not reused.
    E_raw = filter_RNASeq(pd.DataFrame(np.exp(Y_sim)), transform_log1p=False)
    np.testing.assert_allclose(-Y_sim[mask].mean(), 0)
    np.testing.assert_array_less(
        (E_raw.values == 0).sum(),
        (Y_sim == 0).sum(),
    )
def example_deng(optimize=True, plot=True):
    """Build (and optionally optimize and plot) a model on the Deng dataset.

    The filtered expression matrix is cached as a CSV file under
    ``~/tmp/Deng``. If the cache file exists it is loaded; otherwise the data
    is fetched through ``GPy.util.datasets.singlecell_rna_seq_deng``, filtered
    with ``filter_RNASeq`` and written to the cache.

    :param optimize: when true, run ``optimize_model`` on the created model.
    :param plot: when true, build a ``ManifoldCorrectionTree`` and call
        ``plot_comparison`` on it.
    :returns: the model returned by ``create_model``, with the extra
        attributes ``Ymean``, ``Ystd``, ``data_labels``, ``data_ulabels``,
        ``data``, ``X_init`` and ``dims`` attached.
    """
    import os
    import pandas as pd
    import GPy
    import numpy as np
    from topslam.filtering import filter_RNASeq

    # Reproducibility, BGPLVM has local optima
    np.random.seed(42)

    # This is the process of how we loaded the data:
    ulabels = [
        'Zygote',
        '2-cell embryo',
        'Early 2-cell blastomere',
        'Mid 2-cell blastomere',
        'Late 2-cell blastomere',
        '4-cell blastomere',
        '8-cell blastomere',
        '16-cell blastomere',
        'Early blastocyst cell',
        'Mid blastocyst cell',
        'Late blastocyst cell',
        'fibroblast',
        'adult liver',
    ]

    folder_path = os.path.expanduser('~/tmp/Deng')
    csv_file = os.path.join(folder_path, 'filtered_expression_values.csv')

    if os.path.exists(csv_file):
        print('Loading previous filtered data: {}'.format(csv_file))
        Y_bgplvm = pd.read_csv(csv_file, index_col=[0, 1, 2], header=0)
    else:
        print('Loading data:')
        data = GPy.util.datasets.singlecell_rna_seq_deng()
        if not os.path.exists(folder_path):
            # makedirs instead of mkdir: the parent '~/tmp' may not exist
            # yet, and mkdir cannot create intermediate directories.
            os.makedirs(folder_path)

        Ydata = data['Y'].copy()
        Ydata.columns = Ydata.columns.to_series().apply(str.upper)
        Ydata = Ydata.reset_index().set_index('index', append=True)
        Ydata['labels'] = data['labels'].values
        Ydata = Ydata.set_index('labels', append=True)
        Ydata = Ydata.reorder_levels([0, 2, 1])
        # Select rows in the order given by ulabels, then restore the
        # remaining two index levels.
        Ydata = Ydata.reset_index([0, 2]).loc[ulabels].set_index(
            ['level_0', 'index'], append=True)

        Y = Ydata.copy()
        # Keep only the part of each column name before the first '.'.
        Y.columns = [c.split('.')[0] for c in Y.columns]

        Y_bgplvm = filter_RNASeq(Y)
        print('\nSaving data to tmp file: {}'.format(csv_file))
        Y_bgplvm.to_csv(csv_file)

    labels = Y_bgplvm.index.get_level_values(0).values

    Ymean = Y_bgplvm.values.mean()
    Ystd = Y_bgplvm.values.std()
    # NOTE(review): the normalisation below is done in place on the array
    # returned by ``.values``. If that array is a view, Y_bgplvm (and thus
    # m.data) is normalised as well; if it is read-only (e.g. pandas
    # copy-on-write) these lines raise. Confirm which behaviour is intended.
    Y_m = Y_bgplvm.values
    Y_m -= Ymean
    Y_m /= Ystd

    # Get the labels right for the 8- and 16-cell split experiments: samples
    # whose name (up to the first '-') contains 'split' get '<n> split',
    # all others get '<n>'. Pooled and non-pooled splits share one label.
    for stage, short in (('8-cell blastomere', '8'),
                         ('16-cell blastomere', '16')):
        stage_labels = []
        for sample in Y_bgplvm.loc[stage].index.get_level_values(1):
            sample = sample.split('-')[0]
            if 'split' in sample:
                stage_labels.append(short + ' split')
            else:
                stage_labels.append(short)
        labels[labels == stage] = stage_labels

    # Unique labels in order of first appearance.
    ulabels = []
    for lab in labels:
        if lab not in ulabels:
            ulabels.append(lab)

    short_labels = labels.copy()
    # Short names, positionally matched against ulabels.
    _ulabels_convert = np.array([
        'Z',     # Z
        'E',     # Em
        '2',     # Bm E
        '2',     # Bm M
        '2',     # Bm L
        '4',
        '8',
        '8 s',
        '16',
        '16 s',
        'Bz',    # E
        'Bz',    # M
        'Bz',    # L
        'F',
        'L',
    ])

    short_ulabels = []
    for lab, nlab in zip(ulabels, _ulabels_convert):
        short_labels[short_labels == lab] = nlab
        if nlab not in short_ulabels:
            short_ulabels.append(nlab)

    from topslam.optimization import run_methods, methods, create_model, optimize_model
    X_init, dims = run_methods(Y_m, methods)

    m = create_model(Y_m, X_init, num_inducing=25)
    # Attach normalisation constants, labels and initialisation to the model.
    m.Ymean = Ymean
    m.Ystd = Ystd
    m.data_labels = short_labels
    m.data_ulabels = short_ulabels
    m.data = Y_bgplvm
    m.X_init = X_init
    m.dims = dims

    if optimize:
        optimize_model(m)

    if plot:
        mc = ManifoldCorrectionTree(m)
        plot_comparison(mc, X_init, dims, m.data_labels, m.data_ulabels, 0)

    return m
def example_deng(optimize=True, plot=True):
    """Build (and optionally optimize and plot) a model on the Deng dataset.

    The filtered expression matrix is cached as a CSV file under
    ``~/tmp/Deng``. If the cache file exists it is loaded; otherwise the data
    is fetched through ``GPy.util.datasets.singlecell_rna_seq_deng``, filtered
    with ``filter_RNASeq`` and written to the cache.

    :param optimize: when true, run ``optimize_model`` on the created model.
    :param plot: when true, build a ``ManifoldCorrectionTree`` and call
        ``plot_comparison`` on it.
    :returns: the model returned by ``create_model``, with the extra
        attributes ``Ymean``, ``Ystd``, ``data_labels``, ``data_ulabels``,
        ``data``, ``X_init`` and ``dims`` attached.
    """
    import os
    import pandas as pd
    import GPy
    import numpy as np
    from topslam.filtering import filter_RNASeq

    # Reproducibility, BGPLVM has local optima
    np.random.seed(42)

    # This is the process of how we loaded the data:
    ulabels = [
        'Zygote',
        '2-cell embryo',
        'Early 2-cell blastomere',
        'Mid 2-cell blastomere',
        'Late 2-cell blastomere',
        '4-cell blastomere',
        '8-cell blastomere',
        '16-cell blastomere',
        'Early blastocyst cell',
        'Mid blastocyst cell',
        'Late blastocyst cell',
        'fibroblast',
        'adult liver',
    ]

    folder_path = os.path.expanduser('~/tmp/Deng')
    csv_file = os.path.join(folder_path, 'filtered_expression_values.csv')

    if os.path.exists(csv_file):
        print('Loading previous filtered data: {}'.format(csv_file))
        Y_bgplvm = pd.read_csv(csv_file, index_col=[0, 1, 2], header=0)
    else:
        print('Loading data:')
        data = GPy.util.datasets.singlecell_rna_seq_deng()
        if not os.path.exists(folder_path):
            # makedirs instead of mkdir: the parent '~/tmp' may not exist
            # yet, and mkdir cannot create intermediate directories.
            os.makedirs(folder_path)

        Ydata = data['Y'].copy()
        Ydata.columns = Ydata.columns.to_series().apply(str.upper)
        Ydata = Ydata.reset_index().set_index('index', append=True)
        Ydata['labels'] = data['labels'].values
        Ydata = Ydata.set_index('labels', append=True)
        Ydata = Ydata.reorder_levels([0, 2, 1])
        # Select rows in the order given by ulabels, then restore the
        # remaining two index levels.
        Ydata = Ydata.reset_index([0, 2]).loc[ulabels].set_index(
            ['level_0', 'index'], append=True)

        Y = Ydata.copy()
        # Keep only the part of each column name before the first '.'.
        Y.columns = [c.split('.')[0] for c in Y.columns]

        Y_bgplvm = filter_RNASeq(Y)
        print('\nSaving data to tmp file: {}'.format(csv_file))
        Y_bgplvm.to_csv(csv_file)

    labels = Y_bgplvm.index.get_level_values(0).values

    Ymean = Y_bgplvm.values.mean()
    Ystd = Y_bgplvm.values.std()
    # NOTE(review): the normalisation below is done in place on the array
    # returned by ``.values``. If that array is a view, Y_bgplvm (and thus
    # m.data) is normalised as well; if it is read-only (e.g. pandas
    # copy-on-write) these lines raise. Confirm which behaviour is intended.
    Y_m = Y_bgplvm.values
    Y_m -= Ymean
    Y_m /= Ystd

    # Get the labels right for the 8- and 16-cell split experiments: samples
    # whose name (up to the first '-') contains 'split' get '<n> split',
    # all others get '<n>'. Pooled and non-pooled splits share one label.
    for stage, short in (('8-cell blastomere', '8'),
                         ('16-cell blastomere', '16')):
        stage_labels = []
        for sample in Y_bgplvm.loc[stage].index.get_level_values(1):
            sample = sample.split('-')[0]
            if 'split' in sample:
                stage_labels.append(short + ' split')
            else:
                stage_labels.append(short)
        labels[labels == stage] = stage_labels

    # Unique labels in order of first appearance.
    ulabels = []
    for lab in labels:
        if lab not in ulabels:
            ulabels.append(lab)

    short_labels = labels.copy()
    # Short names, positionally matched against ulabels.
    _ulabels_convert = np.array([
        'Z',     # Z
        'E',     # Em
        '2',     # Bm E
        '2',     # Bm M
        '2',     # Bm L
        '4',
        '8',
        '8 s',
        '16',
        '16 s',
        'Bz',    # E
        'Bz',    # M
        'Bz',    # L
        'F',
        'L',
    ])

    short_ulabels = []
    for lab, nlab in zip(ulabels, _ulabels_convert):
        short_labels[short_labels == lab] = nlab
        if nlab not in short_ulabels:
            short_ulabels.append(nlab)

    from topslam.optimization import run_methods, methods, create_model, optimize_model
    X_init, dims = run_methods(Y_m, methods)

    m = create_model(Y_m, X_init, num_inducing=25)
    # Attach normalisation constants, labels and initialisation to the model.
    m.Ymean = Ymean
    m.Ystd = Ystd
    m.data_labels = short_labels
    m.data_ulabels = short_ulabels
    m.data = Y_bgplvm
    m.X_init = X_init
    m.dims = dims

    if optimize:
        optimize_model(m)

    if plot:
        mc = ManifoldCorrectionTree(m)
        plot_comparison(mc, X_init, dims, m.data_labels, m.data_ulabels, 0)

    return m