rnaseq_train_df = pd.read_table(rnaseq_train, index_col=0)
rnaseq_test_df = pd.read_table(rnaseq_test, index_col=0)

# Determine most variably expressed genes and subset
if subset_mad_genes is not None:
    data_base = os.path.join('..', '0.expression-download', 'data')
    mad_file = os.path.join(data_base, '{}_mad_genes.tsv'.format(dataset))

    mad_genes_df = pd.read_table(mad_file)
    mad_genes = mad_genes_df.iloc[0:subset_mad_genes].gene_id.astype(str)

    rnaseq_train_df = rnaseq_train_df.reindex(mad_genes, axis='columns')
    rnaseq_test_df = rnaseq_test_df.reindex(mad_genes, axis='columns')

# Initialize DataModel class with pancancer data
dm = DataModel(df=rnaseq_train_df, test_df=rnaseq_test_df)
dm.transform(how='zeroone')

# Set seed and list of algorithms for compression
np.random.seed(1234)
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)
algorithms = ['pca', 'ica', 'nmf', 'dae', 'vae']

# Save population of models in specific folder
comp_out_dir = os.path.join(out_dir, 'ensemble_z_matrices',
                            '{}_components_{}'.format(dataset, num_components))
if not os.path.exists(comp_out_dir):
    os.makedirs(comp_out_dir)

reconstruction_results = []
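
# dm.transform(how='zeroone') rescales every gene into the [0, 1] interval
# before model fitting. A minimal standalone sketch of that scaling is below,
# assuming it behaves like scikit-learn's MinMaxScaler; the actual DataModel
# internals may differ.
def zeroone_scale(df):
    """Rescale each column (gene) of df to the [0, 1] range."""
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(df),
                        index=df.index, columns=df.columns)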
expr_file = os.path.join('..', 'data', 'pancan_rnaseq_freeze.tsv.gz')
rnaseq_df = pd.read_table(expr_file, index_col=0)

# Subset x matrix to MAD genes
med_dev = pd.DataFrame(mad(rnaseq_df), index=rnaseq_df.columns)
mad_genes = (
    med_dev.sort_values(by=0, ascending=False)
    .iloc[0:num_genes_kept]
    .index
    .tolist()
)
rnaseq_df = rnaseq_df.loc[:, mad_genes]

# Initialize DataModel class with PanCanAtlas RNAseq
dm = DataModel(df=rnaseq_df)

# Transform the input matrix into a range between zero and one
dm.transform(how='zeroone')

# Fit models
dm.pca(n_components=num_components)
dm.ica(n_components=num_components)
dm.nmf(n_components=num_components)
dm.nn(n_components=num_components,
      model='tybalt',
      loss='binary_crossentropy',
      epochs=int(vae_epochs),
      batch_size=int(vae_batch_size),
      learning_rate=float(vae_lr))
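
# The mad() call above ranks genes by median absolute deviation. An
# equivalent computation in plain pandas, shown here to make the
# gene-selection step concrete; the original import (e.g. statsmodels' mad)
# may also scale by a normalization constant, which does not change the
# ranking.
def median_absolute_deviation(df):
    """Column-wise (per-gene) median absolute deviation from the median."""
    return (df - df.median(axis=0)).abs().median(axis=0)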
def train_models(basename, input_train, input_test, zdim, paramsD, out_dir,
                 num_seeds, shuffle, madfile, num_mad_genes,
                 algorithms=['pca', 'ica', 'nmf', 'dae', 'vae']):
    # Set output directory and file names
    train_dir = os.path.join(out_dir, 'ensemble_z_results',
                             f'{zdim}_components')
    if shuffle:
        train_dir = f'{train_dir}_shuffled'
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    if shuffle:
        file_pre = f'{basename}_{zdim}_components_shuffled_'
    else:
        file_pre = f'{basename}_{zdim}_components_'

    recon_file = os.path.join(train_dir, f'{file_pre}reconstruction.tsv')
    co_file = os.path.join(train_dir, f'{file_pre}sample_corr.tsv.gz')
    co_g_file = os.path.join(train_dir, f'{file_pre}gene_corr.tsv.gz')
    tybalt_hist_file = os.path.join(train_dir,
                                    f'{file_pre}tybalt_training_hist.tsv')
    adage_hist_file = os.path.join(train_dir,
                                   f'{file_pre}adage_training_hist.tsv')

    # Load preprocessed data. We could accept raw data and process it here,
    # but that would repeat the processing needlessly, since this function
    # runs independently for each zdim.
    rnaseq_train_df = read_counts_or_params(input_train)
    rnaseq_test_df = read_counts_or_params(input_test)

    # Determine most variably expressed genes and subset
    if madfile is not None:
        mad_genes_df = read_counts_or_params(madfile)
        mad_genes = mad_genes_df.iloc[0:num_mad_genes].index.astype(str)
        rnaseq_train_df = rnaseq_train_df.reindex(mad_genes, axis='columns')
        rnaseq_test_df = rnaseq_test_df.reindex(mad_genes, axis='columns')

    # Initialize DataModel class; normalization happens in transform(), so
    # there is no need to feed in pre-normalized data
    dm = DataModel(df=rnaseq_train_df, test_df=rnaseq_test_df)
    dm.transform(how='zeroone')

    # Set seed and draw the population of random seeds for compression
    np.random.seed(1234)
    random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

    # Save population of models in a specific folder
    comp_out_dir = os.path.join(out_dir, 'ensemble_z_matrices',
                                f'{basename}_components_{zdim}')
    if not os.path.exists(comp_out_dir):
        os.makedirs(comp_out_dir)

    reconstruction_results = []
    test_reconstruction_results = []
    sample_correlation_results = []
    tybalt_training_histories = []
    adage_training_histories = []

    for seed in random_seeds:
        np.random.seed(seed)
        seed_file = os.path.join(comp_out_dir, f'model_{seed}')

        if shuffle:
            seed_file = f'{seed_file}_shuffled'

            # Randomly permute the genes of each sample in the rnaseq matrix
            shuf_df = rnaseq_train_df.apply(
                lambda x: np.random.permutation(x.tolist()), axis=1)

            # Set up a new pandas DataFrame
            shuf_df = pd.DataFrame(shuf_df, columns=['gene_list'])
            shuf_df = pd.DataFrame(shuf_df.gene_list.values.tolist(),
                                   columns=rnaseq_train_df.columns,
                                   index=rnaseq_train_df.index)

            # Initialize a new DataModel, with different shuffling each
            # permutation
            dm = DataModel(df=shuf_df, test_df=rnaseq_test_df)
            dm.transform(how='zeroone')

        # Fit models
        if "pca" in algorithms:
            sys.stdout.write("\nrunning pca\n")
            dm.pca(n_components=zdim, transform_test_df=True)
        if "ica" in algorithms:
            sys.stdout.write("\nrunning ica\n")
            dm.ica(n_components=zdim, transform_test_df=True)
        if "nmf" in algorithms:
            sys.stdout.write("\nrunning nmf\n")
            dm.nmf(n_components=zdim, transform_test_df=True)
        if "vae" in algorithms:
            # Run the tybalt variational autoencoder
            sys.stdout.write("\nrunning tybalt vae\n")
            dm.nn(n_components=zdim,
                  model='tybalt',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['vae_epochs']),
                  batch_size=int(paramsD['vae_batch_size']),
                  learning_rate=float(paramsD['vae_lr']),
                  separate_loss=True,
                  verbose=True,
                  transform_test_df=True)
        if "dae" in algorithms:
            # Run the adage denoising autoencoder
            sys.stdout.write("\nrunning adage dae\n")
            dm.nn(n_components=zdim,
                  model='adage',
                  loss='binary_crossentropy',
                  epochs=int(paramsD['dae_epochs']),
                  batch_size=int(paramsD['dae_batch_size']),
                  learning_rate=float(paramsD['dae_lr']),
                  noise=float(paramsD['dae_noise']),
                  sparsity=float(paramsD['dae_sparsity']),
                  verbose=True,
                  transform_test_df=True)

        # Obtain z matrix (sample scores per latent space feature) for all
        # models
        full_z_file = f'{seed_file}_z_matrix.tsv.gz'
        dm.combine_models().to_csv(full_z_file, sep='\t', compression='gzip')

        full_test_z_file = f'{seed_file}_z_test_matrix.tsv.gz'
        sys.stdout.write("\ncombining trained models\n")
        dm.combine_models(test_set=True).to_csv(full_test_z_file, sep='\t',
                                                compression='gzip')

        # Obtain weight matrices (gene by latent space feature) for all models
        full_weight_file = f'{seed_file}_weight_matrix.tsv.gz'
        dm.combine_weight_matrix().to_csv(full_weight_file, sep='\t',
                                          compression='gzip')

        # Store reconstruction costs and reconstructed input at training end
        sys.stdout.write("\ncompiling reconstruction costs\n")
        full_reconstruction, reconstructed_matrices = dm.compile_reconstruction()

        # Store reconstruction evaluation and data for the test set
        full_test_recon, test_recon_mat = dm.compile_reconstruction(
            test_set=True)

        # Get correlations across samples and genes between input and output
        pearson_corr = []
        spearman_corr = []
        pearson_corr_test = []
        spearman_corr_test = []
        sys.stdout.write(
            "\ncalculating correlations across samples and genes\n")
        for algorithm in algorithms:
            # Training sample correlations (f-strings require Python >= 3.6)
            sys.stdout.write(
                f"training: calculating pearson correlation for {algorithm}\n")
            pearson_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))
            sys.stdout.write(
                f"training: calculating spearman correlation for {algorithm}\n")
            spearman_corr.append(
                get_recon_correlation(df=dm.df,
                                      recon_mat_dict=reconstructed_matrices,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

            # Testing sample correlations
            sys.stdout.write(
                f"testing: calculating pearson correlation for {algorithm}\n")
            pearson_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='pearson',
                                      genes=False))
            sys.stdout.write(
                f"testing: calculating spearman correlation for {algorithm}\n")
            spearman_corr_test.append(
                get_recon_correlation(df=dm.test_df,
                                      recon_mat_dict=test_recon_mat,
                                      algorithm=algorithm,
                                      cor_type='spearman',
                                      genes=False))

        # Training - sample correlations between input and reconstruction
        sys.stdout.write(
            f"training: compiling sample correlations for {algorithms}\n")
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr,
                            spearman_list=spearman_corr,
                            algorithm_list=algorithms,
                            column_names=dm.df.index,
                            seed=seed,
                            data_type='training'))

        # Testing - sample correlations between input and reconstruction
        sys.stdout.write(
            f"testing: compiling sample correlations for {algorithms}\n")
        sample_correlation_results.append(
            compile_corr_df(pearson_list=pearson_corr_test,
                            spearman_list=spearman_corr_test,
                            algorithm_list=algorithms,
                            column_names=dm.test_df.index,
                            seed=seed,
                            data_type='testing'))

        # Store training histories and intermediate results for the neural
        # networks
        reconstruction_results.append(
            full_reconstruction.assign(seed=seed, shuffled=shuffle))
        test_reconstruction_results.append(
            full_test_recon.assign(seed=seed, shuffled=shuffle))
        tybalt_training_histories.append(
            dm.tybalt_fit.history_df.assign(seed=seed, shuffle=shuffle))
        adage_training_histories.append(
            dm.adage_fit.history_df.assign(seed=seed, shuffle=shuffle))

    # Save reconstruction and neural network training results
    pd.concat([
        pd.concat(reconstruction_results).assign(data_type='training'),
        pd.concat(test_reconstruction_results).assign(data_type='testing')
    ]).reset_index(drop=True).to_csv(recon_file, sep='\t', index=False)
    pd.concat(sample_correlation_results).to_csv(co_file, sep='\t',
                                                 index=False,
                                                 float_format='%.3f',
                                                 compression='gzip')
    (pd.concat(tybalt_training_histories)
     .reset_index()
     .rename({'index': 'epoch'}, axis='columns')
     .to_csv(tybalt_hist_file, sep='\t', index=False))
    (pd.concat(adage_training_histories)
     .reset_index()
     .rename({'index': 'epoch'}, axis='columns')
     .to_csv(adage_hist_file, sep='\t', index=False))
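
# A hypothetical invocation of train_models() for a single latent
# dimensionality. Every path and hyperparameter below is a placeholder for
# illustration, not the project's actual configuration.
def _example_train_models_call():
    params = {'vae_epochs': 50, 'vae_batch_size': 50, 'vae_lr': 0.0005,
              'dae_epochs': 50, 'dae_batch_size': 50, 'dae_lr': 0.0005,
              'dae_noise': 0.05, 'dae_sparsity': 0}
    train_models(basename='example',
                 input_train='train_counts.tsv',
                 input_test='test_counts.tsv',
                 zdim=8,
                 paramsD=params,
                 out_dir='results',
                 num_seeds=5,
                 shuffle=False,
                 madfile='mad_genes.tsv',
                 num_mad_genes=8000)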
# In[6]:


# Split into training and testing sets
# (for compatibility with tybalt.DataModel)
split_prop = 0.05
test_samples = random.sample(range(0, data_df.shape[0]),
                             int(data_df.shape[0] * split_prop))

test_df = data_df.iloc[test_samples, :]
train_df = data_df.drop(test_df.index, axis="index")


# In[7]:


# Initialize DataModel class with the input data
dm = DataModel(df=train_df, test_df=test_df)
dm.transform(how='zeroone')


# In[8]:


# Parameters selected to be similar to the real data parameter sweep
epochs = 25
batch_size = 50
vae_learning_rate = 0.0015
dae_learning_rate = 0.0005
dae_noise = 0.01
dae_sparsity = 0


# In[9]:


# Loop over the latent dimensionalities
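# A hypothetical sketch of the loop this cell announces; the dimensionality
# list and exact fit calls below are assumptions modeled on the parameters
# defined in cell In[8] and on the fit calls used elsewhere in this
# repository.
#
# for num_components in [2, 4, 8, 16, 32]:
#     dm.pca(n_components=num_components, transform_test_df=True)
#     dm.ica(n_components=num_components, transform_test_df=True)
#     dm.nmf(n_components=num_components, transform_test_df=True)
#     dm.nn(n_components=num_components, model='tybalt',
#           loss='binary_crossentropy', epochs=epochs, batch_size=batch_size,
#           learning_rate=vae_learning_rate, verbose=False,
#           transform_test_df=True)
#     dm.nn(n_components=num_components, model='adage',
#           loss='binary_crossentropy', epochs=epochs, batch_size=batch_size,
#           learning_rate=dae_learning_rate, noise=dae_noise,
#           sparsity=dae_sparsity, verbose=False, transform_test_df=True)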
vae_learning_rate = float(args.vae_learning_rate)
adage_learning_rate = float(args.adage_learning_rate)
adage_noise = float(args.adage_noise)
loss = args.loss
verbose = args.verbose

# Load and process data
sim_df = pd.read_table(data_file)
select_columns = range(0, sim_df.shape[1] - 1)
groups = sim_df['groups']
gene_modules = sim_df.iloc[0, range(0, sim_df.shape[1] - 1)]
sim_sub_df = sim_df.iloc[range(1, sim_df.shape[0]), :]

data_model = DataModel(df=sim_sub_df,
                       select_columns=select_columns,
                       gene_modules=gene_modules)
data_model.transform(how=how_transform)

# Real target output
real_group_data = np.array(
    pd.concat([data_model.df, data_model.other_df], axis=1)
    .query('groups == "B"')
    .drop('groups', axis=1))

# Get random seeds
random_seeds = np.random.randint(0, high=1000000, size=num_seeds)

all_results = []
for seed in random_seeds:
    data_model.pca(n_components=n_components)
    data_model.ica(n_components=n_components)
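    # The real_group_data extraction above concatenates the expression
    # columns with the 'groups' labels, keeps only group "B" samples, and
    # drops the label column. A toy illustration of the same pandas pattern
    # (all names below are illustrative, not from this script):
    #
    #   toy_expr = pd.DataFrame({'gene_1': [0.1, 0.9], 'gene_2': [0.4, 0.2]})
    #   toy_groups = pd.DataFrame({'groups': ['A', 'B']})
    #   toy_b = (pd.concat([toy_expr, toy_groups], axis=1)
    #            .query('groups == "B"')
    #            .drop('groups', axis=1))
    #   np.array(toy_b)  # -> array([[0.9, 0.2]])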