raise Exception('cmd err in the argv[2]')

# summary
nz_rate_df = scimpute.nnzero_rate_df(df)
print('df.shape, [gene, cell]:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
print(df.iloc[0:3, 0:3])

# undo log10 transform: 10**value - 1
df = np.power(10, df) - 1
print('after 10^value - 1')
print(df.iloc[:3, :3])

# lib-size per million normalization
df = scimpute.df_normalization(df)
print('after normalization')
print(df.iloc[0:3, 0:3])
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
print('sum_reads_per_gene:', read_per_gene[0:3])
print('sum_reads_per_cell:', read_per_cell[0:3])

# log(tpm+1) transformation
df = scimpute.df_log_transformation(df)
print('after log transformation')
print(df.iloc[0:3, 0:3])

# save
print('saving output to:', outname)
scimpute.save_hd5(df, outname)
print('finished')
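# A minimal sketch of the per-million library-size normalization applied above.
# This is an assumption about what scimpute.df_normalization does, not its actual
# source: each cell (column of the [gene, cell] matrix) is rescaled so its reads
# sum to 1e6 (counts-per-million).
def _cpm_normalization_sketch(df):
    """Hypothetical equivalent of scimpute.df_normalization for a [gene, cell] matrix."""
    read_per_cell = df.sum(axis=0)          # library size of each cell
    return df.div(read_per_cell, axis=1) * 1e6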
# summary
nz_rate_df = scimpute.nnzero_rate_df(df)
print('input matrix.shape:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
print(df.iloc[0:3, 0:3])

# filter
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
df_filtered = df.loc[(read_per_gene >= gene_min), (read_per_cell >= cell_min)]
nz_rate_filtered = scimpute.nnzero_rate_df(df_filtered)
print('filtered matrix: ', df_filtered.shape)
print('nz_rate:', nz_rate_filtered)
print(df_filtered.iloc[0:3, 0:3])
scimpute.save_hd5(df_filtered, tag0 + '.hd5')

# histogram of filtered data
read_per_gene_filtered = df_filtered.sum(axis=1)
read_per_cell_filtered = df_filtered.sum(axis=0)
scimpute.hist_list(read_per_cell_filtered.values, xlab='counts/cell',
                   title='Histogram of counts per cell' + tag)
scimpute.hist_list(read_per_gene_filtered.values, xlab='counts/gene',
                   title='Histogram of counts per gene' + tag)
scimpute.hist_df(df_filtered, xlab='counts',
                 title='Histogram of counts in expression matrix' + tag)

# histogram of log transformed filtered data
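# A minimal sketch of the non-zero-rate summary used above. This is an assumption
# about scimpute.nnzero_rate_df, not its actual source: the fraction of entries in
# the expression matrix that are non-zero.
def _nnzero_rate_sketch(df):
    """Hypothetical equivalent of scimpute.nnzero_rate_df."""
    return float((df != 0).sum().sum()) / df.size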
# Remove .x from ID
# df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*', '').astype(str)
# print('because the index is different, remove the appendix')
# print('big df after changing index', df_big.ix[0:5, 0:5])
print('df_big index is unique? {}'.format(df_big.index.is_unique))
print('df_small index is unique? {}'.format(df_small.index.is_unique))

# SELECT
print('selecting..')
df_selected = df_big.loc[df_small.index]

# Check null, fill zeros
null_gene_num = df_selected.iloc[:, 1].isnull().sum()
print('there are {} genes from small-df not found in big-df'.format(null_gene_num))
df_selected = df_selected.fillna(value=0)
print('those N.A. in selected-df have been filled with zeros')
null_gene_num2 = df_selected.isnull().sum().sum()
print('Now, there are {} null values in selected-df'.format(null_gene_num2))
nz_selected = scimpute.nnzero_rate_df(df_selected)
print('nz_rate output: ', nz_selected)

# Finish
print('selected big df from small df, after fillna:\n', df_selected.iloc[0:5, 0:5])
print('shape: ', df_selected.shape)
scimpute.save_hd5(df_selected, out_name)
print('Finished')
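# The select-then-fillna step above could also be written in one call with pandas
# reindex; a minimal equivalent sketch, assuming df_big has a unique gene-ID index:
# df_selected = df_big.reindex(df_small.index, fill_value=0)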
def late_main(p, log_dir, rand_state=3):
    ##0. read data and extract gene IDs and cell IDs
    input_matrix, gene_ids, cell_ids = read_data(p)

    ##1. split data and save indexes
    # input: p, input_matrix, cell_ids
    # return: cell_ids_train, cell_ids_valid, cell_ids_test
    m, n = input_matrix.shape
    input_train, input_valid, input_test, train_idx, valid_idx, test_idx = \
        scimpute.split__csr_matrix(input_matrix, a=p.a, b=p.b, c=p.c)
    cell_ids_train = cell_ids[train_idx]
    cell_ids_valid = cell_ids[valid_idx]
    cell_ids_test = cell_ids[test_idx]
    np.savetxt('{}/train.{}_index.txt'.format(p.stage, p.stage), cell_ids_train, fmt='%s')
    np.savetxt('{}/valid.{}_index.txt'.format(p.stage, p.stage), cell_ids_valid, fmt='%s')
    np.savetxt('{}/test.{}_index.txt'.format(p.stage, p.stage), cell_ids_test, fmt='%s')
    print('RAM usage after splitting input data is: {} M'.format(usage()))

    # todo: backward compatibility with older parameter files only
    # sample_size is 1000 by default; if sample_size is less than the number of
    # cells (m), the training and validation sets are rebuilt by random sampling.
    try:
        sample_size = p.sample_size
    except AttributeError:
        sample_size = int(9e4)

    if sample_size < m:
        np.random.seed(1)
        rand_idx = np.random.choice(range(len(cell_ids_train)),
                                    min(sample_size, len(cell_ids_train)))
        sample_train = input_train[rand_idx, :].todense()
        sample_train_cell_ids = cell_ids_train[rand_idx]
        rand_idx = np.random.choice(range(len(cell_ids_valid)),
                                    min(sample_size, len(cell_ids_valid)))
        sample_valid = input_valid[rand_idx, :].todense()
        sample_valid_cell_ids = cell_ids_valid[rand_idx]
        # ?? sample_input is sampled randomly from the whole matrix; should it instead
        # be a matrix containing sample_train and sample_valid?
        rand_idx = np.random.choice(range(m), min(sample_size, m))
        sample_input = input_matrix[rand_idx, :].todense()
        sample_input_cell_ids = cell_ids[rand_idx]
        del rand_idx
        gc.collect()
        np.random.seed()
    else:
        sample_input = input_matrix.todense()
        sample_train = input_train.todense()
        sample_valid = input_valid.todense()
        sample_input_cell_ids = cell_ids
        sample_train_cell_ids = cell_ids_train
        sample_valid_cell_ids = cell_ids_valid

    print('len of sample_train: {}, sample_valid: {}, sample_input {}'.format(
        len(sample_train_cell_ids), len(sample_valid_cell_ids), len(sample_input_cell_ids)))
    ##2. model training and validation
    #2.1 init --> keep this in the main
    tf.reset_default_graph()
    # define placeholders and variables
    X = tf.placeholder(tf.float32, [None, n], name='X_input')  # input
    pIn_holder = tf.placeholder(tf.float32, name='p.pIn')  # keep_prob for dropout
    pHidden_holder = tf.placeholder(tf.float32, name='p.pHidden')  # keep_prob for dropout

    #2.2 define layers and variables
    # input: p, X, pIn_holder, pHidden_holder, n
    # return: a_bottleneck, h (d_a1)
    a_bottleneck, h = build_late(X, pHidden_holder, pIn_holder, p, n, rand_state=3)

    #2.3 define loss
    # input: X, h, p
    # return: mse_nz, mse, reg_term
    mse_nz, mse, reg_term = build_metrics(X, h, p.reg_coef)

    #2.4 construct the trainer --> keep this section in the main
    optimizer = tf.train.AdamOptimizer(p.learning_rate)
    if p.mse_mode in ('mse_omega', 'mse_nz'):
        print('training on mse_nz')
        trainer = optimizer.minimize(mse_nz + reg_term)
    elif p.mse_mode == 'mse':
        print('training on mse')
        trainer = optimizer.minimize(mse + reg_term)
    else:
        raise Exception('mse_mode spelled wrong')

    #2.5 init a session according to the run_flag
    sess = tf.Session()
    # restore variables
    saver = tf.train.Saver()
    if p.run_flag == 'load_saved':
        print('*** In TL Mode')
        saver.restore(sess, "./step1/step1.ckpt")
    elif p.run_flag == 'rand_init':
        print('*** In Rand Init Mode')
        init = tf.global_variables_initializer()
        sess.run(init)
    elif p.run_flag == 'impute':
        print('*** In impute mode loading "step2.ckpt"..')
        saver.restore(sess, './step2/step2.ckpt')
        p.max_training_epochs = 0
        p.learning_rate = 0.0
        ## save_whole_imputation
        save_whole_imputation(sess, X, h, a_bottleneck, pIn_holder, pHidden_holder,
                              input_matrix, gene_ids, cell_ids, p, m)
        print('imputation finished')
        # toc_stop = time.time()
        # print("reading took {:.1f} seconds".format(toc_stop - tic_start))
        exit()
    else:
        raise Exception('run_flag err')

    # define tensor_board writers
    batch_writer = tf.summary.FileWriter(log_dir + '/batch', sess.graph)
    valid_writer = tf.summary.FileWriter(log_dir + '/valid', sess.graph)

    # prep mini-batch, and reporter vectors
    num_batch = int(math.floor(len(train_idx) // p.batch_size))  # floor
    epoch_log = []
    mse_nz_batch_vec, mse_nz_valid_vec = [], []  # , mse_nz_train_vec = [], [], []
    mse_batch_vec, mse_valid_vec = [], []  # mse = MSE(X, h)
    # msej_batch_vec, msej_valid_vec = [], []  # msej = MSE(X, h), for gene j, nz cells
    print('RAM usage after building the model is: {} M'.format(usage()))

    epoch = 0
    #2.6 pre-training epoch (0)
    # evaluate the untrained model (baseline MSE before any training step)
    print("Evaluation: epoch{}".format(epoch))
    epoch_log.append(epoch)
    mse_train, mse_nz_train = sess.run([mse, mse_nz],
                                       feed_dict={X: sample_train,
                                                  pHidden_holder: 1.0,
                                                  pIn_holder: 1.0})
    mse_valid, mse_nz_valid = sess.run([mse, mse_nz],
                                       feed_dict={X: sample_valid,
                                                  pHidden_holder: 1.0,
                                                  pIn_holder: 1.0})
    print("mse_nz_train=", round(mse_nz_train, 3), "mse_nz_valid=", round(mse_nz_valid, 3))
    print("mse_train=", round(mse_train, 3), "mse_valid=", round(mse_valid, 3))
    mse_batch_vec.append(mse_train)
    mse_valid_vec.append(mse_valid)
    mse_nz_batch_vec.append(mse_nz_train)
    mse_nz_valid_vec.append(mse_nz_valid)
    #2.7 training epochs (1-)
    for epoch in range(1, p.max_training_epochs + 1):
        tic_cpu, tic_wall = time.clock(), time.time()
        ridx_full = np.random.choice(len(train_idx), len(train_idx), replace=False)

        #2.7.1 train the model on mini-batches
        for i in range(num_batch):
            # x_batch
            indices = np.arange(p.batch_size * i, p.batch_size * (i + 1))
            ridx_batch = ridx_full[indices]
            # x_batch = df1_train.ix[ridx_batch, :]
            x_batch = input_train[ridx_batch, :].todense()
            sess.run(trainer, feed_dict={X: x_batch,
                                         pIn_holder: p.pIn,
                                         pHidden_holder: p.pHidden})
        toc_cpu, toc_wall = time.clock(), time.time()

        #2.7.2 save the results of epoch 1 and of all display steps (epochs)
        if (epoch == 1) or (epoch % p.display_step == 0):
            tic_log = time.time()
            print('#Epoch {} took: {} CPU seconds; {} Wall seconds'.format(
                epoch, round(toc_cpu - tic_cpu, 2), round(toc_wall - tic_wall, 2)))
            print('num-mini-batch per epoch: {}, till now: {}'.format(
                i + 1, epoch * (i + 1)))
            print('RAM usage: {:0.1f} M'.format(usage()))
            # debug
            # print('d_w1', sess.run(d_w1[1, 0:4]))  # verified when GradDescent used

            # training mse and mse_nz of the last batch
            mse_batch, mse_nz_batch, h_batch = sess.run(
                [mse, mse_nz, h],
                feed_dict={X: x_batch, pHidden_holder: 1.0, pIn_holder: 1.0})
            # validation mse and mse_nz of the sampled validation set (1000)
            mse_valid, mse_nz_valid, Y_valid = sess.run(
                [mse, mse_nz, h],
                feed_dict={X: sample_valid, pHidden_holder: 1.0, pIn_holder: 1.0})
            toc_log = time.time()
            print('mse_nz_batch: {}; mse_nz_valid: {}'.format(mse_nz_batch, mse_nz_valid))
            print('mse_batch:', mse_batch, '; mse_valid:', mse_valid)
            print('log time for each epoch: {}\n'.format(round(toc_log - tic_log, 1)))
            mse_batch_vec.append(mse_batch)
            mse_valid_vec.append(mse_valid)
            mse_nz_batch_vec.append(mse_nz_batch)
            mse_nz_valid_vec.append(mse_nz_valid)
            epoch_log.append(epoch)

        #2.7.3 save snapshot step
        if (epoch % p.snapshot_step == 0) or (epoch == p.max_training_epochs):
            tic_log2 = time.time()

            #1. save imputation results
            # if the input matrix is large (m > p.large_size), only save the
            # imputation results of a small sample set (sample_input)
            print("> Impute and save..")
") if m > p.large_size: Y_input_df = fast_imputation(sess, h, X, pIn_holder, pHidden_holder, sample_input, gene_ids, sample_input_cell_ids) scimpute.save_hd5( Y_input_df, "{}/sample_imputation.{}.hd5".format(p.stage, p.stage)) else: Y_input_df = fast_imputation(sess, h, X, pIn_holder, pHidden_holder, input_matrix.todense(), gene_ids, cell_ids) scimpute.save_hd5( Y_input_df, "{}/imputation.{}.hd5".format(p.stage, p.stage)) #2.save model print('> Saving model..') save_path = saver.save(sess, log_dir + "/{}.ckpt".format(p.stage)) print("Model saved in: %s" % save_path) #3.save the training and test curve if p.mse_mode in ('mse_nz', 'mse_omega'): #learning_curve_mse_nz(skip=math.floor(epoch / 5 / p.display_step)) learning_curve_mse_nz(epoch_log, mse_nz_batch_vec, mse_nz_valid_vec, p.stage, skip=math.floor(epoch / 5 / p.display_step)) elif p.mse_mode == 'mse': #learning_curve_mse(skip=math.floor(epoch / 5 / p.display_step)) learning_curve_mse(epoch_log, mse_batch_vec, mse_valid_vec, p.stage, skip=math.floor(epoch / 5 / p.display_step)) #4.save save_bottleneck_representation print("> save bottleneck_representation") code_bottleneck_input = sess.run(a_bottleneck, feed_dict={ X: sample_input, pIn_holder: 1, pHidden_holder: 1 }) np.save('{}/code_neck_valid.{}.npy'.format(p.stage, p.stage), code_bottleneck_input) #save_weights() save_weights(sess, p.stage, en_de_layers=p.l) #visualize_weights() visualize_weights(sess, p.stage, en_de_layers=p.l) toc_log2 = time.time() log2_time = round(toc_log2 - tic_log2, 1) min_mse_valid = min(mse_nz_valid_vec) # os.system( # '''for file in {0}/*npy # do python -u weight_clustmap.py $file {0} # done'''.format(p.stage) # ) print('min_mse_nz_valid till now: {}'.format(min_mse_valid)) print('snapshot_step: {}s'.format(log2_time)) batch_writer.close() valid_writer.close() sess.close()
def save_whole_imputation(sess, X, h, a_bottleneck, pIn_holder, pHidden_holder,
                          input_matrix, gene_ids, cell_ids, p, m):
    '''Calculate and save imputation results for an input matrix in 'impute' mode.

    If the number of cells is larger than a threshold (p.large_size, 1e5), impute
    and save the results in m // p.sample_size 'folds' to limit memory use.

    Parameters
    ----------
    '''
    if m > p.large_size:
        # impute on small data blocks to avoid high memory cost
        n_out_batches = m // p.sample_size
        print('num_out_batches:', n_out_batches)
        handle2 = open('./{}/latent_code.{}.csv'.format(p.stage, p.stage), 'w')
        with open('./{}/imputation.{}.csv'.format(p.stage, p.stage), 'w') as handle:
            for i_ in range(n_out_batches + 1):
                start_idx = i_ * p.sample_size
                end_idx = min((i_ + 1) * p.sample_size, m)
                print('saving:', start_idx, end_idx)
                x_out_batch = input_matrix[start_idx:end_idx, :].todense()
                y_out_batch = sess.run(h, feed_dict={X: x_out_batch,
                                                     pIn_holder: 1,
                                                     pHidden_holder: 1})
                df_out_batch = pd.DataFrame(data=y_out_batch,
                                            columns=gene_ids,
                                            index=cell_ids[range(start_idx, end_idx)])
                latent_code = sess.run(a_bottleneck, feed_dict={X: x_out_batch,
                                                                pIn_holder: 1,
                                                                pHidden_holder: 1})
                latent_code_df = pd.DataFrame(data=latent_code,
                                              index=cell_ids[range(start_idx, end_idx)])
                if i_ == 0:
                    df_out_batch.to_csv(handle, float_format='%.6f')
                    latent_code_df.to_csv(handle2, float_format='%.6f')
                    print('RAM usage during mini-batch imputation and saving output: ',
                          '{} M'.format(usage()))
                else:
                    df_out_batch.to_csv(handle, header=None)
                    latent_code_df.to_csv(handle2, header=None)
        handle2.close()
    else:
        # if m (the number of cells) is not larger than large_size (1e5)
        Y_input_arr = sess.run(h, feed_dict={X: input_matrix.todense(),
                                             pIn_holder: 1,
                                             pHidden_holder: 1})
        # save sample imputation
        Y_input_df = pd.DataFrame(data=Y_input_arr, columns=gene_ids, index=cell_ids)
        latent_code = sess.run(a_bottleneck, feed_dict={X: input_matrix.todense(),
                                                        pIn_holder: 1,
                                                        pHidden_holder: 1})
        latent_code_df = pd.DataFrame(data=latent_code, index=cell_ids)
        print('RAM usage during whole data imputation and saving output: ',
              '{} M'.format(usage()))
        scimpute.save_hd5(Y_input_df,
                          "{}/imputation.{}.hd5".format(p.stage, p.stage))
        scimpute.save_hd5(latent_code_df,
                          "{}/latent_code.{}.hd5".format(p.stage, p.stage))
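# A minimal sketch of the usage() helper whose value is printed above as RAM in M.
# This is an assumption about its behavior, not the actual implementation: peak
# resident set size of the current process, converted from KB to MB (ru_maxrss is
# reported in KB on Linux).
def _usage_sketch():
    import resource
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0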
# read data (so that the working matrix is [gene, sample])
if matrix_type == 'cell_row':
    df = pd.read_hdf(file_name).transpose()
elif matrix_type == 'gene_row':
    df = pd.read_hdf(file_name)

# summary
print('input shape [genes, samples]:', df.shape, df.iloc[0:3, 0:2])
nz_rate_in = scimpute.nnzero_rate_df(df)
print('nz_rate_in: {}'.format(nz_rate_in))

# read list
if list_name.endswith('.csv'):
    list_df = pd.read_csv(list_name, index_col=0, sep='\t', header=None)
elif list_name.endswith('.hd5'):
    list_df = scimpute.read_hd5(list_name)
print('list:', list_df.shape, list_df.index)

# filter
df_yes = df.loc[list_df.index]
overlap = df.index.isin(list_df.index)
df_no = df.loc[~overlap]
print('matrix yes: ', df_yes.shape, df_yes.iloc[0:3, 0:2])
print('matrix no: ', df_no.shape, df_no.iloc[0:3, 0:2])

# output result dataframes
scimpute.save_hd5(df_yes, out_prefix + '_yes.hd5')
scimpute.save_hd5(df_no, out_prefix + '_no.hd5')
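# Note on the list-file format implied above (inferred from the read_csv call, not from
# separate documentation): a headerless, tab-separated file whose first column holds the
# gene IDs to keep, or an .hd5 matrix whose row index holds those IDs.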