Example #1
    raise Exception('cmd err: unexpected value in argv[2]')

# summary
nz_rate_df = scimpute.nnzero_rate_df(df)
print('df.shape, [gene, cell]:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
print(df.iloc[0:3, 0:3])

# undo the earlier log10 transform: 10^x - 1
df = np.power(10, df) - 1
print('after 10^x - 1')
print(df.iloc[:3, :3])

# lib-size per million normalization
df = scimpute.df_normalization(df)
print('after normalization')
print(df.iloc[0:3, 0:3])
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
print('sum_reads_per_gene:', read_per_gene[0:3])
print('sum_reads_per_cell:', read_per_cell[0:3])

# log(tpm+1) transformation
df = scimpute.df_log_transformation(df)
print('after log transformation')
print(df.iloc[0:3, 0:3])

# save
print('saving output to:', outname)
scimpute.save_hd5(df, outname)
print('finished')
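
# For readers without the scimpute helpers: a minimal sketch of what the two
# transformation steps above are assumed to compute (per-cell counts-per-million
# scaling and log10(x + 1)); the real implementations may differ.
import numpy as np

def cpm_normalize_sketch(df):
    # hypothetical stand-in for scimpute.df_normalization:
    # scale each cell (column) so its counts sum to one million
    return df.div(df.sum(axis=0), axis=1) * 1e6

def log_transform_sketch(df):
    # hypothetical stand-in for scimpute.df_log_transformation:
    # log10(x + 1), the inverse of the 10^x - 1 step above
    return np.log10(df + 1)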
Example #2
# summary
nz_rate_df = scimpute.nnzero_rate_df(df)
print('input matrix.shape:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
print(df.iloc[0:3, 0:3])

# filter #
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
df_filtered = df.loc[(read_per_gene >= gene_min), (read_per_cell >= cell_min)]
nz_rate_filtered = scimpute.nnzero_rate_df(df_filtered)
print('filtered matrix:', df_filtered.shape)
print('nz_rate:', nz_rate_filtered)
print(df_filtered.iloc[0:3, 0:3])
scimpute.save_hd5(df_filtered, tag0 + '.hd5')
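
# Toy demonstration of the two-axis boolean filter above (hypothetical numbers;
# gene_min=3 and cell_min=5 here are illustrative only):
import pandas as pd
toy = pd.DataFrame([[0, 5], [3, 0], [1, 1]],
                   index=['g1', 'g2', 'g3'], columns=['c1', 'c2'])
toy_kept = toy.loc[toy.sum(axis=1) >= 3, toy.sum(axis=0) >= 5]
# genes g1 (sum 5) and g2 (sum 3) pass gene_min=3; only cell c2 (sum 6)
# passes cell_min=5, so toy_kept has shape (2, 1)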

# histogram of filtered data
read_per_gene_filtered = df_filtered.sum(axis=1)
read_per_cell_filtered = df_filtered.sum(axis=0)
scimpute.hist_list(read_per_cell_filtered.values,
                   xlab='counts/cell',
                   title='Histogram of counts per cell' + tag)
scimpute.hist_list(read_per_gene_filtered.values,
                   xlab='counts/gene',
                   title='Histogram of counts per gene' + tag)
scimpute.hist_df(df_filtered,
                 xlab='counts',
                 title='Histogram of counts in expression matrix' + tag)

# histogram of log transformed filtered data
Example #3
# Remove the version suffix (.x) from gene IDs
# df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*','').astype(str)
# print('because the indexes differ, remove the version suffix')
# print('big df after changing index', df_big.iloc[0:5, 0:5])

print('df_big index is unique? {}'.format(df_big.index.is_unique))
print('df_small index is unique? {}'.format(df_small.index.is_unique))

# SELECT
print('selecting..')
df_selected = df_big.reindex(df_small.index)  # rows absent from df_big become NaN
# Check null, fill zeros
null_gene_num = df_selected.iloc[:, 1].isnull().sum()
print('there are {} genes from small-df not found in big-df'.format(
    null_gene_num))
df_selected = df_selected.fillna(value=0)
print('N.A. values in selected-df have been filled with zeros')
null_gene_num2 = df_selected.isnull().sum().sum()
print('Now, there are {} null values in selected-df'.format(null_gene_num2))
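
# Minimal illustration of the reindex + fillna pattern used above
# (hypothetical frames, not part of the pipeline):
import pandas as pd
demo_big = pd.DataFrame({'c1': [1, 2]}, index=['geneA', 'geneB'])
demo_small_index = pd.Index(['geneA', 'geneC'])
demo_sel = demo_big.reindex(demo_small_index)  # geneC missing -> NaN row
demo_sel = demo_sel.fillna(value=0)            # NaN row becomes zeros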

nz_selected = scimpute.nnzero_rate_df(df_selected)
print('nz_rate output: ', nz_selected)

# Finish
print('selected big df from small df, after fillna:\n', df_selected.iloc[0:5, 0:5])
print('shape: ', df_selected.shape)

scimpute.save_hd5(df_selected, out_name)
print('Finished')
Example #4
def late_main(p, log_dir, rand_state=3):

    ##0. read data and extract gene IDs and cell IDs
    input_matrix, gene_ids, cell_ids = read_data(p)

    ##1. split data and save indexes
    #input p, input_matrix, cell_ids
    #return cell_ids_train, cell_ids_valid, cell_ids_test
    m, n = input_matrix.shape
    input_train, input_valid, input_test, train_idx, valid_idx, test_idx = \
        scimpute.split__csr_matrix(input_matrix, a=p.a, b=p.b, c=p.c)

    cell_ids_train = cell_ids[train_idx]
    cell_ids_valid = cell_ids[valid_idx]
    cell_ids_test = cell_ids[test_idx]

    np.savetxt('{}/train.{}_index.txt'.format(p.stage, p.stage),
               cell_ids_train,
               fmt='%s')
    np.savetxt('{}/valid.{}_index.txt'.format(p.stage, p.stage),
               cell_ids_valid,
               fmt='%s')
    np.savetxt('{}/test.{}_index.txt'.format(p.stage, p.stage),
               cell_ids_test,
               fmt='%s')

    print('RAM usage after splitting input data is: {} M'.format(usage()))
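
    # For readers without scimpute: split__csr_matrix is assumed to partition
    # the rows into train/valid/test sets by the fractions a, b, c, roughly:
    #   idx = np.random.permutation(m)
    #   train_idx = idx[:int(m * p.a)]
    #   valid_idx = idx[int(m * p.a):int(m * (p.a + p.b))]
    #   test_idx  = idx[int(m * (p.a + p.b)):]
    # (the real helper may shuffle and round differently)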

    # backward compatibility: older parameter files may lack sample_size,
    # so fall back to 9e4. If sample_size < m (the number of cells), the
    # evaluation sets below are built by random sampling.
    sample_size = getattr(p, 'sample_size', int(9e4))

    if sample_size < m:
        np.random.seed(1)
        # note: np.random.choice samples with replacement by default
        rand_idx = np.random.choice(range(len(cell_ids_train)),
                                    min(sample_size, len(cell_ids_train)))
        sample_train = input_train[rand_idx, :].todense()
        sample_train_cell_ids = cell_ids_train[rand_idx]

        rand_idx = np.random.choice(range(len(cell_ids_valid)),
                                    min(sample_size, len(cell_ids_valid)))
        sample_valid = input_valid[rand_idx, :].todense()
        sample_valid_cell_ids = cell_ids_valid[rand_idx]
        # note: sample_input is drawn from the whole matrix, so it is not
        # simply the union of sample_train and sample_valid
        rand_idx = np.random.choice(range(m), min(sample_size, m))
        sample_input = input_matrix[rand_idx, :].todense()
        sample_input_cell_ids = cell_ids[rand_idx]

        del rand_idx
        gc.collect()
        np.random.seed()
    else:
        sample_input = input_matrix.todense()
        sample_train = input_train.todense()
        sample_valid = input_valid.todense()
        sample_input_cell_ids = cell_ids
        sample_train_cell_ids = cell_ids_train
        sample_valid_cell_ids = cell_ids_valid

    print('len of sample_train: {}, sample_valid: {}, sample_input {}'.format(
        len(sample_train_cell_ids), len(sample_valid_cell_ids),
        len(sample_input_cell_ids)))

    ##2. model training and validation
    #2.1 init --> keep this in the main
    tf.reset_default_graph()  # TF1-style graph mode (tf.compat.v1 under TF2)
    # define placeholders and variables
    X = tf.placeholder(tf.float32, [None, n], name='X_input')  # input
    pIn_holder = tf.placeholder(tf.float32,
                                name='p.pIn')  #keep_prob for dropout
    pHidden_holder = tf.placeholder(tf.float32,
                                    name='p.pHidden')  #keep_prob for dropout

    #2.2 define layers and variables
    # input p, X, pIn_holder, pHidden_holder, n
    # return a_bottleneck, h(d_a1)
    a_bottleneck, h = build_late(X,
                                 pHidden_holder,
                                 pIn_holder,
                                 p,
                                 n,
                                 rand_state=3)

    #2.3 define loss
    # input X, h, p
    # return mse_nz, mse, reg_term
    mse_nz, mse, reg_term = build_metrics(X, h, p.reg_coef)
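
    # build_metrics is assumed to compute, roughly (the real helper may differ):
    #   omega = tf.sign(X)                          # 1 where X > 0, else 0
    #   mse_nz = tf.reduce_sum(tf.square(X - h) * omega) / tf.reduce_sum(omega)
    #   mse = tf.reduce_mean(tf.square(X - h))
    #   reg_term = reg_coef * (sum of weight norms)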

    #2.4 construct the trainer --> keep this section in the main
    optimizer = tf.train.AdamOptimizer(p.learning_rate)
    if p.mse_mode in ('mse_omega', 'mse_nz'):
        print('training on mse_nz')
        trainer = optimizer.minimize(mse_nz + reg_term)
    elif p.mse_mode == 'mse':
        print('training on mse')
        trainer = optimizer.minimize(mse + reg_term)
    else:
        raise Exception('unrecognized mse_mode: {}'.format(p.mse_mode))

    #2.5 Init a session according to the run_flag
    sess = tf.Session()
    # restore variables
    saver = tf.train.Saver()
    if p.run_flag == 'load_saved':
        print('*** In TL Mode')
        saver.restore(sess, "./step1/step1.ckpt")
    elif p.run_flag == 'rand_init':
        print('*** In Rand Init Mode')
        init = tf.global_variables_initializer()
        sess.run(init)
    elif p.run_flag == 'impute':
        print('*** In impute mode, loading "step2.ckpt"..')
        saver.restore(sess, './step2/step2.ckpt')
        p.max_training_epochs = 0
        p.learning_rate = 0.0

        ## save_whole_imputation
        save_whole_imputation(sess, X, h, a_bottleneck, pIn_holder,
                              pHidden_holder, input_matrix, gene_ids, cell_ids,
                              p, m)
        print('imputation finished')
        #toc_stop = time.time()
        #print("reading took {:.1f} seconds".format(toc_stop - tic_start))
        exit()
    else:
        raise Exception('run_flag err')

    # define tensor_board writer
    batch_writer = tf.summary.FileWriter(log_dir + '/batch', sess.graph)
    valid_writer = tf.summary.FileWriter(log_dir + '/valid', sess.graph)

    # prep mini-batches and reporter vectors
    num_batch = len(train_idx) // p.batch_size  # floor; drops the last partial batch
    epoch_log = []
    mse_nz_batch_vec, mse_nz_valid_vec = [], []
    mse_batch_vec, mse_valid_vec = [], []  # mse = MSE(X, h)
    #msej_batch_vec, msej_valid_vec = [], []  # msej = MSE(X, h), for genej, nz_cells
    print('RAM usage after building the model is: {} M'.format(usage()))

    epoch = 0
    #2.6 pre-training evaluation (epoch 0)
    #record baseline losses before any training step
    print("Evaluation: epoch{}".format(epoch))
    epoch_log.append(epoch)
    mse_train, mse_nz_train = sess.run([mse, mse_nz],
                                       feed_dict={
                                           X: sample_train,
                                           pHidden_holder: 1.0,
                                           pIn_holder: 1.0
                                       })
    mse_valid, mse_nz_valid = sess.run([mse, mse_nz],
                                       feed_dict={
                                           X: sample_valid,
                                           pHidden_holder: 1.0,
                                           pIn_holder: 1.0
                                       })
    print("mse_nz_train=", round(mse_nz_train, 3), "mse_nz_valid=",
          round(mse_nz_valid, 3))
    print("mse_train=", round(mse_train, 3), "mse_valid=", round(mse_valid, 3))
    mse_batch_vec.append(mse_train)
    mse_valid_vec.append(mse_valid)
    mse_nz_batch_vec.append(mse_nz_train)
    mse_nz_valid_vec.append(mse_nz_valid)

    #2.7. training epochs (1-)
    for epoch in range(1, p.max_training_epochs + 1):
        tic_cpu, tic_wall = time.process_time(), time.time()  # time.clock() was removed in Python 3.8

        ridx_full = np.random.choice(len(train_idx),
                                     len(train_idx),
                                     replace=False)

        #2.7.1 training model on mini-batches
        for i in range(num_batch):
            # x_batch
            indices = np.arange(p.batch_size * i, p.batch_size * (i + 1))
            ridx_batch = ridx_full[indices]
            # x_batch = df1_train.ix[ridx_batch, :]
            x_batch = input_train[ridx_batch, :].todense()

            sess.run(trainer,
                     feed_dict={
                         X: x_batch,
                         pIn_holder: p.pIn,
                         pHidden_holder: p.pHidden
                     })

        toc_cpu, toc_wall = time.process_time(), time.time()

        #2.7.2 save the results of epoch 1 and all display steps (epochs)
        if (epoch == 1) or (epoch % p.display_step == 0):
            tic_log = time.time()

            print(
                '#Epoch  {}  took:  {}  CPU seconds;  {} Wall seconds'.format(
                    epoch, round(toc_cpu - tic_cpu, 2),
                    round(toc_wall - tic_wall, 2)))
            print('num-mini-batch per epoch: {}, till now: {}'.format(
                i + 1, epoch * (i + 1)))
            print('RAM usage: {:0.1f} M'.format(usage()))

            # debug
            # print('d_w1', sess.run(d_w1[1, 0:4]))  # verified when GradDescent used

            # training mse and mse_nz of the last batch
            mse_batch, mse_nz_batch, h_batch = sess.run([mse, mse_nz, h],
                                                        feed_dict={
                                                            X: x_batch,
                                                            pHidden_holder:
                                                            1.0,
                                                            pIn_holder: 1.0
                                                        })
            # validation mse and mse_nz on the sampled validation set
            mse_valid, mse_nz_valid, Y_valid = sess.run([mse, mse_nz, h],
                                                        feed_dict={
                                                            X: sample_valid,
                                                            pHidden_holder:
                                                            1.0,
                                                            pIn_holder: 1.0
                                                        })

            toc_log = time.time()

            print('mse_nz_batch: {};  mse_nz_valid: {}'.format(
                mse_nz_batch, mse_nz_valid))
            print('mse_batch:', mse_batch, '; mse_valid:', mse_valid)
            print('log time for each epoch: {}\n'.format(
                round(toc_log - tic_log, 1)))

            mse_batch_vec.append(mse_batch)
            mse_valid_vec.append(mse_valid)
            mse_nz_batch_vec.append(mse_nz_batch)
            mse_nz_valid_vec.append(mse_nz_valid)
            epoch_log.append(epoch)

        #2.7.3 save snapshot step
        if (epoch % p.snapshot_step == 0) or (epoch == p.max_training_epochs):
            tic_log2 = time.time()

            #1.save imputation results
            #if the input matrix is large (m > p.large_size), only save the
            #imputation results of a small sample set (sample_input)
            print("> Impute and save.. ")
            if m > p.large_size:
                Y_input_df = fast_imputation(sess, h, X, pIn_holder,
                                             pHidden_holder, sample_input,
                                             gene_ids, sample_input_cell_ids)
                scimpute.save_hd5(
                    Y_input_df,
                    "{}/sample_imputation.{}.hd5".format(p.stage, p.stage))
            else:
                Y_input_df = fast_imputation(sess, h, X, pIn_holder,
                                             pHidden_holder,
                                             input_matrix.todense(), gene_ids,
                                             cell_ids)
                scimpute.save_hd5(
                    Y_input_df,
                    "{}/imputation.{}.hd5".format(p.stage, p.stage))
            #2.save model
            print('> Saving model..')
            save_path = saver.save(sess, log_dir + "/{}.ckpt".format(p.stage))
            print("Model saved in: %s" % save_path)

            #3.save the training and test curve
            if p.mse_mode in ('mse_nz', 'mse_omega'):
                #learning_curve_mse_nz(skip=math.floor(epoch / 5 / p.display_step))
                learning_curve_mse_nz(epoch_log,
                                      mse_nz_batch_vec,
                                      mse_nz_valid_vec,
                                      p.stage,
                                      skip=math.floor(epoch / 5 /
                                                      p.display_step))
            elif p.mse_mode == 'mse':
                #learning_curve_mse(skip=math.floor(epoch / 5 / p.display_step))
                learning_curve_mse(epoch_log,
                                   mse_batch_vec,
                                   mse_valid_vec,
                                   p.stage,
                                   skip=math.floor(epoch / 5 / p.display_step))

            #4.save the bottleneck representation
            print("> save bottleneck_representation")
            code_bottleneck_input = sess.run(a_bottleneck,
                                             feed_dict={
                                                 X: sample_input,
                                                 pIn_holder: 1,
                                                 pHidden_holder: 1
                                             })
            np.save('{}/code_neck_valid.{}.npy'.format(p.stage, p.stage),
                    code_bottleneck_input)

            #save_weights()
            save_weights(sess, p.stage, en_de_layers=p.l)

            #visualize_weights()
            visualize_weights(sess, p.stage, en_de_layers=p.l)

            toc_log2 = time.time()

            log2_time = round(toc_log2 - tic_log2, 1)
            min_mse_nz_valid = min(mse_nz_valid_vec)

            # os.system(
            #     '''for file in {0}/*npy
            #     do python -u weight_clustmap.py $file {0}
            #     done'''.format(p.stage)
            # )
            print('min_mse_nz_valid till now: {}'.format(min_mse_nz_valid))
            print('snapshot_step: {}s'.format(log2_time))

    batch_writer.close()
    valid_writer.close()
    sess.close()
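
# Hypothetical driver for late_main (not in the original snippet), assuming a
# parameter module that exposes the attributes used above (a, b, c, batch_size,
# learning_rate, mse_mode, run_flag, stage, max_training_epochs, display_step,
# snapshot_step, ...):
if __name__ == '__main__':
    import importlib
    import sys

    p = importlib.import_module(sys.argv[1])  # e.g. 'params_late'
    late_main(p, log_dir='./{}_log'.format(p.stage), rand_state=3)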
Example #5
def save_whole_imputation(sess, X, h, a_bottleneck, pIn_holder, pHidden_holder,
                          input_matrix, gene_ids, cell_ids, p, m):
    '''Calculate and save imputation results for an input matrix in 'impute'
    mode. If the number of cells m exceeds the threshold p.large_size (e.g. 1e5),
    the results are written in chunks of p.sample_size rows to limit memory use.
    '''

    if m > p.large_size:
        #impute on small data blocks to avoid high memory cost
        n_out_batches = m // p.sample_size
        print('num_out_batches:', n_out_batches)
        handle2 = open('./{}/latent_code.{}.csv'.format(p.stage, p.stage), 'w')
        with open('./{}/imputation.{}.csv'.format(p.stage, p.stage),
                  'w') as handle:
            for i_ in range(n_out_batches + 1):
                start_idx = i_ * p.sample_size
                end_idx = min((i_ + 1) * p.sample_size, m)
                if start_idx >= end_idx:  # m divides evenly; no remainder chunk
                    break
                print('saving:', start_idx, end_idx)

                x_out_batch = input_matrix[start_idx:end_idx, :].todense()

                y_out_batch = sess.run(h,
                                       feed_dict={
                                           X: x_out_batch,
                                           pIn_holder: 1,
                                           pHidden_holder: 1
                                       })
                df_out_batch = pd.DataFrame(data=y_out_batch,
                                            columns=gene_ids,
                                            index=cell_ids[start_idx:end_idx])

                latent_code = sess.run(a_bottleneck,
                                       feed_dict={
                                           X: x_out_batch,
                                           pIn_holder: 1,
                                           pHidden_holder: 1
                                       })
                latent_code_df = pd.DataFrame(data=latent_code,
                                              index=cell_ids[start_idx:end_idx])

                if i_ == 0:
                    df_out_batch.to_csv(handle, float_format='%.6f')
                    latent_code_df.to_csv(handle2, float_format='%.6f')
                    print(
                        'RAM usage during mini-batch imputation and saving output: ',
                        '{} M'.format(usage()))
                else:
                    df_out_batch.to_csv(handle, header=False)
                    latent_code_df.to_csv(handle2, header=False)
        handle2.close()

    else:  # m, the number of cells, is at most p.large_size
        Y_input_arr = sess.run(h,
                               feed_dict={
                                   X: input_matrix.todense(),
                                   pIn_holder: 1,
                                   pHidden_holder: 1
                               })
        # save sample imputation
        Y_input_df = pd.DataFrame(data=Y_input_arr,
                                  columns=gene_ids,
                                  index=cell_ids)
        latent_code = sess.run(a_bottleneck,
                               feed_dict={
                                   X: input_matrix.todense(),
                                   pIn_holder: 1,
                                   pHidden_holder: 1
                               })
        latent_code_df = pd.DataFrame(data=latent_code, index=cell_ids)
        print('RAM usage during whole data imputation and saving output: ',
              '{} M'.format(usage()))
        scimpute.save_hd5(Y_input_df,
                          "{}/imputation.{}.hd5".format(p.stage, p.stage))
        scimpute.save_hd5(latent_code_df,
                          "{}/latent_code.{}.hd5".format(p.stage, p.stage))
Example #6
# read data (so that the resulting matrix is [gene, sample])
if matrix_type == 'cell_row':
    df = pd.read_hdf(file_name).transpose()
elif matrix_type == 'gene_row':
    df = pd.read_hdf(file_name)
else:
    raise Exception('matrix_type err: expected cell_row or gene_row')

# summary
print('input shape [genes, samples]:', df.shape, df.iloc[0:3, 0:2])

nz_rate_in = scimpute.nnzero_rate_df(df)
print('nz_rate_in: {}'.format(nz_rate_in))

# read list (tab-separated, despite the .csv suffix)
if list_name.endswith('.csv'):
    list_df = pd.read_csv(list_name, index_col=0, sep='\t', header=None)
elif list_name.endswith('.hd5'):
    list_df = scimpute.read_hd5(list_name)

print('list:', list_df.shape, list_df.index)

# filter
overlap = df.index.isin(list_df.index)
df_yes = df.loc[overlap]   # genes on the list
df_no = df.loc[~overlap]   # genes not on the list

print('matrix yes: ', df_yes.shape, df_yes.iloc[0:3, 0:2])
print('matrix no: ', df_no.shape, df_no.iloc[0:3, 0:2])

# output result dataframe
scimpute.save_hd5(df_yes, out_prefix + '_yes.hd5')
scimpute.save_hd5(df_no, out_prefix + '_no.hd5')
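
# Toy demonstration of the isin-based yes/no split above (hypothetical frames):
demo_df = pd.DataFrame({'s1': [1, 2, 3]}, index=['geneA', 'geneB', 'geneC'])
demo_list = pd.Index(['geneB', 'geneX'])
demo_mask = demo_df.index.isin(demo_list)
demo_yes = demo_df.loc[demo_mask]    # geneB
demo_no = demo_df.loc[~demo_mask]    # geneA, geneC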