raise Exception("error: the num of arguments not correct") else: print('running:') print(sys.argv) big_name = str(sys.argv[1]) # big.hd5 small_name = str(sys.argv[2]) # small.hd5 out_name = str(sys.argv[3]) # big.small.hd5 # read print('read big-df..') df_big = pd.read_hdf(big_name) nz_big = scimpute.nnzero_rate_df(df_big) print('nz_rate big-df: ', nz_big) print('read small-df..') df_small = scimpute.read_hd5(small_name) nz_small = scimpute.nnzero_rate_df(df_small) print('nz_rate small_df: ', nz_small) # Remove .x from ID # df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*','').astype(str) # print('because the index is different, remove the appendix') # print('big df after changing index', df_big.ix[0:5, 0:5]) print('df_big index is unique? {}'.format(df_big.index.is_unique)) print('df_small index is unique? {}'.format(df_small.index.is_unique)) # SELECT print('selecting..') df_selected = df_big.ix[df_small.index] # Check null, fill zeros
# Command-line handling.
# usage: data_RpmNorm_LogTrans.py <file.csv/hd5> <cell_row/gene_row> <out_name(x.rpm.log.hd5)>
print('cmd typed:', sys.argv)
if len(sys.argv) != 4:
    raise Exception('num args err')
file, matrix_mode, outname = (str(arg) for arg in sys.argv[1:4])

# The orientation flag is validated first, before the input file is touched.
if matrix_mode not in ('cell_row', 'gene_row'):
    raise Exception('cmd err in the argv[2]')

# Load the matrix with the reader that matches the file extension.
if file.endswith('.csv'):
    df = scimpute.read_csv(file)
elif file.endswith('.hd5'):
    df = scimpute.read_hd5(file)
else:
    raise Exception('file extension error: not hd5/csv')

# The rest of the script works on [gene, cell]; a cell-per-row input is
# transposed into that orientation.
if matrix_mode == 'cell_row':
    df = df.transpose()

# Summary of the loaded matrix.
nz_rate_df = scimpute.nnzero_rate_df(df)
print('df.shape, [gene, cell]:', df.shape)
# Read the expression matrix so that df is always [gene, sample]:
# a cell-per-row input is transposed, a gene-per-row input is used as is.
if matrix_type == 'cell_row':
    df = pd.read_hdf(file_name).transpose()
elif matrix_type == 'gene_row':
    df = pd.read_hdf(file_name)
else:
    # Fail here with a clear message instead of a NameError on `df` below.
    raise ValueError(
        "matrix_type must be 'cell_row' or 'gene_row', got {!r}".format(matrix_type))

# summary
# `.ix` was removed in pandas 1.0; the preview is a positional corner slice,
# so `.iloc` is the direct replacement.
print('input shape [genes, samples]:', df.shape, df.iloc[0:3, 0:2])
nz_rate_in = scimpute.nnzero_rate_df(df)
print('nz_rate_in: {}'.format(nz_rate_in))

# read list: only its index (the IDs to select) is used below
if list_name.endswith('.csv'):
    # A '.csv' list is parsed as tab-separated text without a header row;
    # the first column becomes the index.
    list_df = pd.read_csv(list_name, index_col=0, sep='\t', header=None)
elif list_name.endswith('.hd5'):
    list_df = scimpute.read_hd5(list_name)
else:
    # Fail here with a clear message instead of a NameError on `list_df` below.
    raise ValueError(
        "list file extension error: not csv/hd5: {!r}".format(list_name))
print('list:', list_df.shape, list_df.index)

# filter
# "yes" = rows named in the list, in list order. `reindex` keeps the old
# `.ix[labels]` behaviour: IDs absent from df come back as all-NaN rows.
# NOTE(review): `reindex` raises if df.index contains duplicate labels.
df_yes = df.reindex(list_df.index)
# "no" = every row of df whose ID is not in the list (boolean mask).
overlap = df.index.isin(list_df.index)
df_no = df.loc[~overlap]
print('matrix yes: ', df_yes.shape, df_yes.iloc[0:3, 0:2])
print('matrix no: ', df_no.shape, df_no.iloc[0:3, 0:2])

# output result dataframe
scimpute.save_hd5(df_yes, out_prefix + '_yes.hd5')
scimpute.save_hd5(df_no, out_prefix + '_no.hd5')