コード例 #1
0
    raise Exception("error: the num of arguments not correct")
else:
    print('running:')
    print(sys.argv)
    big_name = str(sys.argv[1])  # big.hd5
    small_name = str(sys.argv[2])  # small.hd5
    out_name = str(sys.argv[3])  # big.small.hd5

# Read both matrices and print the value scimpute.nnzero_rate_df reports
# for each of them.
print('read big-df..')
df_big = pd.read_hdf(big_name)
nz_big = scimpute.nnzero_rate_df(df_big)
print('nz_rate big-df: ', nz_big)

print('read small-df..')
df_small = scimpute.read_hd5(small_name)
nz_small = scimpute.nnzero_rate_df(df_small)
print('nz_rate small_df: ', nz_small)

# Optional step (disabled): remove the ".x" suffix from the big-df IDs when
# the two indexes do not use the same ID format.
# df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*', '', regex=True).astype(str)
# print('because the index is different, remove the appendix')
# print('big df after changing index', df_big.iloc[0:5, 0:5])

# Label-based selection below is only well defined when the indexes are
# unique, so report that before selecting.
print('df_big index is unique? {}'.format(df_big.index.is_unique))
print('df_small index is unique? {}'.format(df_small.index.is_unique))

# SELECT
print('selecting..')
# `.ix` was removed in pandas 1.0. `reindex` is the documented replacement
# for label selection that must tolerate labels missing from df_big: such
# rows come back filled with NaN instead of raising KeyError (which `.loc`
# would do), so the null check that follows still has something to fill.
df_selected = df_big.reindex(df_small.index)
# Check null, fill zeros
コード例 #2
0
# Command line handling: echo the usage string and the command as typed,
# then require exactly three positional arguments after the script name.
print('usage: <data_RpmNorm_LogTrans.py> <file.csv/hd5> <cell_row/gene_row> <out_name(x.rpm.log.hd5)>')
print('cmd typed:', sys.argv)
if len(sys.argv) != 4:
    raise Exception('num args err')

file, matrix_mode, outname = map(str, sys.argv[1:4])

# Load the matrix so that it ends up oriented as [gene, cell].
# The orientation flag is validated first; an unknown flag is reported
# regardless of the file extension.
if matrix_mode not in ('cell_row', 'gene_row'):
    raise Exception('cmd err in the argv[2]')

# Pick the reader from the file extension.
if file.endswith('.csv'):
    df = scimpute.read_csv(file)
elif file.endswith('.hd5'):
    df = scimpute.read_hd5(file)
else:
    raise Exception('file extension error: not hd5/csv')

# Input stored with cells as rows is flipped so that genes become the rows.
if matrix_mode == 'cell_row':
    df = df.transpose()

# Summary of the loaded matrix.
nz_rate_df = scimpute.nnzero_rate_df(df)
print('df.shape, [gene, cell]:', df.shape)
コード例 #3
0
# Read the matrix. 'cell_row' input is transposed while 'gene_row' input is
# used as stored, so rows are presumably genes afterwards (this matches the
# "[genes, samples]" label printed below) -- TODO confirm against the data.
if matrix_type == 'cell_row':
    df = pd.read_hdf(file_name).transpose()
elif matrix_type == 'gene_row':
    df = pd.read_hdf(file_name)
else:
    # Without this branch an unknown mode left `df` undefined and the script
    # failed later with an unrelated NameError.
    raise ValueError(
        "matrix_type must be 'cell_row' or 'gene_row', got {!r}".format(matrix_type))

# summary
# `.ix` was removed in pandas 1.0; `iloc` gives the positional top-left
# corner used here as a preview.
print('input shape [genes, samples]:', df.shape, df.iloc[0:3, 0:2])

nz_rate_in = scimpute.nnzero_rate_df(df)
print('nz_rate_in: {}'.format(nz_rate_in))

# Read the list; its index holds the row labels to select.
if list_name.endswith('.csv'):
    # Despite the .csv extension the file is parsed as tab-separated with no
    # header row; the first column becomes the index.
    list_df = pd.read_csv(list_name, index_col=0, sep='\t', header=None)
elif list_name.endswith('.hd5'):
    list_df = scimpute.read_hd5(list_name)
else:
    # Same reasoning as above: fail here rather than with a NameError below.
    raise ValueError(
        "list_name must end with '.csv' or '.hd5', got {!r}".format(list_name))

print('list:', list_df.shape, list_df.index)

# filter
# df_yes: rows labelled by the list, in list order. `reindex` keeps the old
# `.ix[labels]` behaviour of returning NaN rows for labels absent from df
# (`.loc` would raise KeyError instead).
df_yes = df.reindex(list_df.index)
# df_no: every row of df whose label is not in the list.
overlap = df.index.isin(list_df.index)
df_no = df.loc[~overlap]

print('matrix yes: ', df_yes.shape, df_yes.iloc[0:3, 0:2])
print('matrix no: ', df_no.shape, df_no.iloc[0:3, 0:2])

# output result dataframe
scimpute.save_hd5(df_yes, out_prefix + '_yes.hd5')
scimpute.save_hd5(df_no, out_prefix + '_no.hd5')