Beispiel #1
0
    print(mod)
    ti = time.time()  # start of this modality's load; used for the timings printed below

    is_mc = settings[mod].mod_category == 'mc'
    if is_mc:
        # 'mc' modalities: features come from a tab-separated table whose
        # column labels are compared against the metadata index below.
        f_mat = hvftrs_f.format(mod, 'tsv')
        hv_table = pd.read_csv(f_mat, sep='\t', header=0, index_col=0)
        gxc_hvftrs[mod] = hv_table
        print(hv_table.shape, time.time() - ti)
        # Cell names must be in the same order as metas (important if the
        # kNN matrix is saved).
        assert np.all(hv_table.columns.values == metas[mod].index.values)
    else:
        # Every other modality: matrix loaded from the npz file together with
        # its gene and cell files.
        f_mat = hvftrs_f.format(mod, 'npz')
        f_gene = hvftrs_gene.format(mod)
        f_cell = hvftrs_cell.format(mod)
        _gxc_tmp = snmcseq_utils.load_gc_matrix(f_gene, f_cell, f_mat)
        _gene, _cell, _mat = _gxc_tmp.gene, _gxc_tmp.cell, _gxc_tmp.data

        hv_matrix = GC_matrix(_gene, _cell, _mat)
        gxc_hvftrs[mod] = hv_matrix
        # Cell names must be in the same order as metas (important if the
        # kNN matrix is saved).
        assert np.all(hv_matrix.cell == metas[mod].index.values)
        print(hv_matrix.data.shape, time.time() - ti)

# Resolution values in ascending order, grouped by order of magnitude.
resolutions = [
    0.01, 0.02, 0.05,
    0.1, 0.2, 0.5,
    1, 2, 3, 4, 6, 8,
    12, 16, 20, 30, 40, 60, 80,
    100, 120,
]
# ns = [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]
    print(mod)

    ## read data
    # Metadata table for this modality, indexed by its first column.
    normalization_option = normalization_options[mod]
    f_meta = f_meta_format.format(SRC_DIR, mod)
    meta = pd.read_csv(f_meta, sep="\t", index_col=0)
    metas[mod] = meta

    # The counts matrix is addressed through three paths built from one
    # name pattern (matrix, gene list, cell list).
    f_data, f_data_gene, f_data_cell = [
        f_data_format.format(SRC_DIR, mod, '', suffix)
        for suffix in ('npz', 'gene', 'cell')
    ]

    # Read counts matrix.
    elapsed = time.time() - ti
    print(mod, "Reading in files {}".format(elapsed))
    # Per the original author's note, load_gc_matrix checks internally that
    # the dimensions agree.
    gxc_raw = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
    gxc_raws[mod] = gxc_raw

    # Sum of all matrix entries divided by the number of metadata rows.
    num_cells = len(meta)
    total_counts = gxc_raw.data.sum().sum()
    num_reads = total_counts / num_cells
    num_reads_all[mod] = num_reads

    print(gxc_raw.data.shape, num_cells, num_reads)

    # Metadata rows must line up with the matrix cells, in the same order.
    assert np.all(meta.index.values == gxc_raw.cell)
    # Gene identifiers must be unique.
    assert len(np.unique(gxc_raw.gene)) == len(gxc_raw.gene)

    print(mod, "Total time used: {}".format(time.time() - ti))
Beispiel #3
0
# Raw matrices keyed by modality, in the order of mods_selected.
gxc_raws = collections.OrderedDict()
for mod in mods_selected:
    logging.info("Read data {}...".format(mod))
    is_mc = settings[mod].mod_category == 'mc'

    # The gene and cell files follow the same naming for every modality.
    f_gene = raw_f.format(DATA_DIR, mod, '', 'gene')
    f_cell = raw_f.format(DATA_DIR, mod, '', 'cell')

    if not is_mc:
        # Single data matrix.
        f_data = raw_f.format(DATA_DIR, mod, '', 'npz')
        raw_matrix = snmcseq_utils.load_gc_matrix(f_gene, f_cell, f_data)
    else:
        # 'mc' modalities carry two matrices ('CH_' and 'mCH_' prefixes);
        # the loader takes the mCH file before the CH file.
        f_data_c = raw_f.format(DATA_DIR, mod, 'CH_', 'npz')
        f_data_mc = raw_f.format(DATA_DIR, mod, 'mCH_', 'npz')
        raw_matrix = snmcseq_utils.load_gc_matrix_methylation(
            f_gene, f_cell, f_data_mc, f_data_c)

    gxc_raws[mod] = raw_matrix

# In[13]:

first_round_cluster_col = 'cluster_joint_r0.1'
f = output_clst_and_umap

# Read the tab-separated table indexed by 'sample', then keep only the
# first-round cluster column and the modality column.
info_columns = [first_round_cluster_col, 'modality']
df_info = pd.read_csv(f, sep="\t", index_col='sample')
df_info = df_info[info_columns]

print(df_info.shape)
# Bare expression: its result is discarded outside an interactive session.
df_info.head()

# In[20]:

normalization_options = {
    'smarter_nuclei': 'TPM',
    'smarter_cells': 'TPM',