Esempio n. 1
0
def integrate_sketch(datasets_dimred,
                     integration_fn,
                     integration_fn_args=None,
                     sampling_fn=gs,
                     N=2000,
                     n_iter=1):
    """Integrate datasets on a sketch, then extend the result to all rows.

    Each dataset is subsampled to the union of ``sampling_fn`` and
    ``uniform`` samples, the sketches are passed to ``integration_fn``, and
    a per-row correction computed by ``transform`` is added to every
    original dataset.

    Args:
        datasets_dimred: Sequence of 2-D arrays, one per dataset. Each must
            support ``X[list_of_row_indices]`` and addition with the value
            returned by ``transform``.
        integration_fn: Callable invoked as
            ``integration_fn(list_of_sketches, **integration_fn_args)``;
            its result must be indexable and assignable by dataset
            position.
        integration_fn_args: Optional dict of keyword arguments forwarded to
            ``integration_fn``. ``None`` (the default) means no extra
            arguments.
        sampling_fn: Sampler invoked as ``sampling_fn(X, N, replace=False)``
            and expected to return an iterable of row indices into ``X``.
        N: Sample size requested from each of the two samplers.
        n_iter: Number of times ``integration_fn`` is invoked. Must be at
            least 1.

    Returns:
        The object returned by the last ``integration_fn`` call, with entry
        ``i`` replaced by ``datasets_dimred[i] + bias``.

    Raises:
        ValueError: If ``n_iter`` is less than 1.
    """
    if n_iter < 1:
        # Without at least one pass there is no integrated sketch to extend.
        raise ValueError('n_iter must be at least 1, got {}'.format(n_iter))
    if integration_fn_args is None:
        integration_fn_args = {}

    # Union of the configured sampler and a uniform sample, as sorted unique
    # row indices per dataset.
    sketch_idxs = [
        sorted(
            set(sampling_fn(X, N, replace=False))
            | set(uniform(X, N, replace=False)))
        for X in datasets_dimred
    ]
    datasets_sketch = [X[idx] for X, idx in zip(datasets_dimred, sketch_idxs)]

    # Every pass receives a fresh shallow copy of the same sketches; only the
    # result of the final pass is kept.
    for _ in range(n_iter):
        datasets_int = integration_fn(datasets_sketch[:],
                                      **integration_fn_args)

    for i, (X_dimred,
            X_sketch) in enumerate(zip(datasets_dimred, datasets_sketch)):
        X_int = datasets_int[i]

        # For each sketch row, find its 3 nearest rows in the full dataset.
        neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
        _, neigh_idx = neigh.kneighbors(X_sketch)

        # Flatten the (sketch row, neighbor) pairs row by row: ds_idxs holds
        # the full-dataset row and ref_idxs the matching sketch row.
        n_ref, n_neigh = neigh_idx.shape
        ds_idxs = list(neigh_idx.ravel())
        ref_idxs = [ref_idx for ref_idx in range(n_ref)
                    for _ in range(n_neigh)]

        # NOTE(review): the positional 15 is presumably transform's smoothing
        # bandwidth -- confirm against transform's signature.
        bias = transform(X_dimred,
                         X_int,
                         ds_idxs,
                         ref_idxs,
                         15,
                         batch_size=1000)

        datasets_int[i] = X_dimred + bias

    return datasets_int
Esempio n. 2
0
        NAMESPACE + '_uncorrected',
        [ str(ct) for ct in sorted(set(labels)) ],
        perplexity=100, n_iter=500, image_suffix='.png',
        viz_cluster=True
    )
    
    entropy_test(embedding, labels[idx])

    datasets, genes_list, n_cells = load_names(data_names)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)):
        log('Dimension reduction with {}...'.format(METHOD))
        X_dimred = reduce_dimensionality(
            normalize(X), method=METHOD, dimred=DIMRED
        )
        log('Dimensionality = {}'.format(X_dimred.shape[1]))
        np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred)
    else:
        X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE))

    from ample import gs
    samp_idx = gs(X_dimred, 1000, replace=False)
    save_sketch(X, samp_idx, genes, NAMESPACE + '1000')
    
    for scale in [ 10, 25, 100 ]:
        N = int(X.shape[0] / scale)
        samp_idx = gs(X_dimred, N, replace=False)
        save_sketch(X, samp_idx, genes, NAMESPACE + str(N))
Esempio n. 3
0
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)):
        log('Dimension reduction with {}...'.format(METHOD))
        X_dimred = reduce_dimensionality(normalize(X),
                                         method=METHOD,
                                         dimred=DIMRED)
        log('Dimensionality = {}'.format(X_dimred.shape[1]))
        np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred)
    else:
        X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(
            METHOD, NAMESPACE))

    from ample import gs, uniform
    samp_idx = gs(X_dimred, 20000, replace=False)
    #samp_idx = uniform(X_dimred, 20000, replace=False)

    #from anndata import AnnData
    #import scanpy.api as sc
    #adata = AnnData(X=X_dimred[samp_idx, :])
    #sc.pp.neighbors(adata, use_rep='X')
    #sc.tl.louvain(adata, resolution=1.5, key_added='louvain')
    #
    #louv_labels = np.array(adata.obs['louvain'].tolist())
    #le = LabelEncoder().fit(louv_labels)
    #cell_labels = le.transform(louv_labels)
    #
    #np.savetxt('data/cell_labels/mouse_brain_louvain.txt', cell_labels)

    cell_labels = (open('data/cell_labels/mouse_brain_louvain.txt').read().
Esempio n. 4
0
            cell_names += [ 'mcsf_day6' ] * a.shape[0]
        else:
            assert(False)
    le = LabelEncoder().fit(cell_names)
    cell_labels = le.transform(cell_names)
    
    write_table(X.toarray(), genes, 'data/pseudotime/' + NAMESPACE)

    # Per-cell metadata: a 'Label' header line, then one tab-separated
    # "<cell id>\t<cell name>" record per row of X.
    with open('data/pseudotime/mono_macro_meta.txt', 'w') as of:
        of.write('Label\n')
        for idx in range(X.shape[0]):
            # Terminate every record; without the newline all rows run
            # together on a single line.
            of.write('mono_macro{}\t{}\n'.format(idx, cell_names[idx]))
            
    from ample import gs, gs_gap, uniform

    gs_idx = gs(X_dimred, 110, replace=False)
    write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs')
    report_cluster_counts(cell_labels[gs_idx])

    # Metadata for the sketched cells only: a 'Label' header line, then one
    # record per sketched cell, renumbered from 0 in ascending order of the
    # cell's original row index.
    # NOTE(review): the '_gs' table is written in gs_idx order while this
    # file is in ascending index order -- confirm gs returns sorted indices.
    with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of:
        of.write('Label\n')
        # Walk the sorted unique sketch indices directly instead of testing
        # every row of X for membership in gs_idx.
        for i, idx in enumerate(sorted(set(gs_idx))):
            of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx]))
    
    uni_idx = uniform(X_dimred, 110, replace=False)
    write_table(X[uni_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_uni')
    report_cluster_counts(cell_labels[uni_idx])