def integrate_sketch(datasets_dimred, integration_fn, integration_fn_args=None,
                     sampling_fn=gs, N=2000, n_iter=1):
    """Integrate a sketch of each dataset, then extend the result to all rows.

    Each dataset is subsampled, ``integration_fn`` is run on the subsamples
    only, and a per-row correction for the full dataset is then obtained
    from ``transform`` using nearest-neighbour matches between the sketch
    and the full dataset.

    Args:
        datasets_dimred: Sequence of 2-D arrays, one per dataset. Each must
            support row indexing with a list of indices and addition with
            the value returned by ``transform``.
        integration_fn: Callable invoked as
            ``integration_fn(list_of_sketches, **integration_fn_args)``.
            Its return value must be indexable and assignable by dataset
            position (e.g. a list with one entry per dataset).
        integration_fn_args: Optional mapping of extra keyword arguments
            for ``integration_fn``. ``None`` means no extra arguments.
        sampling_fn: Sampler invoked as ``sampling_fn(X, N, replace=False)``;
            expected to return an iterable of row indices into ``X``.
        N: Sample size requested from each sampler for each dataset.
        n_iter: Number of times ``integration_fn`` is invoked; must be
            at least 1.

    Returns:
        The object returned by the last ``integration_fn`` call, with entry
        ``i`` replaced by ``datasets_dimred[i] + bias_i``.

    Raises:
        ValueError: If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        # With zero iterations there would be no integrated sketch to
        # propagate; fail early instead of hitting an unbound local later.
        raise ValueError('n_iter must be at least 1, got {}'.format(n_iter))

    extra_args = {} if integration_fn_args is None else integration_fn_args

    # The sketch of each dataset is the sorted union of the rows chosen by
    # the configured sampler and by the uniform sampler, so it can contain
    # up to 2 * N rows.
    sketch_idxs = [
        sorted(
            set(sampling_fn(X, N, replace=False)) |
            set(uniform(X, N, replace=False))
        )
        for X in datasets_dimred
    ]
    datasets_sketch = [
        X[idx] for X, idx in zip(datasets_dimred, sketch_idxs)
    ]

    # Every call receives a fresh shallow copy of the same sketch list and
    # only the result of the last call is kept.
    # NOTE(review): the loop extent is inferred from whitespace-collapsed
    # source; confirm that only this call belongs inside it.
    for _ in range(n_iter):
        datasets_int = integration_fn(datasets_sketch[:], **extra_args)

    for i, (X_dimred, X_sketch) in enumerate(
            zip(datasets_dimred, datasets_sketch)):
        X_int = datasets_int[i]

        # For every sketch row, find its 3 nearest rows in the full dataset.
        neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
        _, neigh_idx = neigh.kneighbors(X_sketch)

        # Flatten the matches into two parallel lists in row-major order:
        # ds_idxs holds full-dataset row indices, ref_idxs the matching
        # sketch row index.
        n_sketch, n_neigh = neigh_idx.shape
        ds_idxs = [
            neigh_idx[ref_idx, k_idx]
            for ref_idx in range(n_sketch)
            for k_idx in range(n_neigh)
        ]
        ref_idxs = [
            ref_idx
            for ref_idx in range(n_sketch)
            for _ in range(n_neigh)
        ]

        # NOTE(review): `transform` is defined outside this block; the
        # literal 15 is presumably its kernel-width (sigma) argument --
        # confirm against its signature.
        bias = transform(X_dimred, X_int, ds_idxs, ref_idxs, 15,
                         batch_size=1000)

        datasets_int[i] = X_dimred + bias

    return datasets_int
NAMESPACE + '_uncorrected', [ str(ct) for ct in sorted(set(labels)) ], perplexity=100, n_iter=500, image_suffix='.png', viz_cluster=True ) entropy_test(embedding, labels[idx]) datasets, genes_list, n_cells = load_names(data_names) datasets, genes = merge_datasets(datasets, genes_list) X = vstack(datasets) if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)): log('Dimension reduction with {}...'.format(METHOD)) X_dimred = reduce_dimensionality( normalize(X), method=METHOD, dimred=DIMRED ) log('Dimensionality = {}'.format(X_dimred.shape[1])) np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred) else: X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)) from ample import gs samp_idx = gs(X_dimred, 1000, replace=False) save_sketch(X, samp_idx, genes, NAMESPACE + '1000') for scale in [ 10, 25, 100 ]: N = int(X.shape[0] / scale) samp_idx = gs(X_dimred, N, replace=False) save_sketch(X, samp_idx, genes, NAMESPACE + str(N))
datasets, genes = merge_datasets(datasets, genes_list) X = vstack(datasets) if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)): log('Dimension reduction with {}...'.format(METHOD)) X_dimred = reduce_dimensionality(normalize(X), method=METHOD, dimred=DIMRED) log('Dimensionality = {}'.format(X_dimred.shape[1])) np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred) else: X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format( METHOD, NAMESPACE)) from ample import gs, uniform samp_idx = gs(X_dimred, 20000, replace=False) #samp_idx = uniform(X_dimred, 20000, replace=False) #from anndata import AnnData #import scanpy.api as sc #adata = AnnData(X=X_dimred[samp_idx, :]) #sc.pp.neighbors(adata, use_rep='X') #sc.tl.louvain(adata, resolution=1.5, key_added='louvain') # #louv_labels = np.array(adata.obs['louvain'].tolist()) #le = LabelEncoder().fit(louv_labels) #cell_labels = le.transform(louv_labels) # #np.savetxt('data/cell_labels/mouse_brain_louvain.txt', cell_labels) cell_labels = (open('data/cell_labels/mouse_brain_louvain.txt').read().
cell_names += [ 'mcsf_day6' ] * a.shape[0] else: assert(False) le = LabelEncoder().fit(cell_names) cell_labels = le.transform(cell_names) write_table(X.toarray(), genes, 'data/pseudotime/' + NAMESPACE) with open('data/pseudotime/mono_macro_meta.txt', 'w') as of: of.write('Label\n') for idx in range(X.shape[0]): of.write('mono_macro{}\t{}'.format(idx, cell_names[idx])) from ample import gs, gs_gap, uniform gs_idx = gs(X_dimred, 110, replace=False) write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs') report_cluster_counts(cell_labels[gs_idx]) with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of: of.write('Label\n') i = 0 for idx in range(X.shape[0]): if idx not in gs_idx: continue of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx])) i += 1 uni_idx = uniform(X_dimred, 110, replace=False) write_table(X[uni_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_uni') report_cluster_counts(cell_labels[uni_idx])