def get_sketch(adata, key, folds=10, how='pd', min_num_per_key=500, start='filter'):
    """Geometric sketching of every group in ``adata.obs[key]``.

    For each distinct value of ``adata.obs[key]`` the matching cells are
    subset, ``sc.pp.filter_genes_dispersion``, ``sc.pp.pca``,
    ``sc.pp.neighbors`` and ``sc.tl.diffmap`` are run on the subset, and
    ``gs`` draws a sketch (with replacement) from the requested embedding(s).

    Parameters
    ----------
    adata
        Annotated data matrix providing ``obs[key]``.
    key
        Column of ``adata.obs`` whose distinct values define the groups.
    folds
        Subsampling factor: a group of ``n`` cells asks for ``int(n / folds)``
        draws, unless the minimum below is larger.
    how
        ``'pd'``: union of a diffusion-map sketch and a PCA sketch (first 50
        PCs); ``'p'``: PCA sketch only; ``'d'``: diffusion-map sketch only
        (first 20 components).
    min_num_per_key
        Minimum number of draws per group, capped at the group size.
    start
        ``'filter'`` subsets through ``get_subset(adata, mask)``; any other
        value uses ``adata[mask]``.

    Returns
    -------
    list
        Observation names selected across all groups.

    Raises
    ------
    SystemError
        If ``how`` is not one of ``'pd'``, ``'p'`` or ``'d'``.
    """
    # Reject a bad mode before doing any of the expensive per-group work.
    if how not in ('pd', 'p', 'd'):
        raise SystemError(
            "unknown sketch mode how={!r}; expected 'pd', 'p' or 'd'".format(how))

    sketch_index = []
    for smp in set(adata.obs[key]):
        print(smp)
        c = adata.obs[key] == smp
        if start == 'filter':
            sdata = get_subset(adata, c)
        else:
            sdata = adata[c]
        sc.pp.filter_genes_dispersion(sdata)
        sc.pp.pca(sdata)
        sc.pp.neighbors(sdata)
        sc.tl.diffmap(sdata)

        # `np.int` no longer exists in current NumPy; the builtin truncates
        # the same way.
        n_cells = int(np.sum(c))
        N = max(int(n_cells / folds), min(min_num_per_key, n_cells))
        print(N)

        def draw(embedding):
            # Names of the cells picked by one sketch of `embedding`.
            return set(sdata.obs_names[gs(embedding, N, replace=True)])

        if how == 'pd':
            set1 = draw(sdata.obsm['X_diffmap'])
            set2 = draw(sdata.obsm['X_pca'][:, :50])
            sketch_index.extend(list(set1.union(set2)))
        elif how == 'p':
            sketch_index.extend(list(draw(sdata.obsm['X_pca'][:, :50])))
        else:  # how == 'd'
            sketch_index.extend(list(draw(sdata.obsm['X_diffmap'][:, :20])))
    return sketch_index
def runGeoSketch(adata, N=10000, use_rep="X_pca"):
    """Return a geometric sketch of ``adata`` drawn without replacement.

    ``sc.tl.pca`` is run on ``adata`` first, then ``gs`` samples from
    ``adata.obsm[use_rep]``.  The selected observation names are stored in
    ``adata.uns['geosketch']`` and used to subset ``adata``.

    Parameters
    ----------
    adata
        Annotated data matrix; modified in place (PCA results and
        ``uns['geosketch']``).
    N
        Requested sketch size.  Values larger than the number of observations
        are reduced to that number, since a sketch without replacement cannot
        contain more cells than exist.
    use_rep
        Key of ``adata.obsm`` holding the embedding to sketch.

    Returns
    -------
    The subset of ``adata`` restricted to the sketched observations.
    """
    from geosketch import gs

    sc.tl.pca(adata)
    n_draw = min(N, adata.shape[0])
    sketch_index = gs(adata.obsm[use_rep], n_draw, replace=False)
    sketched_names = adata.obs.index[sketch_index]
    adata.uns['geosketch'] = sketched_names
    subdata = adata[sketched_names]
    return subdata
def subsample(self, counts: pd.DataFrame) -> pd.DataFrame:
    """Subsample the columns of ``counts`` with geometric sketching.

    ``counts`` is transposed so that its columns become rows, optionally
    ``log1p``-transformed (``self.log``), reduced with ``pca`` to
    ``self.num_pc`` components and sketched with ``gs`` without replacement.

    Parameters
    ----------
    counts
        Input table; its columns are the units being subsampled.

    Returns
    -------
    pd.DataFrame
        ``counts`` restricted to the sketched columns, or ``counts`` unchanged
        when any step of the subsampling fails (best effort: the failure is
        logged as a warning, with details when ``self.verbose`` is set).
    """
    n_columns = counts.shape[1]

    # Derive the default target size per call instead of storing it on the
    # instance, so a reused object does not keep the size computed for the
    # first table it saw.
    num_cells = self.num_cells
    if num_cells is None:
        num_cells = int(n_columns / 3)

    core_logger.info('Subsampling {} to {}'.format(n_columns, num_cells))

    counts_t = counts.T
    pca_input = np.log1p(counts_t) if self.log else counts_t

    try:
        u, s, vt = pca(pca_input.values, k=self.num_pc)
        x_dimred = u[:, :self.num_pc] * s[:self.num_pc]
        sketch_index = gs(x_dimred, num_cells, replace=False)
        x_matrix = counts_t.iloc[sketch_index]
    except Exception as e:
        # Deliberate best effort: fall back to the full table.
        core_logger.warning('Subsampling failed: ignored.')
        if self.verbose:
            core_logger.warning(str(e))
        return counts

    core_logger.info('Done subsampling {} to {}'.format(n_columns, num_cells))
    return x_matrix.T
def fit(self, X, y):
    """Fit a ``GPRegressor`` on at most ``self.n_inducing_`` training points.

    When ``X`` has more rows than ``self.n_inducing_`` the training set is
    reduced first, according to ``self.method_``:

    * ``'uniform'``   - rows drawn uniformly at random without replacement;
    * ``'geosketch'`` - rows picked by ``gs`` on a PCA projection of ``X``
      (up to 100 components).

    Otherwise the full data is used.  The fitted regressor is stored in
    ``self.gpr_``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If subsampling is needed and ``self.method_`` is not recognised.
    """
    if X.shape[0] > self.n_inducing_:
        if self.method_ == 'uniform':
            uni_idx = np.random.choice(X.shape[0], self.n_inducing_,
                                       replace=False)
            X_sketch = X[uni_idx]
            y_sketch = y[uni_idx]

        elif self.method_ == 'geosketch':
            from fbpca import pca
            from geosketch import gs

            # The PCA rank cannot exceed the smaller matrix dimension.
            k = min(100, min(X.shape))
            U, s, _ = pca(X, k=k)
            X_dimred = U[:, :k] * s[:k]
            gs_idx = gs(X_dimred, self.n_inducing_, replace=False)
            X_sketch = X[gs_idx]
            y_sketch = y[gs_idx]

        else:
            # Previously fell through and failed later on an unbound name.
            raise ValueError(
                'Unknown subsampling method {!r}; expected "uniform" or '
                '"geosketch"'.format(self.method_))
    else:
        X_sketch, y_sketch = X, y

    self.gpr_ = GPRegressor(
        n_restarts=self.n_restarts_,
        kernel=self.kernel_,
        backend=self.backend_,
        batch_size=self.batch_size_,
        n_jobs=self.n_jobs_,
        verbose=self.verbose_,
    ).fit(X_sketch, y_sketch)

    return self
def train_clf_helper(X_tr, y_tr, X_te, y_te):
    """Train a multinomial logistic regression and report test performance.

    If the training set has more rows than ``train_size`` it is first reduced
    to ``train_size`` rows with ``gs`` (no replacement).  The textual
    classification report is printed, followed by a blank line.

    Returns
    -------
    dict
        ``classification_report(..., output_dict=True)`` for the test split.
    """
    n_train = X_tr.shape[0]
    if n_train > train_size:
        keep = gs(X_tr, train_size, replace=False)
        X_tr, y_tr = X_tr[keep], y_tr[keep]

    model = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)

    print(classification_report(y_te, predictions))
    print()

    report = classification_report(y_te, predictions, output_dict=True)
    return report
def get_expression(adata, use_raw=True, use_geosketch=False, geosketch_N=10000, geosketch_transform=None):
    """Grab expression and put it into a pandas dataframe.

    Parameters
    ----------
    adata
        Annotated data matrix.
    use_raw
        Read the matrix, observation names and variable names from
        ``adata.raw`` instead of ``adata``.
    use_geosketch
        Restrict the output to a geometric sketch of the observations.
    geosketch_N
        Number of observations to keep when sketching.
    geosketch_transform
        Name of the embedding to sketch on; ``adata.obsm["X_" + name]`` is
        used.  When ``None``, ``sc.tl.umap(adata, n_components=2)`` is run and
        ``"umap"`` is used.

    Returns
    -------
    pandas.DataFrame
        Expression values with variables as rows and observations as columns.
    """
    ad = adata.raw if use_raw else adata

    X = ad.X
    obs_names = ad.obs_names

    if use_geosketch:
        if geosketch_transform is None:
            sc.tl.umap(adata, n_components=2)
            geosketch_transform = "umap"
        sketch_index = gs(adata.obsm["X_" + geosketch_transform],
                          geosketch_N, replace=False)
        # `gs` yields row positions, not observation names, so select
        # positionally -- and do it before densifying so only the sketched
        # rows are ever expanded.
        X = X[sketch_index]
        obs_names = obs_names[sketch_index]

    if isinstance(X, csr_matrix):
        X = X.toarray()

    df = pd.DataFrame(X, index=obs_names, columns=ad.var_names)
    return df.transpose()
def sketch(self, X):
    """
    Actually sketches the dataset and saves nearest neighbor mappings
    from sketch elements to sample observations in full dataset in
    the `self.sketch_neighbors` variable.

    Parameters
    ----------
    X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
        Dataset to be sketched.

    Returns
    -------
    X_sketch
        Sketched version of dataset `X`.  When `self.sketch_method` is
        neither 'geometric' nor 'uniform', `X` itself is returned and
        `self.sketch_neighbors` is left untouched.
    """
    n_samples = X.shape[0]

    if self.verbose:
        tprint('Sketching...')

    if self.sketch_method == 'geometric':
        from geosketch import gs
        sketch_idx = gs(X, self.sketch_size, replace=False)
    elif self.sketch_method == 'uniform':
        sketch_idx = sorted(
            np.random.choice(n_samples, size=self.sketch_size, replace=False))
    else:
        return X

    X_sketch = X[sketch_idx]
    self.sketch_neighbors = nearest_approx(X, X_sketch)

    # Reuse the rows already extracted instead of indexing `X` a second time.
    return X_sketch
def multi_sketch(dimRed, fractions, clusters):
    """
    Do geometric sketches of the data given in dimRed, one per fraction.

    Parameters
    ----------
    dimRed
        Matrix to sketch; one row per cell.
    fractions
        Fractions of the total number of cells to sample, one sketch each.
    clusters
        Per-cell cluster labels (indexed positionally through ``.iloc``).
        Labels must be convertible with ``int(float(label))``.

    Returns
    -------
    sketch_df : pandas.DataFrame
        Cells per cluster: one ``pct<percent>`` column per fraction plus a
        ``full`` column; rows are the cluster labels written as integer
        strings.  Clusters absent from a sketch stay NaN in that column.
    sketch_index_by_percentage : dict
        Sketch indices returned by ``gs`` for each ``pct<percent>`` column.
    sketch_N : list
        Number of cells requested for each fraction.
    """
    def label_of(raw_label):
        # Single normalisation shared by the row index and every count
        # lookup, so raw labels such as 1, "1" or "1.0" hit the same row.
        return str(int(float(raw_label)))

    total_cells = dimRed.shape[0]

    # Round away floating-point noise before truncating: 0.29 * 100 is
    # 28.999999999999996 and would otherwise be named "pct28".
    percentages = ["pct" + str(int(round(fraction * 100, 8)))
                   for fraction in fractions]

    all_counts = Counter(label_of(raw) for raw in clusters)
    cluster_names = [str(i) for i in sorted(int(name) for name in all_counts)]

    sketch_df = pd.DataFrame(columns=percentages + ['full'],
                             index=cluster_names)
    for label, count in all_counts.items():
        sketch_df.loc[label, 'full'] = count

    sketch_index_by_percentage = {}
    sketch_N = []
    for column, fraction in zip(percentages, fractions):
        N = ceil(total_cells * fraction)
        print("total number of cells: ", total_cells, "; fraction: ", fraction, "; fraction # cells: ", N)
        sketch_N.append(N)
        this_sketch_index = gs(dimRed, N, replace=False)
        sketch_index_by_percentage[column] = this_sketch_index
        subset_counts = Counter(
            label_of(raw) for raw in clusters.iloc[this_sketch_index])
        for label, count in subset_counts.items():
            sketch_df.loc[label, column] = count

    return sketch_df, sketch_index_by_percentage, sketch_N
datasets, genes = merge_datasets(datasets, genes_list) X = vstack(datasets) if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)): log('Dimension reduction with {}...'.format(METHOD)) X_dimred = reduce_dimensionality(normalize(X), method=METHOD, dimred=DIMRED) log('Dimensionality = {}'.format(X_dimred.shape[1])) np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred) else: X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format( METHOD, NAMESPACE)) from geosketch import gs, uniform samp_idx = gs(X_dimred, 20000, replace=False) #samp_idx = uniform(X_dimred, 20000, replace=False) #from anndata import AnnData #import scanpy.api as sc #adata = AnnData(X=X_dimred[samp_idx, :]) #sc.pp.neighbors(adata, use_rep='X') #sc.tl.louvain(adata, resolution=1.5, key_added='louvain') # #louv_labels = np.array(adata.obs['louvain'].tolist()) #le = LabelEncoder().fit(louv_labels) #cell_labels = le.transform(louv_labels) # #np.savetxt('data/cell_labels/mouse_brain_louvain.txt', cell_labels) cell_labels = (open('data/cell_labels/mouse_brain_louvain.txt').read().
def geosketch_sample_dimred(X, n, n_pcs=100):
    """Return a geometric sketch of ``X`` in PCA space.

    Parameters
    ----------
    X
        Data matrix, one row per sample.
    n
        Number of rows to sample (without replacement).
    n_pcs
        Number of principal components to sketch on (default 100, the
        previously hard-coded value).  Reduced to the smaller dimension of
        ``X`` when ``X`` is too small to support that many components.

    Returns
    -------
    The rows of the PCA-reduced matrix selected by ``gs``.
    """
    k = min(n_pcs, min(X.shape))
    U, s, _ = pca(X, k=k)
    X_dimred = U[:, :k] * s[:k]
    sketch_index = gs(X_dimred, n, replace=False)
    return X_dimred[sketch_index]
def correlate_tf_motifs(
    adata: AnnData,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
    indirect: Optional[bool] = True,
) -> None:
    """Correlate inferred motif activity with TF expression.

    Reads ``adata.raw``, ``adata.obsm["X_cell_types"]``,
    ``adata.uns["scepia"]["motif_activity"]`` and
    ``adata.obs["cluster_annotation"]`` (plus ``adata.obsm["X_pca"]`` when a
    sketch is made).  The resulting table, indexed by factor, is stored in
    ``adata.uns["scepia"]["correlation"]``; nothing is returned.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    n_sketch : `int`, optional (default: 2500)
        If the number of cells is higher than `n_sketch`, use geometric
        sketching (Hie et al. 2019) to select a subset of `n_sketch` cells.
        This subset will be used to calculate the correlation between motif
        activity and transcription factor expression.
    n_permutations : `int`, optional (default: 100000)
        Number of permutations that is used to calculate the p-value. Can be
        decreased for quicker run-time, but should probably not be below
        10000.
    indirect : `bool`, optional (default: True)
        Include indirect TF to motif assignments.
    """
    logger.info("correlating motif activity with factors")
    if indirect:
        logger.info("including indirect and/or predicted factors")

    # Get all TFs from motif database.  The caller's `indirect` choice is
    # forwarded; it used to be ignored in favour of a hard-coded True.
    m2f = motif_mapping(indirect=indirect)
    batch_size = m2f.shape[0]
    f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(),
                        index=m2f.index).stack()
    f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]]
    f2m2.columns = ["motif", "factor"]
    unique_factors = f2m2["factor"].unique()

    if n_sketch is None or n_sketch > adata.shape[0]:
        logger.info("using all cells")
        my_adata = adata
    else:
        logger.info(f"creating sketch of {n_sketch} cells")
        idx = geosketch.gs(adata.obsm["X_pca"], n_sketch)
        # Subset first, then copy: only the sketched cells are duplicated.
        my_adata = adata[idx].copy()

    detected = (my_adata.raw.var_names.str.upper().isin(unique_factors)) & (
        (my_adata.raw.X > 0).sum(0) > 3)
    detected = np.squeeze(np.asarray(detected))
    unique_factors = my_adata.raw.var_names[detected].str.upper()

    # Get the expression for all TFs
    expression = (np.squeeze(np.asarray(my_adata.raw.X.todense()))
                  if issparse(my_adata.raw.X) else my_adata.raw.X)
    expression = expression.T[detected]

    activity = my_adata.uns["scepia"]["motif_activity"]

    logger.info(
        f"calculating correlation of motif activity with {len(unique_factors)} factors"
    )
    real = fast_corr(
        expression,
        (my_adata.obsm["X_cell_types"] @ activity.T).T.values,
    )
    real = pd.DataFrame(
        real,
        index=unique_factors,
        columns=activity.index,
    )

    tmp = (real.reset_index().melt(
        id_vars="index", var_name="motif",
        value_name="correlation").rename(columns={
            "index": "factor"
        }).set_index(["motif", "factor"]))
    f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna()
    f2m2["abs_correlation"] = f2m2["correlation"].abs()

    logger.info(f"calculating {n_permutations} permutations")
    permute_result = pd.DataFrame(index=unique_factors)
    shape = activity.shape
    for i in tqdm(range(0, n_permutations, batch_size)):
        # Stack shuffled activity matrices until one batch is filled.
        random_activities = None
        while (random_activities is None
               or random_activities.shape[0] < batch_size):
            x = activity.values.flatten()
            motif_activity = shuffle(x).reshape(shape[1], shape[0])
            cell_motif_activity = (
                my_adata.obsm["X_cell_types"] @ motif_activity).T
            if random_activities is None:
                random_activities = cell_motif_activity
            else:
                random_activities = np.vstack(
                    (random_activities, cell_motif_activity))

        random_activities = random_activities[:batch_size]
        batch_result = fast_corr(expression, random_activities)
        batch_result = pd.DataFrame(batch_result,
                                    index=unique_factors,
                                    columns=range(i, i + batch_size))
        permute_result = permute_result.join(batch_result)

    logger.info("calculating permutation-based p-values (all)")

    # Calculate p-value of correlation relative to all permuted correlations
    permuted_corrs = permute_result.values.flatten()
    pvals = [(100 - percentileofscore(permuted_corrs, corr)) / 100
             for corr in f2m2["correlation"]]
    f2m2["pval"] = pvals
    # Negative correlations are tested against the lower tail.
    f2m2.loc[f2m2["correlation"] < 0,
             "pval"] = (1 - f2m2.loc[f2m2["correlation"] < 0, "pval"])
    logger.info("calculating permutation-based p-values (factor-specific)")

    # Calculate p-value of correlation relative to permuted values of this
    # factor
    for motif, factor in tqdm(f2m2.index):
        pval = (100 - percentileofscore(permute_result.loc[factor],
                                        real.loc[factor, motif])) / 100
        pval = 1 - pval if real.loc[factor, motif] < 0 else pval
        # Avoid a p-value of exactly zero: use one over the number of
        # permutation columns instead.
        pval = 1 / permute_result.shape[1] if pval == 0 else pval
        f2m2.loc[(motif, factor), "permutation_pval"] = pval
        f2m2.loc[(motif, factor), "combined"] = combine_pvalues(
            f2m2.loc[(motif, factor), ["pval", "permutation_pval"]])[1]

    f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1]
    f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"])

    # The column is named "motif_stddev" but holds the max-minus-min range of
    # motif activity over the annotated cluster cell types.
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    f2m2 = f2m2.join(
        (adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) -
         adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1)
         ).to_frame("motif_stddev").rename_axis("motif"))
    f2m2 = f2m2.reset_index().set_index("factor")

    adata.uns["scepia"]["correlation"] = f2m2
def fit(self, X, y):
    """Fit a two-hidden-layer Bayesian neural network with KL(q||p) inference.

    When ``self.sketch_size_`` is set and smaller than the number of rows,
    the training set is first reduced to that many rows with ``gs`` (on a
    100-component PCA projection when ``X`` is large enough to support one,
    otherwise on ``X`` itself).

    Standard-normal priors are placed on all weights and biases, each with a
    Normal variational approximation whose scale is passed through softplus.
    The approximations are stored in ``self.model_`` and the Edward session
    in ``self.sess_``.

    Parameters
    ----------
    X
        Training inputs, shape ``(n_samples, n_features)``.
    y
        Training targets; must be 2-D since ``y.shape[1]`` sets the output
        width.

    Returns
    -------
    self
    """
    # Sketching the full set (or more) is a no-op at best and an error in
    # `gs` at worst, so only sketch when it actually reduces the data.
    if self.sketch_size_ is not None and self.sketch_size_ < X.shape[0]:
        from fbpca import pca
        from geosketch import gs

        # A rank-100 PCA needs at least 100 rows and more than 100 columns
        # to be worthwhile and valid.
        if X.shape[1] > 100 and X.shape[0] >= 100:
            U, s, _ = pca(X, k=100)
            X_dimred = U * s
        else:
            X_dimred = X
        gs_idx = gs(X_dimred, self.sketch_size_, replace=False)
        X = X[gs_idx]
        y = y[gs_idx]

    n_samples, n_features = X.shape

    if self.verbose_:
        tprint(
            'Fitting Bayesian NN on {} data points with dimension {}...'.
            format(*X.shape))

    X = X.astype(np.float32)  # Edward uses float32.

    n_outputs = y.shape[1]

    # Bayesian weights.
    W_0_shape = [n_features, self.n_hidden1_]
    W_0 = Normal(loc=tf.zeros(W_0_shape), scale=tf.ones(W_0_shape))
    W_1_shape = [self.n_hidden1_, self.n_hidden2_]
    W_1 = Normal(loc=tf.zeros(W_1_shape), scale=tf.ones(W_1_shape))
    W_2_shape = [self.n_hidden2_, n_outputs]
    W_2 = Normal(loc=tf.zeros(W_2_shape), scale=tf.ones(W_2_shape))

    # Bayesian biases.
    b_0 = Normal(loc=tf.zeros(self.n_hidden1_),
                 scale=tf.ones(self.n_hidden1_))
    b_1 = Normal(loc=tf.zeros(self.n_hidden2_),
                 scale=tf.ones(self.n_hidden2_))
    b_2 = Normal(loc=tf.zeros(n_outputs), scale=tf.ones(n_outputs))

    # Approximating distributions for KL divergence
    # variational inference.
    # NOTE(review): variables are created under fixed names; presumably a
    # second `fit` in the same graph would clash -- confirm with the caller.
    qW_0 = Normal(loc=tf.get_variable("qW_0/loc", W_0_shape),
                  scale=tf.nn.softplus(
                      tf.get_variable("qW_0/scale", W_0_shape)))
    qW_1 = Normal(loc=tf.get_variable("qW_1/loc", W_1_shape),
                  scale=tf.nn.softplus(
                      tf.get_variable("qW_1/scale", W_1_shape)))
    qW_2 = Normal(loc=tf.get_variable("qW_2/loc", W_2_shape),
                  scale=tf.nn.softplus(
                      tf.get_variable("qW_2/scale", W_2_shape)))
    qb_0 = Normal(loc=tf.get_variable("qb_0/loc", [self.n_hidden1_]),
                  scale=tf.nn.softplus(
                      tf.get_variable("qb_0/scale", [self.n_hidden1_])))
    qb_1 = Normal(loc=tf.get_variable("qb_1/loc", [self.n_hidden2_]),
                  scale=tf.nn.softplus(
                      tf.get_variable("qb_1/scale", [self.n_hidden2_])))
    qb_2 = Normal(loc=tf.get_variable("qb_2/loc", [n_outputs]),
                  scale=tf.nn.softplus(
                      tf.get_variable("qb_2/scale", [n_outputs])))

    # Fit model.  The likelihood is built on the placeholder that `data`
    # feeds below, rather than on `X` embedded as a graph constant, so the
    # placeholder is actually part of the model.
    X_variational = tf.placeholder(tf.float32, [n_samples, n_features],
                                   name='X')
    y_variational = Normal(loc=neural_network(X_variational, W_0, W_1, W_2,
                                              b_0, b_1, b_2),
                           scale=tf.ones((n_samples, n_outputs)))
    inference = ed.KLqp(
        {
            W_0: qW_0,
            b_0: qb_0,
            W_1: qW_1,
            b_1: qb_1,
            W_2: qW_2,
            b_2: qb_2,
        },
        data={
            X_variational: X,
            y_variational: y
        })
    self.sess_ = ed.get_session()
    tf.global_variables_initializer().run()
    inference.run(n_iter=self.n_iter_, n_samples=10)

    self.model_ = {
        'qW_0': qW_0,
        'qb_0': qb_0,
        'qW_1': qW_1,
        'qb_1': qb_1,
        'qW_2': qW_2,
        'qb_2': qb_2,
    }

    if self.verbose_:
        tprint('Done fitting Bayesian NN model.')

    return self
NAMESPACE, n_seeds=4, #cell_labels=cell_labels, #cell_exp_ratio=True, #louvain_ami=True, #rare=True, #rare_label=le.transform(['Macrophage'])[0], max_min_dist=True, ) exit() report_cluster_counts(labels) from differential_entropies import differential_entropies differential_entropies(X_dimred, labels) plot_rare(X_dimred, cell_labels, le.transform(['Macrophage'])[0], NAMESPACE, n_seeds=4) from geosketch import gs samp_idx = gs(X_dimred, 1000, replace=False) save_sketch(X, samp_idx, genes, NAMESPACE + '1000') for scale in [10, 25, 100]: N = int(X.shape[0] / scale) samp_idx = gs(X_dimred, N, replace=False) save_sketch(X, samp_idx, genes, NAMESPACE + str(N))
cell_names += ['mcsf_day6'] * a.shape[0] else: assert (False) le = LabelEncoder().fit(cell_names) cell_labels = le.transform(cell_names) write_table(X.toarray(), genes, 'data/pseudotime/' + NAMESPACE) with open('data/pseudotime/mono_macro_meta.txt', 'w') as of: of.write('Label\n') for idx in range(X.shape[0]): of.write('mono_macro{}\t{}'.format(idx, cell_names[idx])) from geosketch import gs, gs_gap, uniform gs_idx = gs(X_dimred, 110, replace=False) write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs') report_cluster_counts(cell_labels[gs_idx]) with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of: of.write('Label\n') i = 0 for idx in range(X.shape[0]): if idx not in gs_idx: continue of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx])) i += 1 uni_idx = uniform(X_dimred, 110, replace=False) write_table(X[uni_idx, :].toarray(), genes,
# remove unnecessary fields; `mat` itself is left untouched
matt = mat.drop(columns=["batch", "type", "cluster", "tissue.cancer"])
matt = matt.set_index('dataset')
mattm = matt.values

# compute the PCs - necessary input for the sketching
U, s, Vt = pca(mattm, k=100)
X_dimred = U[:, :100] * s[:100]

# sketch
N = int(N_samples * sk_sz)  # Number of samples to obtain from the dataset
sketch_index = gs(X_dimred, N, replace=False)
X_sketch = X_dimred[sketch_index]

# get the samples selected in the sketch and output
reduced = pd.DataFrame(X_sketch)
pca_out = pd.DataFrame(X_dimred)
pca_out["dataset"] = list(matt.index)
# `sketch_index` already holds the row positions of the sketched samples, so
# look them up directly (in dataset order) instead of re-matching rows by an
# inner merge on every float PC column, which is costly and over-matches when
# two samples share identical coordinates.
red_with_labs = pca_out.iloc[sorted(sketch_index)].reset_index(drop=True)
selected = list(red_with_labs["dataset"])
out = open(
    currdir + "../results/sketches/" + can.lower().replace(" ", "-") +
    "-sketch.txt", "w")