def test_umap_transform_embedding_stability(iris, iris_selection):
    """Test that transforming data does not alter the learned embeddings

    Issue #217 describes how using transform to embed new data using a
    trained UMAP transformer causes the fitting embedding matrix to change
    in cases when the new data has the same number of rows as the original
    training data.
    """
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    saved_embedding = model.embedding_.copy()

    # The trigger from issue #217: new data with exactly as many rows as
    # the original training data.
    fresh = np.random.random(train.shape)
    _ = model.transform(fresh)

    assert_array_equal(
        saved_embedding,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Direct reproduction of the example given in issue #217.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))
    umap = UMAP()
    u1 = umap.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, umap.embedding_)
    _ = umap.transform(b)
    assert_array_equal(u1_orig, umap.embedding_)
def test_densmap_trustworthiness_on_iris(iris):
    """Fit densMAP on iris, require a trustworthy embedding, and check that
    the unsupported operations (transform, inverse_transform, supervised
    fit) raise the expected errors.
    """
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data)
    embedding = densmap_iris_model.embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    # Fixed message: the adjacent string literals previously concatenated
    # without a space, producing "...embedding foriris dataset...".
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for "
        "iris dataset: {}".format(trust)
    )
    with pytest.raises(NotImplementedError):
        densmap_iris_model.transform(iris.data[:10])
    with pytest.raises(ValueError):
        densmap_iris_model.inverse_transform(embedding[:10])
    with pytest.raises(NotImplementedError):
        _ = UMAP(
            n_neighbors=10,
            min_dist=0.01,
            random_state=42,
            densmap=True,
            verbose=True,
        ).fit(iris.data, y=iris.target)
def run_svc(train_pkl, val_pkl, test_pkl, series=False, outcome_col='Disease_State', num_random_search=0):
    """Reduce methylation beta values with UMAP, search SVC hyper-parameters,
    and report bootstrapped test-set accuracy.

    Side effects: pickles the fitted sklearn model to 'sklearn_model.p' and
    writes predictions to 'SklearnPredictions.csv'.
    """
    train_arr = MethylationArray.from_pickle(train_pkl)
    val_arr = MethylationArray.from_pickle(val_pkl)
    test_arr = MethylationArray.from_pickle(test_pkl)

    # Fit UMAP on the training betas, then embed all three splits.
    reducer = UMAP(n_components=100)
    reducer.fit(train_arr.beta)
    train_arr.beta = pd.DataFrame(reducer.transform(train_arr.beta.values), index=train_arr.return_idx())
    val_arr.beta = pd.DataFrame(reducer.transform(val_arr.beta), index=val_arr.return_idx())
    test_arr.beta = pd.DataFrame(reducer.transform(test_arr.beta), index=test_arr.return_idx())

    model = MachineLearning(
        SVC,
        options={'penalty': 'l2', 'verbose': 3, 'n_jobs': 35, 'class_weight': 'balanced'},
        grid={'C': [1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.001, 0.0001],
              'kernel': ['linear', 'rbf']},
        n_eval=num_random_search,
        series=series,
        labelencode=True,
        verbose=True,
    )
    sklearn_model = model.fit(train_arr, val_arr, outcome_col)
    pickle.dump(sklearn_model, open('sklearn_model.p', 'wb'))

    y_pred = model.predict(test_arr)
    pd.DataFrame(
        np.hstack((y_pred[:, np.newaxis], test_arr.pheno[outcome_col].values[:, np.newaxis])),
        index=test_arr.return_idx(),
        columns=['y_pred', 'y_true'],
    ).to_csv('SklearnPredictions.csv')

    original, std_err, (low_ci, high_ci) = model.return_outcome_metric(
        test_arr, outcome_col, accuracy_score, run_bootstrap=True)
    results = {'score': original, 'Standard Error': std_err,
               '0.95 CI Low': low_ci, '0.95 CI High': high_ci}
    print('\n'.join(['{}:{}'.format(k, v) for k, v in results.items()]))
def run_umap(args):
    """Fit a supervised UMAP on the (module-level) train split and embed
    both splits; returns (train_embedding, test_embedding).

    NOTE(review): relies on module-level X_train / y_train / X_test —
    confirm they are defined before this is called.
    """
    mapper = UMAP(n_neighbors=args.n_neighbors, random_state=args.seed, init="random")
    mapper.fit(X_train, y=y_train)
    train_embedding = mapper.embedding_
    test_embedding = mapper.transform(X_test)
    return train_embedding, test_embedding
def do_cluster(ilayer):
    """Fit UMAP on the activations of the given layer when it is one of the
    selected layers; returns (fitted_umap, embedding) or (None, None)."""
    # Guard clause: skip layers that were not selected for clustering.
    if ilayer not in these_layers:
        return None, None
    fit = UMAP(
        n_components=cluster_ndims,
        verbose=3,
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_distance,
    ).fit(activations_tocluster[ilayer])
    return fit, fit.transform(activations_tocluster[ilayer])
class Umap:
    """
    Transformer that projects every vector in an
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] into a lower
    dimensional space via umap, using the implementation in
    [umap-learn](https://umap-learn.readthedocs.io/en/latest/).

    Arguments:
        n_components: the number of compoments to create/add
        kwargs: keyword arguments passed to the UMAP algorithm

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import Umap

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]
    emb.transform(Umap(3)).plot_interactive_matrix('umap_0', 'umap_1', 'umap_2')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        self.is_fitted = False
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = UMAP(n_components=n_components, **kwargs)

    def __call__(self, embset):
        # Lazily fit on first use, then project.
        if not self.is_fitted:
            self.fit(embset)
        return self.transform(embset)

    def fit(self, embset):
        """Fit the underlying UMAP model on the embedding set."""
        labels, matrix = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            # UMAP/numba emit noisy warnings that are irrelevant here.
            warnings.simplefilter("ignore", category=UserWarning)
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            self.tfm.fit(matrix)
        self.is_fitted = True

    def transform(self, embset):
        """Return a new EmbeddingSet whose vectors are the UMAP projections,
        plus one axis-embedding per produced component."""
        labels, matrix = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            projected = self.tfm.transform(matrix)
        axis_names = [f"umap_{i}" for i in range(self.n_components)]
        names_out = labels + axis_names
        # Append unit vectors so the new axes themselves become embeddings.
        vectors_out = np.concatenate([projected, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.umap_{self.n_components}()")
def _umap_projection(embeddings, n_axes, **kwargs):
    """Project a dict of id -> vector with UMAP; returns id -> projected row.

    NOTE(review): `n_axes` and `kwargs` are accepted but not forwarded to
    UMAP — confirm whether they should configure n_components.
    """
    matrix = np.stack(list(embeddings.values()))
    reducer = UMAP()
    reducer.fit(matrix)
    projected = reducer.transform(matrix)
    # Dict insertion order matches the row order used to build the matrix.
    return {
        embedding_id: projected[i, :]
        for i, embedding_id in enumerate(embeddings)
    }
def test_umap_graph_layout():
    """With transform_mode="graph", both fit_transform and transform must
    return sparse graphs; the fitted graph should have one connected
    component per blob."""
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    model = UMAP(n_epochs=100, transform_mode="graph")
    graph = model.fit_transform(data)
    assert scipy.sparse.issparse(graph)
    nc, cl = scipy.sparse.csgraph.connected_components(graph)
    assert nc == 5
    new_graph = model.transform(data[:10] + np.random.normal(0.0, 0.1, size=(10, 10)))
    # Fixed: this assertion previously re-checked `graph` (already verified
    # above) instead of the transform result `new_graph`.
    assert scipy.sparse.issparse(new_graph)
    assert new_graph.shape[0] == 10
def draw_umap(
    data,
    n_neighbors=15,
    min_dist=0.1,
    c=None,
    n_components=2,
    metric="euclidean",
    title="",
    plot=True,
    cmap=None,
    use_plotly=False,
    **kwargs,
):
    """Generate plot of UMAP algorithm results based on specified arguments.

    Returns (embedding, fitted_mapper).

    Bug fix: the matplotlib branch previously built a legend from `scatter`
    unconditionally, raising NameError whenever n_components != 2 (only the
    2-D branch defines `scatter`).
    """
    fit = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=42,
    )
    mapper = fit.fit(data)
    u = fit.transform(data)
    if plot:
        if use_plotly:
            fig = px.scatter(
                x=u[:, 0], y=u[:, 1], color=c, title=title, **kwargs
            )
            fig.update_layout(
                {
                    "plot_bgcolor": "rgba(0, 0, 0, 0)",
                    "paper_bgcolor": "rgba(0, 0, 0, 0)",
                }
            )
            fig.show()
        else:
            fig = plt.figure()
            scatter = None
            if n_components == 1:
                ax = fig.add_subplot(111)
                ax.scatter(u[:, 0], range(len(u)), c=c)
            if n_components == 2:
                ax = fig.add_subplot(111)
                scatter = ax.scatter(u[:, 0], u[:, 1], c=c, label=c, cmap=cmap)
            if n_components == 3:
                ax = fig.add_subplot(111, projection="3d")
                ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=c, s=100)
            plt.title(title, fontsize=18)
            # Only the 2-D scatter carries label data for a legend.
            if scatter is not None:
                legend = ax.legend(*scatter.legend_elements())
                ax.add_artist(legend)
    return u, mapper
def example_2():
    """Digits-dataset demo: UMAP projection, reval cluster-number selection,
    evaluation on the held-out split, and scatter plots of true vs
    predicted labels."""
    digits_dataset = load_digits()
    features = digits_dataset['data']
    targets = digits_dataset['target']
    X_tr, X_ts, y_tr, y_ts = train_test_split(
        features, targets, test_size=0.40, random_state=42, stratify=targets)

    # 2-D UMAP embedding fitted on train, applied to test.
    transform = UMAP(n_components=2, random_state=42, n_neighbors=30, min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = KMeans()
    reval = FindBestClustCV(s=s, c=c, nfold=5, nclust_range=[2, 15], nrand=100)
    metrics, nclustbest, _ = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)
    plot_metrics(metrics, title='Reval performance digits dataset')

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
    # Align cluster labels with the true labels for accuracy computation.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nclustbest}")
    print(f"Test set prediction ACC: {1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nclustbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()
def run_dimention_reduction(train_x, test_x, train_y):
    """Two-stage dimensionality reduction (PCA -> UMAP) plus standardization.

    Returns standardized 2-column DataFrames (umap_1, umap_2) for the train
    and test sets. `train_y` is accepted for interface compatibility but
    unused here.
    """
    # Stage 1: PCA down to half the original feature count.
    n_components = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=n_components).fit(train_x)
    pca_train = pd.DataFrame(pca.transform(train_x))
    pca_test = pd.DataFrame(pca.transform(test_x))

    # Stage 2: UMAP down to 2 dimensions.
    reducer = UMAP(random_state=0)
    reducer.fit(pca_train)
    reduced_train_x = pd.DataFrame(reducer.transform(pca_train))
    reduced_test_x = pd.DataFrame(reducer.transform(pca_test))

    # Standardize both splits.
    reduced_train_x = cf.standardize(reduced_train_x)
    reduced_test_x = cf.standardize(reduced_test_x)
    reduced_train_x.columns = ["umap_1", "umap_2"]
    reduced_test_x.columns = ["umap_1", "umap_2"]
    return reduced_train_x, reduced_test_x
def run(n_init, max_features, umap_n_components, dataset, val_dataset, result_dir, random_state):
    """TF-IDF -> UMAP -> KMeans clustering pipeline; writes clustering
    metrics (accuracy, ARI, NMI, purity) to a CSV in result_dir."""
    # Set random states
    np.random.seed(random_state)

    # Load the train/validation CSVs.
    train_df = pd.read_csv(dataset)
    val_df = pd.read_csv(val_dataset)
    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()
    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)

    umap = UMAP(n_components=umap_n_components)
    X_train = umap.fit_transform(X_train.toarray())
    X_test = umap.transform(X_test.toarray())

    # One cluster per distinct training label.
    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    run_results = {
        'best_matching': best_matching,
        'accuracy': accuracy,
        'ari': adjusted_rand_score(val_labels, predicted_labels),
        'nmi': normalized_mutual_info_score(val_labels, predicted_labels),
        # use purity to compare with microsoft paper
        'purity': purity_score(y_true=val_labels, y_pred=predicted_labels),
    }

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20newsgroups-kmeans.csv'), index=False)
def test_umap_transform_on_iris_modified_dtype(iris, iris_selection):
    """Transform must stay trustworthy after the stored embedding is cast
    to float64."""
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    # Widen the embedding dtype before transforming held-out points.
    model.embedding_ = model.embedding_.astype(np.float64)
    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)
    trust = trustworthiness(held_out, projected, 10)
    assert_greater_equal(
        trust,
        0.8,
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )
def test_umap_transform_on_iris(iris, iris_selection):
    """Transform of held-out iris samples must be trustworthy."""
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200, random_state=42).fit(data)
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    # Fixed message: adjacent string literals previously concatenated
    # without a space, producing "...transform foriris dataset...".
    assert_greater_equal(
        trust,
        0.85,
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust),
    )
def umap_reduce(docvecs, label, umap_model, use_nn, use_umap, **kwargs):
    """Optionally reduce document vectors with a (supervised) UMAP model.

    Returns (vectors, umap_model). When use_umap is falsy the vectors are
    returned unchanged (as an array) with no model. A missing umap_model is
    trained on the fly, supervised by `label`.
    """
    if not use_umap:
        return np.array(docvecs), None

    if not umap_model:
        print(f"Train UMAP...")
        # Neural-net consumers get up to 256 components (capped by the
        # sample count); otherwise reduce all the way to one dimension.
        umap_n_components = min(256, len(docvecs) - 2) if use_nn else 1
        umap_model = UMAP(
            metric="cosine",
            set_op_mix_ratio=1.0,
            n_components=umap_n_components,
            random_state=42,
            verbose=False,
        )
        umap_model = umap_model.fit(docvecs, y=label)

    dim_reduced_vecs = umap_model.transform(docvecs)
    if not use_nn:
        dim_reduced_vecs = dim_reduced_vecs.astype(float)
    return dim_reduced_vecs, umap_model
def run_dimention_reduction(train_x, test_x, train_y):
    """Two-stage dimensionality reduction (PCA -> UMAP).

    Returns 2-column DataFrames (umap_1, umap_2) for the train and test
    sets. `train_y` is accepted for interface compatibility but unused.
    """
    # Stage 1: PCA down to half the original feature count.
    n_components = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=n_components).fit(train_x)
    pca_train = pd.DataFrame(pca.transform(train_x))
    pca_test = pd.DataFrame(pca.transform(test_x))

    # Stage 2: UMAP down to 2 dimensions.
    reducer = UMAP(random_state=0)
    reducer.fit(pca_train)
    reduced_train_x = pd.DataFrame(reducer.transform(pca_train))
    reduced_test_x = pd.DataFrame(reducer.transform(pca_test))

    reduced_train_x.columns = ["umap_1", "umap_2"]
    reduced_test_x.columns = ["umap_1", "umap_2"]
    return reduced_train_x, reduced_test_x
class UMAPRepresentation(Representation):
    """Representation backed by a UMAP projection of flattened, min-max
    normalized inputs."""

    @staticmethod
    def default_config():
        """Default UMAP hyper-parameters for this representation."""
        default_config = Dict()
        # parameters
        default_config.parameters = Dict()
        default_config.parameters.n_neighbors = 15
        default_config.parameters.metric = 'euclidean'
        default_config.parameters.init = 'spectral'
        default_config.parameters.random_state = None
        default_config.parameters.min_dist = 0.1
        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config=None, **kwargs):
        """Build the representation.

        Bug fix: `config` previously defaulted to a mutable `{}`, which is
        shared across all calls; defaulting to None with an explicit
        fallback is backward compatible and safe.
        """
        config = {} if config is None else config
        Representation.__init__(self, config=config, **kwargs)
        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range used for min-max normalization
        self.feature_range = (0.0, 1.0)
        self.algorithm = UMAP()
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        '''
        X_train: array-like (n_samples, n_features)
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            # save (min, max) for normalization
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))
        X_train = (X_train - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        """Normalize x with the stored feature range and project it."""
        x = (x - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        """Push the configured parameters into the UMAP instance."""
        self.algorithm.set_params(n_components=self.n_latents, **self.config.parameters, verbose=False)
def draw_umap(data, n_neighbors=15, min_dist=0.1, c=None, n_components=2,
              metric='euclidean', title='', plot=True, cmap=None,
              use_plotly=False, **kwargs):
    """Fit UMAP on `data`, optionally plot the embedding, and return
    (embedding, fitted_mapper).

    Bug fix: the matplotlib branch previously built a legend from `scatter`
    unconditionally, raising NameError whenever n_components != 2 (only the
    2-D branch defines `scatter`).
    """
    fit = UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
               n_components=n_components, metric=metric, random_state=42)
    mapper = fit.fit(data)
    u = fit.transform(data)
    if plot:
        if use_plotly:
            fig = px.scatter(x=u[:, 0], y=u[:, 1], color=c, title=title, **kwargs)
            fig.update_layout({
                'plot_bgcolor': 'rgba(0, 0, 0, 0)',
                'paper_bgcolor': 'rgba(0, 0, 0, 0)',
            })
            fig.show()
        else:
            fig = plt.figure()
            scatter = None
            if n_components == 1:
                ax = fig.add_subplot(111)
                ax.scatter(u[:, 0], range(len(u)), c=c)
            if n_components == 2:
                ax = fig.add_subplot(111)
                scatter = ax.scatter(u[:, 0], u[:, 1], c=c, label=c, cmap=cmap)
            if n_components == 3:
                ax = fig.add_subplot(111, projection='3d')
                ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=c, s=100)
            plt.title(title, fontsize=18)
            # Only the 2-D scatter carries label data for a legend.
            if scatter is not None:
                legend = ax.legend(*scatter.legend_elements())
                ax.add_artist(legend)
    return u, mapper
def test_umap_transform_on_iris_w_pynndescent(iris, iris_selection):
    """Transform with the approximate NN algorithm forced on must stay
    trustworthy."""
    data = iris.data[iris_selection]
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=100,
        random_state=42,
        force_approximation_algorithm=True,
    ).fit(data)
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    # Fixed message: adjacent string literals previously concatenated
    # without a space, producing "...transform foriris dataset...".
    assert trust >= 0.85, (
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust)
    )
def test_precomputed_sparse_transform_on_iris(iris, iris_selection):
    """Transform from a precomputed sparse distance matrix must stay
    trustworthy."""
    data = iris.data[iris_selection]
    distance_matrix = sparse.csr_matrix(squareform(pdist(data)))
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        metric='precomputed',
    ).fit(distance_matrix)
    new_data = iris.data[~iris_selection]
    # Distances from the held-out points to the training points.
    new_distance_matrix = sparse.csr_matrix(cdist(new_data, data))
    embedding = fitter.transform(new_distance_matrix)
    trust = trustworthiness(new_data, embedding, 10)
    # Fixed message: adjacent string literals previously concatenated
    # without a space, producing "...transform foriris dataset...".
    assert trust >= 0.85, (
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust)
    )
def relabel(dict_imp, plot_scatter=False):
    """
    Function that return the imputed datasets with new labels. Obtained by
    pasting the subdomain cluster labels with the domain cluster labels.

    :param dict_imp: imputed datasets
    :type dict_imp: dict
    :param plot_scatter: flag for a scatterplot with new labels
    :type plot_scatter: bool
    :return: imput dictionary with new cluster label column
    :rtype: dict
    """
    reducer = UMAP(random_state=42, n_neighbors=30, min_dist=0.0)
    reference_cols = dict_imp['P1'][0].columns
    subdomain_feat = [c for c in reference_cols
                      if re.search('vscore', c) and not re.search('written', c)]
    domain_feat = [c for c in reference_cols
                   if re.search('totalb', c) and not re.search('composite', c)]
    col = subdomain_feat + domain_feat
    flatui = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
              "#8c564b", "#a55194"]
    for p, df in dict_imp.items():
        # Paste subdomain and domain cluster labels for both splits.
        for part in (0, 1):
            df[part]['new_cluster'] = [
                '-'.join([str(a), str(b)])
                for a, b in zip(df[part]['cluster_subdomain'].tolist(),
                                df[part]['cluster_domain'].tolist())
            ]
        X_tr = reducer.fit_transform(df[0][col])
        X_ts = reducer.transform(df[1][col])
        if plot_scatter:
            for part, X_part, split in ((0, X_tr, 'training'), (1, X_ts, 'test')):
                _scatter_plot(
                    X_part,
                    [(gui, cl) for gui, cl in zip(df[part].index, df[part].new_cluster)],
                    flatui, 10, 20,
                    {str(ncl): ' '.join(['cluster', str(ncl)])
                     for ncl in sorted(np.unique(df[part].new_cluster))},
                    title=f'New labels Vineland {p} {split} set')
    return dict_imp
def umap_graph(data, plot=True):
    """Embed `data` with UMAP, k-means the embedding into 4 clusters (used
    only to color nodes), and return the fuzzy simplicial set as a dense
    adjacency matrix, optionally drawing it as a directed graph."""
    near_n = 15
    min_d = 0.1
    rand_s = np.random.RandomState(30)
    transformer = UMAP(n_neighbors=near_n, random_state=rand_s, min_dist=min_d).fit(data)
    raw_emb = transformer.transform(data)
    kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300,
                    n_init=10, random_state=rand_s)
    node_l = kmeans.fit(raw_emb).labels_
    # fuzzy_simplicial_set returns (graph, sigmas, rhos); keep the graph.
    sim_set = umap.umap_.fuzzy_simplicial_set(
        data, near_n, random_state=rand_s, metric="euclidean")[0].toarray()
    if plot:
        nx.draw(nx.DiGraph(sim_set), arrows=False, node_size=30,
                width=0.1, node_color=node_l, cmap='viridis')
    return sim_set
def test_ingest_map_embedding_umap():
    """Ingest's UMAP mapping of new data must match fitting UMAP directly on
    the reference data and transforming the new data."""
    adata_ref = sc.AnnData(X)
    adata_new = sc.AnnData(T)
    sc.pp.neighbors(adata_ref, method='umap', use_rep='X',
                    n_neighbors=4, random_state=0)
    sc.tl.umap(adata_ref, random_state=0)

    ing = sc.tl.Ingest(adata_ref)
    ing.fit(adata_new)
    ing.map_embedding(method='umap')

    # Reference computation with umap-learn directly.
    reducer = UMAP(min_dist=0.5, random_state=0, n_neighbors=4)
    reducer.fit(X)
    expected = reducer.transform(T)
    assert np.allclose(ing._obsm['X_umap'], expected)
def test_umap_sparse_transform_on_iris(iris, iris_selection):
    """Transform of sparse held-out iris samples must stay trustworthy."""
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        # force_approximation_algorithm=True,
    ).fit(data)
    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    # Fixed message: adjacent string literals previously concatenated
    # without a space, producing "...transform foriris dataset...".
    assert trust >= 0.80, (
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust)
    )
def plot_umap(X, Y, validation_data=None, style='starplot', p_size=3.5,
              save_img=False, img_res=300, fig_res=72, random_state=None):
    """Supervised-UMAP scatter plot of the validation embedding, drawn as
    layered translucent scatters ("starplot" glow effect); optionally saves
    the figure to ./plots/."""
    from umap import UMAP

    if validation_data is None:
        validation_data = (X, Y)
    X_valid, Y_valid = validation_data

    if style == 'starplot':
        plt.style.use(['dark_background'])
        plt.rcParams['figure.figsize'] = (15, 15)
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.size'] = 14
        plt.rcParams['figure.dpi'] = fig_res

    # Supervised fit on the training data, embed the validation data.
    reducer = UMAP(25, random_state=random_state)
    reducer.fit(X, Y.ravel())
    embedings = np.array(reducer.transform(X_valid))

    size = p_size
    cmap = LinearSegmentedColormap.from_list("recy", ["magenta", "cyan"])
    # Growing point size with fading alpha gives the glow effect.
    for point in range(1, 10):
        plt.scatter(
            embedings[:, 0],
            embedings[:, 1],
            c=Y_valid.ravel(),
            cmap=cmap,
            s=5 * point**size,
            alpha=1 / (point**size),
            edgecolors='',
        )

    file_name = './plots/s' + str(int(size)) + '_umap.png'
    if save_img:
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        plt.savefig(file_name, dpi=img_res, transparent=True)
    plt.show()
def umap_function(self, semi_super=True):
    """Fit UMAP on the training split (supervised by y_train when
    semi_super is True) and project the test split; caches the reducer and
    the projected data on self (umap_sup/umap_sup_data or umap/umap_data)."""
    umap = UMAP()
    if semi_super:
        X_train = umap.fit_transform(self.raw_data["X_train"],
                                     y=self.raw_data['y_train'])
    else:
        X_train = umap.fit_transform(self.raw_data["X_train"])
    X_test = umap.transform(self.raw_data["X_test"])
    d_data = {
        'X_train': X_train,
        'y_train': self.raw_data["y_train"],
        'X_test': X_test,
        'y_test': self.raw_data["y_test"],
    }
    if semi_super:
        self.umap_sup = umap
        self.umap_sup_data = d_data
    else:
        self.umap = umap
        self.umap_data = d_data
class UmapTransformer(object):
    """Learn a (possibly supervised) UMAP embedding, then train a random
    forest to reproduce it so new points can be mapped via the forest
    instead of UMAP's own transform."""

    def __init__(self, n_components=2, embed_n_neighbors=10, target_metric='categorical'):
        self.umap = UMAP(n_components=n_components,
                         n_neighbors=embed_n_neighbors,
                         target_metric=target_metric,
                         transform_seed=1)
        self.rf = RandomForestRegressor(n_estimators=300)

    def fit(self, x, y=None):
        """Fit UMAP (supervised when y is given) and teach the forest to
        predict the low-dimensional coordinates from x."""
        self.umap.fit(x, y)
        self.rf.fit(x, self.umap.transform(x))
        return self

    def transform(self, x):
        """Map x through the forest surrogate of the UMAP embedding."""
        return self.rf.predict(x)

    def fit_transform(self, x, y):
        return self.fit(x, y).transform(x)
def predict_adata(model, adata, make_umap=True, umap_fit_n=10000, batch_size=1024):
    """Run model predictions over `adata`; attach embeddings, class
    probabilities, predicted labels, and (optionally) a UMAP of the
    embedding to the returned AnnData.

    Bug fix: `batch_size` was previously ignored — the dataloader call
    hard-coded 1024 regardless of the argument.
    """
    dl = get_prediction_dataloader(adata, model.genes, batch_size=batch_size)
    logging.info(f'starting prediction of {dl.dataset.adata.shape[0]} cells')
    emb, y_prob = predict_dl(dl, model)
    a = dl.dataset.adata
    a.obsm['X_emb'] = emb
    if make_umap:
        u = UMAP()
        # Fit UMAP on a random subsample to bound fitting cost, then
        # transform the full embedding.
        idxs = np.random.choice(np.arange(a.shape[0]),
                                size=min(umap_fit_n, a.shape[0]),
                                replace=False)
        u.fit(emb[idxs])
        a.obsm['X_umap'] = u.transform(emb)
    a.obsm['prediction_probs'] = y_prob
    a.obs['y_pred'] = [np.argmax(probs) for probs in y_prob]
    a.obs['predicted_cell_type_probability'] = [
        np.max(probs) for probs in y_prob
    ]
    a.obs['predicted_cell_type'] = [
        model.classes[np.argmax(probs)] for probs in y_prob
    ]
    # One "probability <class>" column per class, aligned to obs index.
    prob_df = pd.DataFrame(data=a.obsm['prediction_probs'],
                           columns=model.classes,
                           index=a.obs.index.to_list())
    prob_df.columns = [f'probability {c}' for c in prob_df.columns]
    a.obs = pd.concat((a.obs, prob_df), axis=1)
    return a
class UMAPReducer(Reducer):
    """
    Simple wrapper for UMAP, used for API consistency.

    Numba warnings emitted during fit/transform are suppressed.
    """

    def __init__(self, n_components=2, **kwargs):
        self.n_components = n_components
        # Sensible defaults that caller-supplied kwargs may override.
        merged = {**{'n_neighbors': 10, 'min_dist': 0.}, **kwargs}
        self.model = UMAP(n_components=n_components, **merged)

    def fit(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            self.model.fit(X)
        return self

    def transform(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            projected = self.model.transform(X)
        return projected

    def decrement_components(self):
        """Drop one output dimension from both this wrapper and the model."""
        self.n_components -= 1
        self.model.n_components -= 1
def get_similarity_matrix(data, n_components_umap=2, n_neighbors_knn=10, random_state=None):
    """
    The similarity matrix is derived in an unsupervised way (e.g., UMAP
    projection of the data and k-nearest-neighbors or distance thresholding
    to define the adjacency matrix for the batch), but can also be used to
    include weakly-supervised information (e.g., knowledge about diseased
    vs. non-diseased patients). If labels are available, the model could
    even be used to derive a latent representation with supervision. The
    similarity feature in MoE-Sim-VAE thus allows to include prior
    knowledge about the best similarity measure on the data.
    """
    # Flatten each sample before embedding.
    flat_data = data.reshape(len(data), -1)
    reducer = UMAP(n_components=n_components_umap, random_state=random_state)
    reducer.fit(flat_data)
    embedding = reducer.transform(flat_data)

    # Binary adjacency from k-nearest neighbors in the embedded space.
    neigh = NearestNeighbors(n_neighbors=n_neighbors_knn)
    neigh.fit(embedding)
    similarity = neigh.kneighbors_graph(embedding).toarray().astype(np.float32)
    return similarity, embedding
from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Load the Kaggle MNIST splits.
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: logistic regression on raw pixels.
print("Score without transformation:")
baseline = LogisticRegression()
baseline.fit(Xtrain, Ytrain)
print(baseline.score(Xtrain, Ytrain))
print(baseline.score(Xtest, Ytest))

# Reduce to 10 UMAP components, timing each step.
umapper = UMAP(n_neighbors=5, n_components=10)
t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)
t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

# Logistic regression on the UMAP features.
print("Score with transformation")
clf = LogisticRegression()
t0 = datetime.now()
clf.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
print(clf.score(Ztrain, Ytrain))
print(clf.score(Ztest, Ytest))