def test_umap_bad_n_jobs(nn_data):
    """Fitting must raise ``ValueError`` for the ``n_jobs`` values -2 and 0."""
    for bad_n_jobs in (-2, 0):
        model = UMAP(n_jobs=bad_n_jobs)
        with pytest.raises(ValueError):
            model.fit(nn_data)
def test_densmap_frac(nn_data):
    """With densMAP enabled, ``dens_frac`` values of -1.0 and 2.0 must be rejected."""
    for bad_frac in (-1.0, 2.0):
        model = UMAP(densmap=True, dens_frac=bad_frac)
        with pytest.raises(ValueError):
            model.fit(nn_data)
def run_svc(train_pkl, val_pkl, test_pkl, series=False, outcome_col='Disease_State', num_random_search=0):
    """Train an SVC on UMAP-reduced methylation data and print test accuracy.

    The three pickles are loaded as ``MethylationArray`` objects. A
    100-component UMAP is fitted on the training beta values and its
    projection replaces the ``beta`` table of every split. The classifier is
    then fitted through the ``MachineLearning`` wrapper.

    Side effects: writes the fitted model to ``sklearn_model.p``, the test
    predictions next to the true labels to ``SklearnPredictions.csv``, and
    prints the accuracy with its bootstrap standard error and 0.95 interval.

    Args:
        train_pkl: Path of the pickled training ``MethylationArray``.
        val_pkl: Path of the pickled validation ``MethylationArray``.
        test_pkl: Path of the pickled test ``MethylationArray``.
        series: Forwarded to ``MachineLearning``.
        outcome_col: Phenotype column used as the prediction target.
        num_random_search: Forwarded to ``MachineLearning`` as ``n_eval``.
    """
    train_methyl_array = MethylationArray.from_pickle(train_pkl)
    val_methyl_array = MethylationArray.from_pickle(val_pkl)
    test_methyl_array = MethylationArray.from_pickle(test_pkl)

    # Fit the reducer on the training split only, then project all three splits.
    umap = UMAP(n_components=100)
    umap.fit(train_methyl_array.beta)
    train_methyl_array.beta = pd.DataFrame(
        umap.transform(train_methyl_array.beta.values),
        index=train_methyl_array.return_idx(),
    )
    val_methyl_array.beta = pd.DataFrame(
        umap.transform(val_methyl_array.beta),
        index=val_methyl_array.return_idx(),
    )
    test_methyl_array.beta = pd.DataFrame(
        umap.transform(test_methyl_array.beta),
        index=test_methyl_array.return_idx(),
    )

    model = MachineLearning(
        SVC,
        options={'penalty': 'l2', 'verbose': 3, 'n_jobs': 35, 'class_weight': 'balanced'},
        grid={'C': [1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.001, 0.0001],
              'kernel': ['linear', 'rbf']},
        n_eval=num_random_search,
        series=series,
        labelencode=True,
        verbose=True,
    )
    sklearn_model = model.fit(train_methyl_array, val_methyl_array, outcome_col)

    # Use a context manager so the model file is flushed and closed even if
    # pickling fails (the bare ``open`` previously leaked the handle).
    with open('sklearn_model.p', 'wb') as model_file:
        pickle.dump(sklearn_model, model_file)

    y_pred = model.predict(test_methyl_array)
    y_true = test_methyl_array.pheno[outcome_col].values
    pd.DataFrame(
        np.hstack((y_pred[:, np.newaxis], y_true[:, np.newaxis])),
        index=test_methyl_array.return_idx(),
        columns=['y_pred', 'y_true'],
    ).to_csv('SklearnPredictions.csv')

    original, std_err, (low_ci, high_ci) = model.return_outcome_metric(
        test_methyl_array, outcome_col, accuracy_score, run_bootstrap=True)
    results = {'score': original, 'Standard Error': std_err,
               '0.95 CI Low': low_ci, '0.95 CI High': high_ci}
    print('\n'.join(['{}:{}'.format(k, v) for k, v in results.items()]))
def test_umap_too_many_neighbors_warns(nn_data):
    """Fit with ``n_neighbors`` far above the 100 rows used and check that the
    explicitly supplied ``a``/``b`` end up unchanged in ``_a``/``_b``."""
    subset = nn_data[:100,]
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    model.fit(subset)
    assert_equal(model._a, 1.2)
    assert_equal(model._b, 1.75)
def cmp3(Y1, Y2, X1, X2, title=('1', '2'), red=None, save=None):
    '''Draw both labelled datasets and their union in three panels, keeping all labels.

    Panels one and two show ``X1``/``Y1`` and ``X2``/``Y2``; the third shows
    the stacked data coloured by dataset of origin. When ``save`` is truthy
    the figure is also written to that path at 300 dpi.
    '''
    sns.set(font_scale=1.2, style='white')

    if not red:
        # No reducer supplied: fit a fresh UMAP on both datasets stacked.
        # NOTE(review): a caller-supplied reducer is assumed to be fitted already - confirm.
        red = UMAP()
        red.fit(np.vstack((X1, X2)))

    plt.figure(figsize=(24, 8))
    shared = dict(show=False, size=4, markerscale=4)

    plt.subplot(131)
    umap(X1, Y1, red, title=title[0], **shared)

    plt.subplot(132)
    umap(X2, Y2, red, title=title[1], **shared)

    plt.subplot(133)
    stacked = np.vstack((X1, X2))
    origin = [1] * len(Y1) + [2] * len(Y2)
    umap(stacked, origin, red, title="Combined",
         acc={1: title[0], 2: title[1]}, **shared)

    if save:
        plt.tight_layout()
        plt.savefig(save, dpi=300)
    plt.show()
def plot_blobclust(Y1, X1, X2, red=None, save=None):
    """Plot a joint clustering of two datasets in one figure.

    ``Y1`` holds the labels for ``X1`` followed by those for ``X2``. Points
    of ``X1`` use the plain 'o' marker style; points of ``X2`` are drawn as
    outlines only. When ``save`` is truthy the figure is also written to that
    path at 300 dpi.
    """
    sns.set(font_scale=1.2, style='white')

    if not red:
        # No reducer supplied: fit a fresh UMAP on both datasets stacked.
        # NOTE(review): a caller-supplied reducer is assumed to be fitted already - confirm.
        red = UMAP()
        red.fit(np.vstack((X1, X2)))

    plt.figure(figsize=(12, 12))

    def fill(col):
        # Marker options for the first dataset; the colour argument is not used.
        return {"marker": 'o'}

    def empty(col):
        # Marker options for the second dataset: no face, coloured edge.
        return {'facecolors': 'none', 'edgecolors': col}

    n_first = X1.shape[0]
    shared = dict(show=False, title="combined clustering", size=30, markerscale=4)
    umap(X1, Y1[:n_first], red, getmarker=fill, **shared)
    umap(X2, Y1[n_first:], red, getmarker=empty, **shared)

    if save:
        plt.tight_layout()
        plt.savefig(save, dpi=300)
    plt.show()
def test_umap_inverse_transform_fails_expectedly(sparse_spatial_data, nn_data):
    """``inverse_transform`` must raise ``ValueError`` after fitting on the
    sparse fixture and after fitting with the "dice" metric."""
    cases = (
        ({}, sparse_spatial_data),
        ({"metric": "dice"}, nn_data),
    )
    for extra_params, data in cases:
        model = UMAP(n_epochs=11, **extra_params)
        model.fit(data[:100])
        assert_raises(ValueError, model.inverse_transform, model.embedding_[:10])
def test_umap_bad_too_large_min_dist(nn_data):
    """A ``min_dist`` of 2.0 must make ``fit`` raise ``ValueError``."""
    model = UMAP(min_dist=2.0)
    with warnings.catch_warnings():
        # The a/b curve fitting is expected to emit a division-by-zero
        # RuntimeWarning here; silence it so only the ValueError matters.
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        with pytest.raises(ValueError):
            model.fit(nn_data)
def run_umap(args):
    """Fit a supervised UMAP on the module-level training data.

    Uses ``args.n_neighbors`` and ``args.seed``; reads ``X_train``,
    ``y_train`` and ``X_test`` from the enclosing module.

    Returns:
        ``(Z, Z_test)``: the training embedding and the transformed test data.
    """
    reducer = UMAP(n_neighbors=args.n_neighbors, random_state=args.seed, init="random")
    reducer.fit(X_train, y=y_train)
    train_embedding = reducer.embedding_
    test_embedding = reducer.transform(X_test)
    return train_embedding, test_embedding
def test_umap_bad_output_metric_no_grad(nn_data):
    """An ``output_metric`` callable returning only a distance (no gradient)
    must make ``fit`` raise ``ValueError``."""

    @numba.njit()
    def distance_only(x, y):
        # Sum of absolute differences; deliberately returns no gradient.
        return np.sum(np.abs(x - y))

    model = UMAP(output_metric=distance_only)
    with pytest.raises(ValueError):
        model.fit(nn_data)
def test_umap_update_bad_params(nn_data):
    """``update`` must raise ``ValueError`` for a precomputed-metric model and
    for a model that was fitted with targets."""
    first_hundred = nn_data[:100]

    # Case 1: model fitted on a precomputed distance matrix.
    distances = pairwise_distances(first_hundred)
    precomputed_model = UMAP(metric="precomputed", n_epochs=11)
    precomputed_model.fit(distances)
    assert_raises(ValueError, precomputed_model.update, distances)

    # Case 2: model fitted with labels (five classes of twenty samples each).
    labels = np.repeat(np.arange(5), 20)
    supervised_model = UMAP(n_epochs=11)
    supervised_model.fit(first_hundred, y=labels)
    assert_raises(ValueError, supervised_model.update, nn_data[100:200])
def get_encoder(metas, train_data, target_output_dim):
    """Fit a UMAP model, pickle it into the workspace and wrap it in an encoder.

    Args:
        metas: Mapping whose ``'workspace'`` entry is the directory that
            receives ``umap_model.model``.
        train_data: Data the UMAP model is fitted on.
        target_output_dim: Number of UMAP components.

    Returns:
        A ``UMAPEncoder`` pointing at the pickled model file.
    """
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'umap_model.model')

    model = UMAP(n_components=target_output_dim, random_state=42)
    model.fit(train_data)

    # Close the file before the encoder is built from the path; the bare
    # ``open`` used previously left the handle (and buffered data) dangling.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return UMAPEncoder(model_path=model_path)
class Umap:
    """
    This transformer transforms all vectors in an
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] by means of umap. We're using
    the implementation in [umap-learn](https://umap-learn.readthedocs.io/en/latest/).

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the UMAP algorithm

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import Umap

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "niece", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(Umap(3)).plot_interactive_matrix('umap_0', 'umap_1', 'umap_2')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = UMAP(n_components=n_components, **kwargs)
        self.is_fitted = False

    def __call__(self, embset):
        """Transform ``embset``, fitting on it first if this is the first use."""
        if self.is_fitted:
            return self.transform(embset)
        self.fit(embset)
        return self.transform(embset)

    def fit(self, embset):
        """Fit the underlying UMAP model on the vectors of ``embset``."""
        _, matrix = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            for category in (UserWarning, NumbaPerformanceWarning):
                warnings.simplefilter("ignore", category=category)
            self.tfm.fit(matrix)
        self.is_fitted = True

    def transform(self, embset):
        """Project ``embset`` and return a new ``EmbeddingSet``.

        The result holds the projected vectors plus one unit vector per UMAP
        axis, named ``umap_0``, ``umap_1``, ...
        """
        names, matrix = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            projected = self.tfm.transform(matrix)

        axis_names = [f"umap_{i}" for i in range(self.n_components)]
        names_out = names + axis_names
        vectors_out = np.concatenate([projected, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.umap_{self.n_components}()")
def _umap_projection(embeddings, n_axes, **kwargs):
    """Project every embedding vector with UMAP.

    Args:
        embeddings: Mapping from embedding id to a 1-D vector; all vectors
            must share one length so they can be stacked.
        n_axes: Number of UMAP components in the projection.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        Dict mapping each embedding id to its projected vector.
    """
    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    embeddings_matrix = np.stack(list(embeddings.values()))

    # Honour the requested dimensionality (previously the UMAP default was
    # always used, silently ignoring ``n_axes``).
    umap = UMAP(n_components=n_axes)
    umap.fit(embeddings_matrix)
    projected_matrix = umap.transform(embeddings_matrix)

    # Dict iteration order matches ``values()`` order, so row i belongs to key i.
    return {
        embedding_id: projected_matrix[i, :]
        for i, embedding_id in enumerate(embeddings)
    }
def test_umap_fit_instance_returned():
    """``fit`` must return a ``UMAP`` instance, with and without targets."""
    # Supervised call: data together with targets.
    model = UMAP()
    data = np.random.uniform(0, 1, (256, 10))
    targets = np.random.randint(10, size=(256,))
    fitted = model.fit(data, targets)
    assert isinstance(fitted, UMAP)

    # Unsupervised call: data only.
    model = UMAP()
    data = np.random.uniform(0, 1, (256, 10))
    fitted = model.fit(data)
    assert isinstance(fitted, UMAP)
def get_umap_layout(**kwargs):
    '''Get the x,y positions of images passed through a umap projection.

    Reads from ``kwargs``: ``vecs`` (image vectors), ``n_neighbors``,
    ``min_distance``, ``metric``, ``use_cache`` and optionally ``metadata``
    (a sequence of dicts that may carry a ``label``). All keyword arguments
    are also forwarded to ``get_path`` and ``write_layout``.

    Returns the cached layout path when it exists and ``use_cache`` is
    truthy; otherwise the result of ``write_layout``.
    '''
    print(' * creating UMAP layout')
    out_path = get_path('layouts', 'umap', **kwargs)
    if os.path.exists(out_path) and kwargs['use_cache']:
        return out_path

    model = UMAP(n_neighbors=kwargs['n_neighbors'],
                 min_dist=kwargs['min_distance'],
                 metric=kwargs['metric'])

    # run PCA to reduce dimensionality of image vectors
    w = PCA(n_components=min(100, len(kwargs['vecs']))).fit_transform(kwargs['vecs'])

    # fetch categorical labels for images (if provided)
    y = []
    if kwargs.get('metadata', False):
        labels = [i.get('label', None) for i in kwargs['metadata']]
        # if the user provided labels, integerize them
        if any(labels):
            # Each unseen label gets the next free integer id; missing labels are -1.
            d = defaultdict(lambda: len(d))
            y = np.array([-1 if label is None else d[label] for label in labels])

    # project the PCA space down to 2d for visualization; the targets are
    # only passed when at least one encoded label is non-zero
    z = model.fit(w, y=y if np.any(y) else None).embedding_
    return write_layout(out_path, z, **kwargs)
class UMAPRepresentation(Representation):
    """Representation that embeds min-max normalized inputs with UMAP."""

    @staticmethod
    def default_config():
        """Return the default configuration holding the UMAP parameters."""
        default_config = Dict()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.n_neighbors = 15
        default_config.parameters.metric = 'euclidean'
        default_config.parameters.init = 'spectral'
        default_config.parameters.random_state = None
        default_config.parameters.min_dist = 0.1

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config=None, **kwargs):
        """
        n_features: input size (flattened)
        n_latents: latent size, used as UMAP's ``n_components``
        config: configuration overrides; ``None`` means no overrides
        """
        # A fresh dict per call avoids sharing one mutable default between instances.
        Representation.__init__(self, config={} if config is None else config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        self.algorithm = UMAP()
        self.update_algorithm_parameters()

    def _normalize(self, x):
        """Min-max scale ``x`` with the stored feature range."""
        low, high = self.feature_range
        span = high - low
        # A constant feature has zero span; divide by 1 instead so it maps to
        # 0 rather than producing NaN (0/0).
        span = np.where(span == 0, 1, span)
        return (x - low) / span

    def fit(self, X_train, update_range=True):
        ''' X_train: array-like (n_samples, n_features)
        update_range: when true, recompute the per-feature (min, max) from X_train
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            # save (min, max) for normalization
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))
        X_train = self._normalize(X_train)
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        """Normalize ``x`` with the stored range and return its UMAP transform."""
        x = self._normalize(x)
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        """Push ``n_latents`` and the configured parameters into the UMAP model."""
        self.algorithm.set_params(n_components=self.n_latents, **self.config.parameters, verbose=False)
def test_umap_custom_distance_w_grad(nn_data):
    """Compare warning counts for custom metrics without and with a gradient.

    A metric returning only a distance must trigger at least one
    ``UserWarning``; one that also returns a gradient may trigger at most one.
    """

    @numba.njit()
    def distance_only(x, y):
        return np.sum(np.abs(x - y))

    @numba.njit()
    def distance_with_grad(x, y):
        return np.sum(np.abs(x - y)), (x - y)

    sample = nn_data[:10]

    model = UMAP(metric=distance_only, n_epochs=11)
    with pytest.warns(UserWarning) as caught:
        model.fit(sample)
    assert len(caught) >= 1

    model = UMAP(metric=distance_with_grad, n_epochs=11)
    with pytest.warns(UserWarning) as caught:
        model.fit(sample)
    assert len(caught) <= 1
def test_ingest_map_embedding_umap():
    """Ingest's UMAP mapping of new data must match a plain UMAP ``transform``."""
    reference = sc.AnnData(X)
    query = sc.AnnData(T)

    # Build the reference neighbour graph and its UMAP embedding.
    sc.pp.neighbors(reference, method='umap', use_rep='X', n_neighbors=4, random_state=0)
    sc.tl.umap(reference, random_state=0)

    # Map the new observations onto the reference embedding.
    ingest = sc.tl.Ingest(reference)
    ingest.fit(query)
    ingest.map_embedding(method='umap')

    # Same projection computed directly with UMAP.
    reducer = UMAP(min_dist=0.5, random_state=0, n_neighbors=4)
    reducer.fit(X)
    expected = reducer.transform(T)

    assert np.allclose(ingest._obsm['X_umap'], expected)
def run_transformation(self, X, y, transformation_params, callback):
    """Fit UMAP on ``X``/``y`` and report progress through ``callback``.

    ``callback`` is called as ``callback(event, iteration, payload)`` with the
    events 'start', 'status', 'embedding' (one per UMAP callback invocation)
    and 'error' (carrying the exception message). Exceptions raised inside
    the guarded section are reported this way rather than propagated.
    """

    class CallbackAdapter:
        """Adapts UMAP's ``(iteration, embedding)`` callback to ``callback``."""

        def __init__(self, callback):
            self.callback = callback

        def __call__(self, iteration, embedding):
            payload = dict(embedding=embedding)
            self.callback('embedding', iteration, payload)

    adapter = CallbackAdapter(callback)
    reducer = UMAP(callback=adapter, **transformation_params)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=NumbaWarning)
        try:
            callback('start', 0, None)
            callback('status', 0, dict(message='Initializing UMAP'))
            reducer.fit(X, y)
        except Exception as exc:
            callback('error', 0, dict(message=str(exc)))
def test_umap_bad_metrics(nn_data):
    """An unknown name or a float given as ``metric`` or ``output_metric``
    must make ``fit`` raise ``ValueError``."""
    bad_settings = (
        {"metric": "foobar"},
        {"metric": 2.75},
        {"output_metric": "foobar"},
        {"output_metric": 2.75},
    )
    for setting in bad_settings:
        model = UMAP(**setting)
        with pytest.raises(ValueError):
            model.fit(nn_data)
def plot_umap(X, Y, validation_data=None, style='starplot', p_size=3.5, save_img=False, img_res=300, fig_res=72, random_state=None):
    """Fit a supervised UMAP on ``(X, Y)`` and scatter-plot the validation embedding.

    Each point is drawn nine times with growing size and shrinking alpha.

    Args:
        X, Y: Training data and targets (``Y`` is flattened before fitting).
        validation_data: ``(X_valid, Y_valid)`` to embed and plot; defaults
            to the training data.
        style: ``'starplot'`` switches to a dark background and sets figure
            size, font and dpi through ``plt.rcParams``.
        p_size: Exponent controlling marker growth and alpha decay.
        save_img: When true, save to ``./plots/s<int(p_size)>_umap.png``.
        img_res: DPI of the saved image.
        fig_res: Figure DPI applied with the 'starplot' style.
        random_state: Forwarded to ``UMAP``.
    """
    from umap import UMAP

    if validation_data is None:
        validation_data = (X, Y)
    X_valid, Y_valid = validation_data

    # NOTE(review): the rcParams below are assumed to belong to the
    # 'starplot' branch - confirm against the original layout.
    if style == 'starplot':
        plt.style.use(['dark_background'])
        plt.rcParams['figure.figsize'] = (15, 15)
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.size'] = 14
        plt.rcParams['figure.dpi'] = fig_res

    # 25 is passed as UMAP's first positional argument.
    umap = UMAP(25, random_state=random_state)
    umap.fit(X, Y.ravel())
    embedings = np.array(umap.transform(X_valid))

    size = p_size
    cmap = LinearSegmentedColormap.from_list("recy", ["magenta", "cyan"])
    colors = Y_valid.ravel()

    for point in range(1, 10):
        plt.scatter(
            embedings[:, 0],
            embedings[:, 1],
            c=colors,
            cmap=cmap,
            s=5 * point**size,
            alpha=1 / (point**size),
            # 'none' is the documented way to draw no marker edges; the empty
            # string used before is not a valid colour in current Matplotlib.
            edgecolors='none',
        )

    file_name = './plots/s' + str(int(size)) + '_umap.png'
    if save_img:
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        plt.savefig(file_name, dpi=img_res, transparent=True)
    plt.show()
def draw_umap(
    data,
    n_neighbors=15,
    min_dist=0.1,
    c=None,
    n_components=2,
    metric="euclidean",
    title="",
    plot=True,
    cmap=None,
    use_plotly=False,
    **kwargs,
):
    """Fit UMAP on ``data`` and optionally plot the embedding.

    Args:
        data: Input passed to ``UMAP.fit`` and ``UMAP.transform``.
        n_neighbors, min_dist, n_components, metric: Forwarded to ``UMAP``.
        c: Point colours for the scatter plot.
        title: Plot title.
        plot: When false, no figure is produced.
        cmap: Colormap for the 2-component matplotlib plot.
        use_plotly: Plot the first two components with plotly instead of
            matplotlib.
        **kwargs: Extra arguments for ``px.scatter`` (plotly branch only).

    Returns:
        ``(u, mapper)``: the embedding and the fitted UMAP object.
    """
    fit = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=42,
    )
    mapper = fit.fit(data)
    u = fit.transform(data)

    if plot:
        if use_plotly:
            fig = px.scatter(
                x=u[:, 0], y=u[:, 1], color=c, title=title, **kwargs
            )
            fig.update_layout(
                {
                    "plot_bgcolor": "rgba(0, 0, 0, 0)",
                    "paper_bgcolor": "rgba(0, 0, 0, 0)",
                }
            )
            fig.show()
        else:
            fig = plt.figure()
            ax = None
            scatter = None
            if n_components == 1:
                ax = fig.add_subplot(111)
                ax.scatter(u[:, 0], range(len(u)), c=c)
            elif n_components == 2:
                ax = fig.add_subplot(111)
                scatter = ax.scatter(u[:, 0], u[:, 1], c=c, label=c, cmap=cmap)
            elif n_components == 3:
                ax = fig.add_subplot(111, projection="3d")
                ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=c, s=100)
            plt.title(title, fontsize=18)
            # Only the 2-component plot keeps its scatter handle. Building the
            # legend unconditionally raised UnboundLocalError for 1 or 3
            # components.
            if scatter is not None:
                legend = ax.legend(*scatter.legend_elements())
                ax.add_artist(legend)

    return u, mapper
class UmapTransformer(object):
    """UMAP embedding approximated by a random-forest regressor.

    ``fit`` learns a UMAP embedding of the data and then trains a
    ``RandomForestRegressor`` to map the original features to that embedding;
    ``transform`` returns the regressor's prediction, not UMAP's own transform.
    """

    def __init__(self, n_components=2, embed_n_neighbors=10, target_metric='categorical'):
        self.umap = UMAP(n_components=n_components, n_neighbors=embed_n_neighbors,
                         target_metric=target_metric, transform_seed=1)
        self.rf = RandomForestRegressor(n_estimators=300)

    def fit(self, x, y=None):
        """Fit UMAP on ``x`` (optionally with targets ``y``), then the regressor.

        Returns ``self``.
        """
        self.umap.fit(x, y)
        x_low = self.umap.transform(x)
        self.rf.fit(x, x_low)
        return self

    def transform(self, x):
        """Return the regressor's predicted embedding for ``x``."""
        return self.rf.predict(x)

    def fit_transform(self, x, y=None):
        """Fit on ``x`` and return its predicted embedding.

        ``y`` is optional here as well, matching ``fit``.
        """
        return self.fit(x, y).transform(x)
def run_dimention_reduction(train_x, test_x, train_y):
    """
    Perform dimensionality reduction (PCA => UMAP) followed by standardization.

    Both models are fitted on the training frame only and applied to both
    frames. ``train_y`` is accepted but not used.

    Returns the reduced training and test frames with columns
    ``umap_1`` and ``umap_2``.
    """
    # First, reduce to half of the original number of columns with PCA.
    half_width = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=half_width).fit(train_x)
    pca_train = pd.DataFrame(pca.transform(train_x))
    pca_test = pd.DataFrame(pca.transform(test_x))

    # Then reduce to two dimensions with UMAP.
    reducer = UMAP(random_state=0)
    reducer.fit(pca_train)
    umap_train = pd.DataFrame(reducer.transform(pca_train))
    umap_test = pd.DataFrame(reducer.transform(pca_test))

    # Standardization.
    umap_train = cf.standardize(umap_train)
    umap_test = cf.standardize(umap_test)

    axis_names = ["umap_1", "umap_2"]
    umap_train.columns = list(axis_names)
    umap_test.columns = list(axis_names)

    return umap_train, umap_test
def predict_adata(model, adata, make_umap=True, umap_fit_n=10000, batch_size=1024):
    """Run ``model`` over ``adata`` and store predictions on the AnnData object.

    Written to the dataloader's AnnData (which is also returned):
      * ``obsm['X_emb']``: the embedding returned by ``predict_dl``
      * ``obsm['X_umap']``: UMAP of the embedding (only if ``make_umap``)
      * ``obsm['prediction_probs']``: the class probabilities
      * ``obs['y_pred']``, ``obs['predicted_cell_type']``,
        ``obs['predicted_cell_type_probability']``
      * one ``obs`` column ``'probability <class>'`` per model class

    Args:
        model: Object providing ``genes`` and ``classes``.
        adata: Input data handed to ``get_prediction_dataloader``.
        make_umap: Whether to compute the UMAP of the embedding.
        umap_fit_n: Maximum number of cells sampled to fit UMAP.
        batch_size: Batch size of the prediction dataloader.
    """
    # Honour the caller's batch size (it used to be hard-coded to 1024,
    # which is still the default).
    dl = get_prediction_dataloader(adata, model.genes, batch_size=batch_size)

    logging.info(f'starting prediction of {dl.dataset.adata.shape[0]} cells')
    emb, y_prob = predict_dl(dl, model)

    a = dl.dataset.adata
    a.obsm['X_emb'] = emb

    if make_umap:
        u = UMAP()
        # Fit on a random subset without replacement, then transform every cell.
        idxs = np.random.choice(np.arange(a.shape[0]),
                                size=min(umap_fit_n, a.shape[0]),
                                replace=False)
        u.fit(emb[idxs])
        a.obsm['X_umap'] = u.transform(emb)

    a.obsm['prediction_probs'] = y_prob
    a.obs['y_pred'] = [np.argmax(probs) for probs in y_prob]
    a.obs['predicted_cell_type_probability'] = [
        np.max(probs) for probs in y_prob
    ]
    a.obs['predicted_cell_type'] = [
        model.classes[np.argmax(probs)] for probs in y_prob
    ]

    prob_df = pd.DataFrame(data=a.obsm['prediction_probs'],
                           columns=model.classes,
                           index=a.obs.index.to_list())
    prob_df.columns = [f'probability {c}' for c in prob_df.columns]
    a.obs = pd.concat((a.obs, prob_df), axis=1)

    return a
def umap_reduce(docvecs, label, umap_model, use_nn, use_umap, **kwargs):
    """Optionally reduce document vectors with a (possibly new) UMAP model.

    When ``use_umap`` is falsy the vectors are returned as an array together
    with ``None``. Otherwise a falsy ``umap_model`` is replaced by a new
    supervised cosine UMAP fitted on ``docvecs``/``label``, with
    ``min(256, len(docvecs) - 2)`` components if ``use_nn`` else 1. Extra
    keyword arguments are ignored.

    Returns:
        ``(vectors, umap_model)``; the vectors are cast to ``float`` when
        ``use_nn`` is falsy.
    """
    if not use_umap:
        return np.array(docvecs), None

    if not umap_model:
        print("Train UMAP...")
        if use_nn:
            umap_n_components = min(256, len(docvecs) - 2)
        else:
            umap_n_components = 1
        umap_model = UMAP(
            metric="cosine",
            set_op_mix_ratio=1.0,
            n_components=umap_n_components,
            random_state=42,
            verbose=False,
        ).fit(docvecs, y=label)

    dim_reduced_vecs = umap_model.transform(docvecs)
    if use_nn:
        return dim_reduced_vecs, umap_model
    return dim_reduced_vecs.astype(float), umap_model
def run_dimention_reduction(train_x, test_x, train_y):
    """
    Perform dimensionality reduction (PCA => UMAP).

    Both models are fitted on the training frame only and applied to both
    frames. ``train_y`` is accepted but not used.

    Returns the reduced training and test frames with columns
    ``umap_1`` and ``umap_2``.
    """
    # First, reduce to half of the original number of columns with PCA.
    half_width = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=half_width).fit(train_x)
    pca_train = pd.DataFrame(pca.transform(train_x))
    pca_test = pd.DataFrame(pca.transform(test_x))

    # Then reduce to two dimensions with UMAP.
    reducer = UMAP(random_state=0)
    reducer.fit(pca_train)
    umap_train = pd.DataFrame(reducer.transform(pca_train))
    umap_test = pd.DataFrame(reducer.transform(pca_test))

    axis_names = ["umap_1", "umap_2"]
    umap_train.columns = list(axis_names)
    umap_test.columns = list(axis_names)

    return umap_train, umap_test
def get_similarity_matrix(data, n_components_umap=2, n_neighbors_knn=10, random_state=None):
    """
    Build a similarity (adjacency) matrix from a UMAP embedding of the data.

    The similarity matrix is derived in an unsupervised way (UMAP projection
    of the flattened data followed by a k-nearest-neighbors graph), but the
    same mechanism can also carry weakly-supervised information (e.g.,
    knowledge about diseased vs. non-diseased patients). If labels are
    available, the model could even be used to derive a latent representation
    with supervision. The similarity feature in MoE-Sim-VAE thus allows
    including prior knowledge about the best similarity measure on the data.

    Returns:
        ``(similarity, embedding)``: the dense float32 k-neighbors graph of
        the embedding, and the embedding itself.
    """
    n_samples = len(data)
    flat_data = data.reshape(n_samples, -1)

    reducer = UMAP(n_components=n_components_umap, random_state=random_state)
    reducer.fit(flat_data)
    embedding = reducer.transform(flat_data)

    knn = NearestNeighbors(n_neighbors=n_neighbors_knn)
    knn.fit(embedding)
    graph = knn.kneighbors_graph(embedding)
    similarity = graph.toarray().astype(np.float32)

    return similarity, embedding
class UMAPReducer(Reducer):
    """
    Simple wrapper for UMAP, used for API consistency.

    Unless overridden through ``kwargs``, the model uses ``n_neighbors=10``
    and ``min_dist=0.``.
    """

    def __init__(self, n_components=2, **kwargs):
        self.n_components = n_components
        # Start from the wrapper defaults and let caller-supplied values win.
        params = {'n_neighbors': 10, 'min_dist': 0.}
        params.update(kwargs)
        self.model = UMAP(n_components=n_components, **params)

    def fit(self, X):
        """Fit the UMAP model on ``X`` with Numba warnings silenced; returns ``self``."""
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            self.model.fit(X)
        return self

    def transform(self, X):
        """Return the UMAP transform of ``X`` with Numba warnings silenced."""
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            return self.model.transform(X)

    def decrement_components(self):
        """Lower ``n_components`` by one on both the wrapper and the model."""
        self.n_components -= 1
        self.model.n_components -= 1