def get_graph_elements(graph_, n_epochs):
    """
    Extract edge arrays and a per-edge sampling schedule from a UMAP graph.

    Parameters
    ----------
    graph_ : scipy.sparse matrix
        UMAP graph of edge probabilities. It is never modified: the work is
        done on a COO copy.
    n_epochs : int or None
        Number of training epochs. When None, 500 is used if the graph has
        at most 10000 rows and 200 otherwise.

    Returns
    -------
    graph : scipy.sparse.coo_matrix
        COO copy of the graph with duplicates summed and low-weight edges
        removed.
    epochs_per_sample : np.array
        Result of ``make_epochs_per_sample(graph.data, n_epochs)``.
    head : np.array
        Row index of every remaining edge (``graph.row``).
    tail : np.array
        Column index of every remaining edge (``graph.col``).
    weight : np.array
        Weight of every remaining edge (``graph.data``).
    n_vertices : int
        Number of columns of the graph (``graph.shape[1]``).
    """
    # TODO(review): should redundant edges be removed here?
    # graph_ = remove_redundant_edges(graph_)

    # copy=True matters when graph_ is already COO: tocoo() would otherwise
    # return graph_ itself and the in-place edits below would leak into the
    # caller's matrix.
    graph = graph_.tocoo(copy=True)
    # eliminate duplicate entries by summing them together
    graph.sum_duplicates()
    # number of vertices in dataset
    n_vertices = graph.shape[1]

    # pick the number of epochs from the size of the dataset
    if n_epochs is None:
        # smaller datasets can afford more epochs
        n_epochs = 500 if graph.shape[0] <= 10000 else 200

    # drop edges whose weight is below max_weight / n_epochs
    graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    graph.eliminate_zeros()

    # sampling schedule derived from the surviving edge weights
    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

    head = graph.row
    tail = graph.col
    weight = graph.data

    return graph, epochs_per_sample, head, tail, weight, n_vertices
def get_graph_elements(graph_, n_epochs):
    """
    Get the elements of a graph: edges, weights and epochs per edge.

    Parameters
    ----------
    graph_ : scipy.sparse matrix
        umap graph of probabilities. The input is left untouched; all
        in-place edits are applied to a COO copy.
    n_epochs : int or None
        maximum number of epochs per edge. When None, it defaults to 500
        for graphs with at most 10000 rows and to 200 otherwise.

    Returns
    -------
    graph : scipy.sparse.coo_matrix
        umap graph in COO format, duplicates summed and edges with weight
        below ``max_weight / n_epochs`` removed
    epochs_per_sample : np.array
        output of ``make_epochs_per_sample`` for the remaining edge weights
    head : np.array
        edge head (row indices)
    tail : np.array
        edge tail (column indices)
    weight : np.array
        edge weight
    n_vertices : int
        number of vertices in graph (``graph.shape[1]``)
    """
    # TODO(review): should redundant edges be removed here?
    # graph_ = remove_redundant_edges(graph_)

    # Force a copy: for an input that is already COO, tocoo() returns the
    # very same object, and the mutations below would then alter the
    # caller's graph.
    graph = graph_.tocoo(copy=True)
    # eliminate duplicate entries by summing them together
    graph.sum_duplicates()
    # number of vertices in dataset
    n_vertices = graph.shape[1]

    # get the number of epochs based on the size of the dataset
    if n_epochs is None:
        # For smaller datasets we can use more epochs
        if graph.shape[0] <= 10000:
            n_epochs = 500
        else:
            n_epochs = 200

    # remove elements with very low probability
    cutoff = graph.data.max() / float(n_epochs)
    graph.data[graph.data < cutoff] = 0.0
    graph.eliminate_zeros()

    # get epochs per sample based upon edge probability
    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

    head = graph.row
    tail = graph.col
    weight = graph.data

    return graph, epochs_per_sample, head, tail, weight, n_vertices
def simplicial_set_embedding(g, embedding, n_epochs, a, b, random_seed, gamma,
                             initial_alpha, negative_sample_rate, parallel,
                             nthreads):
    """
    Optimise ``embedding`` against graph ``g`` with UMAP's euclidean layout.

    Parameters
    ----------
    g : sparse matrix exposing ``data``, ``row``, ``col`` and ``shape``
        Graph of edge weights (presumably a COO matrix; confirm against the
        caller). It is modified in place: entries below
        ``g.data.max() / n_epochs`` are zeroed and eliminated.
    embedding : array
        Initial coordinates, used as both head and tail embedding.
    n_epochs : int
        Number of optimisation epochs.
    a, b, gamma, initial_alpha, negative_sample_rate
        Forwarded unchanged to ``optimize_layout_euclidean``.
    random_seed
        Seed passed to ``sklearn.utils.check_random_state``.
    parallel : bool
        Forwarded to ``optimize_layout_euclidean``.
    nthreads : int
        Thread limit applied to Numba and, via ``threadpool_limits``, to the
        native thread pools.

    Returns
    -------
    The value returned by ``optimize_layout_euclidean``.
    """
    import numba
    from threadpoolctl import threadpool_limits
    from umap.umap_ import make_epochs_per_sample
    from umap.layouts import optimize_layout_euclidean
    from sklearn.utils import check_random_state
    import numpy as np

    # Prune edges that are too weak relative to the strongest edge.
    g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
    g.eliminate_zeros()
    epochs_per_sample = make_epochs_per_sample(g.data, n_epochs)
    head = g.row
    tail = g.col

    int32_info = np.iinfo(np.int32)
    rng_state = check_random_state(random_seed).randint(
        int32_info.min + 1, int32_info.max - 1, 3).astype(np.int64)

    # threadpool_limits does not work well with numba, so numba's own
    # set_num_threads is used as well. The value is set on every call,
    # clamped to the maximum numba allows. Setting it only when nthreads is
    # below the maximum would leave a lower limit from an earlier call in
    # effect.
    numba.set_num_threads(min(nthreads, numba.config.NUMBA_NUM_THREADS))

    with threadpool_limits(limits=nthreads):
        embedding = optimize_layout_euclidean(
            embedding, embedding, head, tail, n_epochs, g.shape[1],
            epochs_per_sample, a, b, rng_state, gamma, initial_alpha,
            negative_sample_rate, parallel=parallel, verbose=True)
    return embedding
def transform(self):
    """
    Run the euclidean layout optimisation on ``self.data`` and return it.

    ``self.data`` is stored as ``self.graph_`` and converted to COO. The
    layout starts from ``self.init`` (used as both head and tail embedding)
    and is optimised for ``self.n_iter`` epochs. A ``TSNESaveEmbedding``
    built from ``self.outdir`` is handed to the optimiser together with
    ``self.save_freq``.

    Returns
    -------
    The optimiser's result, which is also stored on ``self.data_``.
    """
    self.graph_ = self.data
    # The schedule is computed from the data array before the COO
    # conversion, exactly as the stored graph presents it.
    edge_schedule = umaplib.make_epochs_per_sample(self.graph_.data,
                                                   self.n_iter)

    coo_graph = self.graph_.tocoo()
    self.graph_ = coo_graph

    checkpoint_saver = TSNESaveEmbedding(self.outdir)

    # Three int64 seeds drawn from (almost) the full int32 range.
    seeds = self.random_state.randint(-(2**31) + 1, 2**31 - 1, 3)
    rng_state = seeds.astype(np.int64)

    layout_options = dict(
        gamma=float(self.gamma),
        initial_alpha=self.learning_rate,
        eps=self.eps,
        negative_sample_rate=self.nu,
        parallel=self.parallel,
        verbose=False,
        saver=checkpoint_saver,
        save_freq=self.save_freq,
    )
    self.data_ = optimize_layout_euclidean(
        self.init,
        self.init,
        coo_graph.row,
        coo_graph.col,
        self.n_iter,
        coo_graph.shape[1],
        edge_schedule,
        self.a,
        self.b,
        rng_state,
        **layout_options,
    )
    return self.data_
def update(self, X, y=None, **fit_params):
    """
    Add one more dataset to an already fitted aligned model.

    A new ``UMAP`` mapper is fitted on ``X``, an initial embedding is derived
    from the last existing embedding via ``init_from_existing``, and all
    embeddings are then re-optimised jointly with
    ``optimize_layout_aligned_euclidean``.

    Parameters
    ----------
    X : array-like
        New data; validated with ``check_array``.
    y : ignored
        Not used by this method.
    **fit_params
        Must contain ``"relations"``, the relation dict linking the previous
        dataset to ``X``. The whole mapping is also passed to
        ``set_aligned_params`` together with the current number of models.

    Raises
    ------
    ValueError
        If ``"relations"`` is missing from ``fit_params``.
    """
    if "relations" not in fit_params:
        raise ValueError(
            "Aligned UMAP requires relations between data to be "
            "specified")

    new_dict_relations = fit_params["relations"]
    X = check_array(X)

    self.__dict__ = set_aligned_params(fit_params, self.__dict__,
                                       self.n_models_)

    # Zero-based index of the model being added. It must be taken before
    # n_models_ is incremented: fit() indexes per-model parameters with
    # 0 .. n_models_ - 1, so the new model's slot is the old n_models_.
    model_index = self.n_models_

    new_mapper = UMAP(
        n_neighbors=get_nth_item_or_val(self.n_neighbors, model_index),
        min_dist=get_nth_item_or_val(self.min_dist, model_index),
        n_epochs=get_nth_item_or_val(self.n_epochs, model_index),
        repulsion_strength=get_nth_item_or_val(self.repulsion_strength,
                                               model_index),
        learning_rate=get_nth_item_or_val(self.learning_rate, model_index),
        spread=get_nth_item_or_val(self.spread, model_index),
        negative_sample_rate=get_nth_item_or_val(self.negative_sample_rate,
                                                 model_index),
        local_connectivity=get_nth_item_or_val(self.local_connectivity,
                                               model_index),
        set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio,
                                             model_index),
        unique=get_nth_item_or_val(self.unique, model_index),
        # Kept identical to fit(), which builds every mapper with the
        # shared n_components.
        n_components=self.n_components,
    ).fit(X)

    self.n_models_ += 1
    self.mappers_ += [new_mapper]

    # TODO: We can likely make this more efficient and not recompute each time
    # NOTE(review): fit() stores the relations as given while this appends
    # the inverted dict; confirm which orientation expand_relations expects.
    self.dict_relations_ += [invert_dict(new_dict_relations)]

    if self.n_epochs is None:
        n_epochs = 200
    else:
        n_epochs = self.n_epochs

    indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
    indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
    heads = numba.typed.List.empty_list(numba.types.int32[::1])
    tails = numba.typed.List.empty_list(numba.types.int32[::1])
    epochs_per_samples = numba.typed.List.empty_list(
        numba.types.float64[::1])

    last_index = len(self.mappers_) - 1
    for i, mapper in enumerate(self.mappers_):
        indptr_list.append(mapper.graph_.indptr)
        indices_list.append(mapper.graph_.indices)

        # Convert once per mapper instead of once per extracted array.
        coo_graph = mapper.graph_.tocoo()
        heads.append(coo_graph.row)
        tails.append(coo_graph.col)

        if i == last_index:
            # Only the newly added mapper gets a real sampling schedule.
            epochs_per_samples.append(
                make_epochs_per_sample(coo_graph.data, n_epochs))
        else:
            # Older mappers get a constant n_epochs + 1.
            # NOTE(review): this array is sized by number of points, not
            # number of edges; confirm that is what the optimiser expects.
            epochs_per_samples.append(
                np.full(mapper.embedding_.shape[0],
                        n_epochs + 1,
                        dtype=np.float64))

    new_relations = expand_relations(self.dict_relations_)
    new_regularisation_weights = build_neighborhood_similarities(
        indptr_list,
        indices_list,
        new_relations,
    )

    new_embedding = init_from_existing(self.embeddings_[-1],
                                       new_mapper.graph_, new_dict_relations)

    random_state = check_random_state(self.random_state)
    rng_state = random_state.randint(INT32_MIN, INT32_MAX,
                                     3).astype(np.int64)

    self.embeddings_.append(new_embedding)

    self.embeddings_ = optimize_layout_aligned_euclidean(
        self.embeddings_,
        self.embeddings_,
        heads,
        tails,
        n_epochs,
        epochs_per_samples,
        new_regularisation_weights,
        new_relations,
        rng_state,
        lambda_=self.alignment_regularisation,
    )
def fit(self, X, y=None, **fit_params):
    """
    Fit one UMAP mapper per dataset and jointly optimise aligned embeddings.

    Parameters
    ----------
    X : list, tuple or np.ndarray
        Sequence of datasets; ``X[n]`` is fitted by the n-th mapper.
    y : ignored
        Not used by this method.
    **fit_params
        Must contain ``"relations"``: a list or tuple with exactly
        ``len(X) - 1`` entries. May contain ``"window_size"``, which
        overrides ``self.alignment_window_size``.

    Returns
    -------
    self
        With ``mappers_``, ``n_models_``, ``dict_relations_`` and
        ``embeddings_`` set.

    Raises
    ------
    ValueError
        If ``"relations"`` is missing, or if ``n_components`` is a list,
        tuple or array.
    AssertionError
        If ``X`` or the relations have the wrong type or mismatched lengths.
    """
    if "relations" not in fit_params:
        raise ValueError(
            "Aligned UMAP requires relations between data to be "
            "specified")

    self.dict_relations_ = fit_params["relations"]

    # Kept as assertions so callers still see AssertionError on bad input.
    assert type(self.dict_relations_) in (list, tuple)
    assert type(X) in (list, tuple, np.ndarray)
    assert (len(X) - 1) == (len(self.dict_relations_))

    # We need n_components to be constant or this won't work
    if type(self.n_components) in (list, tuple, np.ndarray):
        raise ValueError(
            "n_components must be a single integer, and cannot vary")

    self.n_models_ = len(X)

    self.mappers_ = [
        UMAP(
            n_neighbors=get_nth_item_or_val(self.n_neighbors, n),
            min_dist=get_nth_item_or_val(self.min_dist, n),
            n_epochs=get_nth_item_or_val(self.n_epochs, n),
            repulsion_strength=get_nth_item_or_val(self.repulsion_strength,
                                                   n),
            learning_rate=get_nth_item_or_val(self.learning_rate, n),
            spread=get_nth_item_or_val(self.spread, n),
            negative_sample_rate=get_nth_item_or_val(
                self.negative_sample_rate, n),
            local_connectivity=get_nth_item_or_val(self.local_connectivity,
                                                   n),
            set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio, n),
            unique=get_nth_item_or_val(self.unique, n),
            n_components=self.n_components,
        ).fit(X[n]) for n in range(self.n_models_)
    ]

    if self.n_epochs is None:
        n_epochs = 200
    else:
        n_epochs = self.n_epochs

    window_size = fit_params.get("window_size", self.alignment_window_size)
    relations = expand_relations(self.dict_relations_, window_size)

    indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
    indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
    heads = numba.typed.List.empty_list(numba.types.int32[::1])
    tails = numba.typed.List.empty_list(numba.types.int32[::1])
    epochs_per_samples = numba.typed.List.empty_list(
        numba.types.float64[::1])

    for mapper in self.mappers_:
        indptr_list.append(mapper.graph_.indptr)
        indices_list.append(mapper.graph_.indices)

        # One COO conversion per mapper; row, col and data all come from it.
        coo_graph = mapper.graph_.tocoo()
        heads.append(coo_graph.row)
        tails.append(coo_graph.col)
        epochs_per_samples.append(
            make_epochs_per_sample(coo_graph.data, n_epochs))

    regularisation_weights = build_neighborhood_similarities(
        indptr_list,
        indices_list,
        relations,
    )

    def _scaled_spectral_init(mapper):
        # Spectral layout rescaled so its largest absolute coordinate is 10,
        # returned as a C-ordered float32 array.
        # NOTE(review): uses the global np.random rather than
        # self.random_state, so this step is not governed by the seed.
        init = spectral_layout(
            mapper._raw_data,
            mapper.graph_,
            self.n_components,
            np.random,
        )
        expansion = 10.0 / np.abs(init).max()
        return (init * expansion).astype(np.float32, order="C")

    embeddings = numba.typed.List.empty_list(numba.types.float32[:, ::1])
    embeddings.append(_scaled_spectral_init(self.mappers_[0]))

    for i in range(1, self.n_models_):
        next_embedding = _scaled_spectral_init(self.mappers_[i])

        # Entries >= 0 in the anchor row are treated as valid anchors: the
        # values go in as left anchors and their positions as right anchors.
        anchor_data = relations[i][window_size - 1]
        left_anchors = anchor_data[anchor_data >= 0]
        right_anchors = np.where(anchor_data >= 0)[0]
        embeddings.append(
            procrustes_align(
                embeddings[-1],
                next_embedding,
                np.vstack([left_anchors, right_anchors]),
            ))

    random_state = check_random_state(self.random_state)
    rng_state = random_state.randint(INT32_MIN, INT32_MAX,
                                     3).astype(np.int64)

    self.embeddings_ = optimize_layout_aligned_euclidean(
        embeddings,
        embeddings,
        heads,
        tails,
        n_epochs,
        epochs_per_samples,
        regularisation_weights,
        relations,
        rng_state,
        lambda_=self.alignment_regularisation,
    )

    return self
def simplicial_set_embedding(
    g,
    embedding,
    n_epochs,
    a,
    b,
    random_seed,
    gamma,
    initial_alpha,
    negative_sample_rate,
    densmap_kwds,
    parallel,
    nthreads,
    verbose,
):
    """
    Optimise ``embedding`` against graph ``g``, optionally with densMAP.

    Parameters
    ----------
    g : sparse matrix exposing ``data``, ``row``, ``col`` and ``shape``
        Graph of edge weights (presumably a COO matrix; confirm against the
        caller).
    embedding : array
        Initial coordinates, used as both head and tail embedding.
    n_epochs : int
        Number of optimisation epochs.
    a, b, gamma, initial_alpha, negative_sample_rate
        Forwarded unchanged to ``optimize_layout_euclidean``.
    random_seed
        Seed passed to ``sklearn.utils.check_random_state``.
    densmap_kwds : dict
        An empty dict disables densMAP. Otherwise it must contain
        ``"knn_dists"`` and is updated in place with ``"mu_sum"``, ``"R"``
        and ``"mu"`` before being forwarded to the optimiser.
    parallel : bool
        Forwarded to ``optimize_layout_euclidean``.
    nthreads : int
        Thread limit applied to Numba and, via ``threadpool_limits``, to the
        native thread pools.
    verbose : bool
        Forwarded to ``optimize_layout_euclidean``.

    Returns
    -------
    The value returned by ``optimize_layout_euclidean``.
    """
    import numba
    from threadpoolctl import threadpool_limits
    from umap.umap_ import make_epochs_per_sample
    from umap.layouts import optimize_layout_euclidean
    from sklearn.utils import check_random_state
    import numpy as np
    from .utils import tqdm_params

    # Pruning of low-weight edges is disabled in this version:
    # g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
    # g.eliminate_zeros()
    epochs_per_sample = make_epochs_per_sample(g.data, n_epochs)
    logger.trace("calculated epochs_per_sample")

    int32_info = np.iinfo(np.int32)
    rng_state = (check_random_state(random_seed).randint(
        int32_info.min + 1, int32_info.max - 1, 3).astype(np.int64))

    # threadpool_limits does not work well with numba, so numba's own
    # set_num_threads is used as well. The value is set on every call,
    # clamped to the maximum numba allows. Setting it only when nthreads is
    # below the maximum would leave a lower limit from an earlier call in
    # effect.
    numba.set_num_threads(min(nthreads, numba.config.NUMBA_NUM_THREADS))

    if densmap_kwds != {}:
        with threadpool_limits(limits=nthreads):
            mu_sum, R = calc_dens_map_params(g, densmap_kwds["knn_dists"])
        # The caller's dict is extended in place with the derived values.
        densmap_kwds["mu_sum"] = mu_sum
        densmap_kwds["R"] = R
        densmap_kwds["mu"] = g.data
        densmap = True
        logger.trace("calculated densmap params")
    else:
        densmap = False

    # tqdm will be activated if https://github.com/lmcinnes/umap/pull/739
    # is merged and when it is released
    # Work on a copy so the shared tqdm_params from .utils is not altered.
    tqdm_params = dict(tqdm_params)
    tqdm_params["desc"] = "Training UMAP"

    with threadpool_limits(limits=nthreads):
        embedding = optimize_layout_euclidean(
            head_embedding=embedding,
            tail_embedding=embedding,
            head=g.row,
            tail=g.col,
            n_epochs=n_epochs,
            n_vertices=g.shape[1],
            epochs_per_sample=epochs_per_sample,
            a=a,
            b=b,
            rng_state=rng_state,
            gamma=gamma,
            initial_alpha=initial_alpha,
            negative_sample_rate=negative_sample_rate,
            parallel=parallel,
            verbose=verbose,
            densmap=densmap,
            densmap_kwds=densmap_kwds,
            tqdm_kwds=tqdm_params,
            move_other=True,
        )
    return embedding