def __init__(
    self,
    data,
    sigma,
    k=30,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
):
    self.n_samples = n_samples = data.shape[0]
    self.verbose = verbose

    if k >= self.n_samples:
        raise ValueError(
            "`k` (%d) cannot be larger than N-1 (%d)." % (k, self.n_samples)
        )

    with utils.Timer(
        f"Finding {k} nearest neighbors using {method} search with {metric} metric...",
        self.verbose,
    ):
        knn_index, neighbors, distances = build_knn_index(
            data, method, k, metric, metric_params, n_jobs, random_state
        )

    self.knn_index = knn_index

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        # Compute asymmetric pairwise input similarities
        conditional_P = np.exp(-(distances ** 2) / (2 * sigma ** 2))
        conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

        P = sp.csr_matrix(
            (
                conditional_P.ravel(),
                neighbors.ravel(),
                range(0, n_samples * k + 1, k),
            ),
            shape=(n_samples, n_samples),
        )

        # Symmetrize the probability matrix
        if symmetrize:
            P = (P + P.T) / 2

        # Convert weights to probabilities
        P /= np.sum(P)

    self.sigma = sigma
    self.k = k
    self.P = P
    self.n_jobs = n_jobs
def __init__(
    self,
    data,
    perplexity=30,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
):
    self.n_samples = data.shape[0]
    self.perplexity = self.check_perplexity(perplexity)
    self.verbose = verbose

    k_neighbors = min(self.n_samples - 1, int(3 * self.perplexity))
    self.knn_index, self.__neighbors, self.__distances = build_knn_index(
        data, method, k_neighbors, metric, metric_params, n_jobs,
        random_state, verbose,
    )

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        self.P = joint_probabilities_nn(
            self.__neighbors,
            self.__distances,
            [self.perplexity],
            symmetrize=symmetrize,
            n_jobs=n_jobs,
        )

    self.n_jobs = n_jobs
def build(self):
    data, k = self.data, self.k

    timer = utils.Timer(
        f"Finding {k} nearest neighbors using Annoy approximate search with "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    from openTSNE.dependencies.annoy import AnnoyIndex

    N = data.shape[0]

    annoy_metric = self.metric
    annoy_aliases = {
        "cosine": "angular",
        "l1": "manhattan",
        "l2": "euclidean",
        "taxicab": "manhattan",
    }
    if annoy_metric in annoy_aliases:
        annoy_metric = annoy_aliases[annoy_metric]

    self.index = AnnoyIndex(data.shape[1], annoy_metric)

    random_state = check_random_state(self.random_state)
    self.index.set_seed(random_state.randint(np.iinfo(np.int32).max))

    for i in range(N):
        self.index.add_item(i, data[i])

    # Number of trees. FIt-SNE uses 50 by default.
    self.index.build(50, n_jobs=self.n_jobs)

    # Return the nearest neighbors in the training set
    distances = np.zeros((N, k))
    indices = np.zeros((N, k)).astype(int)

    def getnns(i):
        # Annoy returns the query point itself as the first element
        indices_i, distances_i = self.index.get_nns_by_item(
            i, k + 1, include_distances=True
        )
        indices[i] = indices_i[1:]
        distances[i] = distances_i[1:]

    if self.n_jobs == 1:
        for i in range(N):
            getnns(i)
    else:
        from joblib import Parallel, delayed

        Parallel(n_jobs=self.n_jobs, require="sharedmem")(
            delayed(getnns)(i) for i in range(N)
        )

    timer.__exit__()

    return indices, distances
def query(self, query, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors in existing embedding using Annoy "
        f"approximate search...",
        self.verbose,
    )
    timer.__enter__()

    N = query.shape[0]
    distances = np.zeros((N, k))
    indices = np.zeros((N, k)).astype(int)

    def getnns(i):
        indices[i], distances[i] = self.index.get_nns_by_vector(
            query[i], k, include_distances=True
        )

    if self.n_jobs == 1:
        for i in range(N):
            getnns(i)
    else:
        from joblib import Parallel, delayed

        Parallel(n_jobs=self.n_jobs, require="sharedmem")(
            delayed(getnns)(i) for i in range(N)
        )

    timer.__exit__()

    return indices, distances
def __init__(
    self,
    data=None,
    perplexities=None,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
    knn_index=None,
):
    # Perplexities must be specified, but have their default set to None, so
    # that the parameter order makes more sense
    if perplexities is None:
        raise ValueError("`perplexities` must be specified!")

    # This can't work if neither data nor the knn index are specified
    if data is None and knn_index is None:
        raise ValueError(
            "At least one of the parameters `data` or `knn_index` must be specified!"
        )
    # This can't work if both data and the knn index are specified
    if data is not None and knn_index is not None:
        raise ValueError(
            "Both `data` and `knn_index` were specified! Please pass only one."
        )

    # Find the nearest neighbors
    if knn_index is None:
        # We will compute the nearest neighbors to the max value of perplexity;
        # smaller values can just use indexing to truncate unneeded neighbors
        n_samples = data.shape[0]
        perplexities = self.check_perplexities(perplexities, n_samples)
        max_perplexity = np.max(perplexities)
        k_neighbors = min(n_samples - 1, int(3 * max_perplexity))

        self.knn_index = get_knn_index(
            data, method, k_neighbors, metric, metric_params, n_jobs,
            random_state, verbose,
        )
    else:
        self.knn_index = knn_index
        log.info("KNN index provided. Ignoring KNN-related parameters.")

    self.__neighbors, self.__distances = self.knn_index.build()

    with utils.Timer("Calculating affinity matrix...", verbose):
        self.P = self._calculate_P(
            self.__neighbors,
            self.__distances,
            perplexities,
            symmetrize=symmetrize,
            n_jobs=n_jobs,
        )

    self.perplexities = perplexities
    self.n_jobs = n_jobs
    self.verbose = verbose
def weighted_mean(X, embedding, neighbors, distances, verbose=False):
    """Initialize points onto an existing embedding by placing them in the
    weighted mean position of their nearest neighbors on the reference embedding.

    Parameters
    ----------
    X: np.ndarray
        The new data points to be placed onto the reference embedding.
    embedding: TSNEEmbedding
        The reference embedding.
    neighbors: np.ndarray
        The indices of each new point's nearest neighbors among the reference points.
    distances: np.ndarray
        The distances of each new point to its nearest neighbors.
    verbose: bool

    Returns
    -------
    np.ndarray

    """
    n_samples = X.shape[0]
    n_components = embedding.shape[1]

    with utils.Timer("Calculating weighted-mean initialization...", verbose):
        partial_embedding = np.zeros((n_samples, n_components), order="C")
        for i in range(n_samples):
            partial_embedding[i] = np.average(
                embedding[neighbors[i]], axis=0, weights=distances[i]
            )

    return partial_embedding
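# --- Illustrative sketch (not part of the library) ---
# A self-contained NumPy rendition of the weighted-mean placement above, with
# made-up data and kNN results; `weighted_mean` itself depends on openTSNE's
# `utils` module, so this spells out just the core computation.
import numpy as np

rng = np.random.default_rng(0)
reference = rng.normal(size=(100, 2))        # existing 2D embedding
nbrs = rng.integers(0, 100, size=(5, 3))     # 3 reference neighbors per new point
dists = rng.random(size=(5, 3)) + 0.1        # distances to those neighbors

# Each new point lands at the weighted mean of its neighbors' positions
placed = np.stack([
    np.average(reference[nbrs[i]], axis=0, weights=dists[i])
    for i in range(nbrs.shape[0])
])
assert placed.shape == (5, 2)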
def __init__(
    self,
    data,
    perplexities,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
):
    self.n_samples = data.shape[0]
    self.verbose = verbose

    # We will compute the nearest neighbors to the max value of perplexity;
    # smaller values can just use indexing to truncate unneeded neighbors
    perplexities = self.check_perplexities(perplexities)
    max_perplexity = np.max(perplexities)
    k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

    with utils.Timer(
        f"Finding {k_neighbors} nearest neighbors using {method} search "
        f"with {metric} metric...",
        self.verbose,
    ):
        self.knn_index, self.__neighbors, self.__distances = build_knn_index(
            data, method, k_neighbors, metric, metric_params, n_jobs, random_state
        )

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        self.P = self._calculate_P(
            self.__neighbors,
            self.__distances,
            perplexities,
            symmetrize=symmetrize,
            n_jobs=n_jobs,
        )

    self.perplexities = perplexities
    self.n_jobs = n_jobs
def set_perplexity(self, new_perplexity):
    """Change the perplexity of the affinity matrix.

    Note that we only allow setting the perplexity to a value not larger
    than the number of neighbors used for the original perplexity. This
    restriction exists because setting a higher perplexity value requires
    recomputing all the nearest neighbors, which can take a long time. To
    avoid potential confusion as to why execution time is slow, this is not
    allowed. If you would like to increase the perplexity above that value,
    simply create a new instance.

    Parameters
    ----------
    new_perplexity: float
        The new perplexity.

    """
    # If the value hasn't changed, there's nothing to do
    if new_perplexity == self.perplexity:
        return
    # Verify that the perplexity isn't negative
    effective_perplexity = self.check_perplexity(new_perplexity, np.inf)
    # Verify that the perplexity isn't too large for the kNN graph
    if effective_perplexity > self.__neighbors.shape[1]:
        raise RuntimeError(
            "The desired perplexity `%.2f` is larger than the kNN graph "
            "allows. This would need to recompute the nearest neighbors, "
            "which is not efficient. Please create a new `%s` instance "
            "with the increased perplexity."
            % (effective_perplexity, self.__class__.__name__)
        )
    # Warn if the perplexity is larger than the heuristic
    if 3 * effective_perplexity > self.__neighbors.shape[1]:
        log.warning(
            "The new perplexity is quite close to the computed number of "
            "nearest neighbors. The results may be unexpected. Consider "
            "creating a new `%s` instance with the increased perplexity."
            % self.__class__.__name__
        )

    # Recompute the affinity matrix
    self.perplexity = new_perplexity
    self.effective_perplexity_ = effective_perplexity
    k_neighbors = int(3 * new_perplexity)

    with utils.Timer(
        "Perplexity changed. Recomputing affinity matrix...", self.verbose
    ):
        self.P = joint_probabilities_nn(
            self.__neighbors[:, :k_neighbors],
            self.__distances[:, :k_neighbors],
            [self.effective_perplexity_],
            symmetrize=self.symmetrize,
            n_jobs=self.n_jobs,
        )
def query(self, query, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors in existing embedding using NN Descent "
        f"approximate search...",
        self.verbose,
    )
    timer.__enter__()

    indices, distances = self.index.query(query, k=k)

    timer.__exit__()

    return indices, distances
def build(self):
    data, k = self.data, self.k

    timer = utils.Timer(
        f"Finding {k} nearest neighbors using exact search with "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    if self.metric == "cosine":
        # The nearest neighbor ranking for cosine distance is the same as
        # for euclidean distance on normalized data
        effective_metric = "euclidean"
        effective_data = data.copy()
        effective_data = (
            effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
        )
        # In order to properly compute cosine distances when querying the
        # index, we need to store the original data
        self.__data = data
    else:
        effective_metric = self.metric
        effective_data = data

    self.index = neighbors.NearestNeighbors(
        algorithm="auto",
        metric=effective_metric,
        metric_params=self.metric_params,
        n_jobs=self.n_jobs,
    )
    self.index.fit(effective_data)

    # Return the nearest neighbors in the training set
    distances, indices = self.index.kneighbors(n_neighbors=k)

    # If using cosine distance, the computed distances will be wrong and
    # need to be recomputed
    if self.metric == "cosine":
        distances = np.vstack(
            [
                cdist(np.atleast_2d(x), data[idx], metric="cosine")
                for x, idx in zip(data, indices)
            ]
        )

    timer.__exit__()

    return indices, distances
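# --- Illustrative sketch (not part of the library) ---
# A quick numerical check of the trick used in build() above: the nearest
# neighbor ranking under cosine distance matches the euclidean ranking on
# row-normalized data (random data here, so ties are essentially impossible).
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(42)
X = rng.normal(size=(50, 8))
X_normed = X / np.linalg.norm(X, axis=1)[:, None]

cosine_order = np.argsort(cdist(X[:1], X, metric="cosine")[0])
euclidean_order = np.argsort(cdist(X_normed[:1], X_normed, metric="euclidean")[0])
assert np.array_equal(cosine_order, euclidean_order)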
def pca(X, n_components=2, svd_solver="auto", random_state=None, verbose=False):
    """Initialize an embedding using the top principal components.

    Parameters
    ----------
    X: np.ndarray
        The data matrix.
    n_components: int
        The dimension of the embedding space.
    svd_solver: str
        See sklearn.decomposition.PCA documentation.
    random_state: Union[int, RandomState]
        If the value is an int, random_state is the seed used by the random
        number generator. If the value is a RandomState instance, then it will
        be used as the random number generator. If the value is None, the
        random number generator is the RandomState instance used by
        `np.random`.
    verbose: bool

    Returns
    -------
    initialization: np.ndarray

    """
    timer = utils.Timer("Calculating PCA-based initialization...", verbose)
    timer.__enter__()

    pca_ = PCA(
        n_components=n_components, svd_solver=svd_solver, random_state=random_state
    )
    embedding = pca_.fit_transform(X)

    # The PCA embedding may have high variance, which leads to poor convergence,
    # so we rescale it such that the standard deviation of the first component
    # is 0.0001
    normalization = np.std(embedding[:, 0])
    normalization /= 0.0001
    embedding /= normalization

    timer.__exit__()

    return np.ascontiguousarray(embedding)
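# --- Illustrative sketch (not part of the library) ---
# The rescaling convention used by pca() above, spelled out on toy data:
# after PCA, the embedding is divided so the first axis has standard
# deviation 0.0001, a conventional scale for t-SNE initializations.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(0).normal(size=(200, 50))
emb = PCA(n_components=2, random_state=0).fit_transform(X)
emb /= np.std(emb[:, 0]) / 0.0001
print(np.std(emb[:, 0]))  # -> 0.0001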
def build(self):
    data, k = self.data, self.k

    timer = utils.Timer(
        f"Finding {k} nearest neighbors using HNSWlib approximate search with "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    from hnswlib import Index

    hnsw_space = {
        "cosine": "cosine",
        "dot": "ip",
        "euclidean": "l2",
        "ip": "ip",
        "l2": "l2",
    }[self.metric]

    random_state = check_random_state(self.random_state)
    random_seed = random_state.randint(np.iinfo(np.int32).max)

    self.index = Index(space=hnsw_space, dim=data.shape[1])

    # Initialize HNSW index
    self.index.init_index(
        max_elements=data.shape[0],
        ef_construction=200,
        M=16,
        random_seed=random_seed,
    )

    # Build index tree from data
    self.index.add_items(data, num_threads=self.n_jobs)

    # Set ef parameter for (ideal) precision/recall
    self.index.set_ef(min(2 * k, self.index.get_current_count()))

    # Query for kNN
    indices, distances = self.index.knn_query(data, k=k + 1, num_threads=self.n_jobs)

    # Stop timer
    timer.__exit__()

    # Return indices and distances, skipping the first entry, which is always
    # the query point itself
    return indices[:, 1:], distances[:, 1:]
def median(embedding, neighbors, verbose=False):
    """Initialize points onto an existing embedding by placing them in the
    median position of their nearest neighbors on the reference embedding.

    Parameters
    ----------
    embedding: TSNEEmbedding
        The reference embedding.
    neighbors: np.ndarray
        The indices of each new point's nearest neighbors among the reference points.
    verbose: bool

    Returns
    -------
    np.ndarray

    """
    with utils.Timer("Calculating median initialization...", verbose):
        embedding = np.median(embedding[neighbors], axis=1)
    return np.ascontiguousarray(embedding)
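# --- Illustrative sketch (not part of the library) ---
# The median placement above on toy data; analogous to the weighted-mean
# sketch earlier, but without distance weighting.
import numpy as np

rng = np.random.default_rng(1)
reference = rng.normal(size=(100, 2))
nbrs = rng.integers(0, 100, size=(5, 3))
placed = np.median(reference[nbrs], axis=1)
assert placed.shape == (5, 2)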
def set_perplexity(self, new_perplexity):
    """Change the perplexity of the affinity matrix.

    Note that we only allow lowering the perplexity or restoring it to its
    original value. This restriction exists because setting a higher
    perplexity value requires recomputing all the nearest neighbors, which
    can take a long time. To avoid potential confusion as to why execution
    time is slow, this is not allowed. If you would like to increase the
    perplexity above the initial value, simply create a new instance.

    Parameters
    ----------
    new_perplexity: float
        The new perplexity.

    """
    # If the value hasn't changed, there's nothing to do
    if new_perplexity == self.perplexity:
        return
    # Verify that the perplexity isn't too large
    new_perplexity = self.check_perplexity(new_perplexity)
    # Recompute the affinity matrix
    k_neighbors = min(self.n_samples - 1, int(3 * new_perplexity))
    if k_neighbors > self.__neighbors.shape[1]:
        raise RuntimeError(
            "The desired perplexity `%.2f` is larger than the initial one "
            "used. This would need to recompute the nearest neighbors, "
            "which is not efficient. Please create a new `%s` instance "
            "with the increased perplexity."
            % (new_perplexity, self.__class__.__name__)
        )

    self.perplexity = new_perplexity

    with utils.Timer(
        "Perplexity changed. Recomputing affinity matrix...", self.verbose
    ):
        self.P = joint_probabilities_nn(
            self.__neighbors[:, :k_neighbors],
            self.__distances[:, :k_neighbors],
            [self.perplexity],
            symmetrize=True,
            n_jobs=self.n_jobs,
        )
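# --- Illustrative sketch (not part of the library) ---
# A hypothetical workflow for set_perplexity() above, using openTSNE's public
# API: build affinities once with a generous perplexity, then lower it cheaply
# by reusing the stored kNN graph.
import numpy as np
import openTSNE

x = np.random.default_rng(0).normal(size=(500, 10))
aff = openTSNE.affinity.PerplexityBasedNN(x, perplexity=50, n_jobs=1)
aff.set_perplexity(20)    # no new neighbor search; truncates the kNN graph
# aff.set_perplexity(100)  # would raise: needs more neighbors than were computed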
def set_perplexities(self, new_perplexities):
    """Change the perplexities of the affinity matrix.

    Note that we only allow lowering the perplexities or restoring them to
    their original maximum value. This restriction exists because setting a
    higher perplexity value requires recomputing all the nearest neighbors,
    which can take a long time. To avoid potential confusion as to why
    execution time is slow, this is not allowed. If you would like to
    increase the perplexity above the initial value, simply create a new
    instance.

    Parameters
    ----------
    new_perplexities: List[float]
        The new list of perplexities.

    """
    if np.array_equal(self.perplexities, new_perplexities):
        return

    effective_perplexities = self.check_perplexities(
        new_perplexities, self.n_samples
    )
    max_perplexity = np.max(effective_perplexities)
    k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

    if k_neighbors > self.__neighbors.shape[1]:
        raise RuntimeError(
            "The largest perplexity `%.2f` is larger than the initial one "
            "used. This would need to recompute the nearest neighbors, "
            "which is not efficient. Please create a new `%s` instance "
            "with the increased perplexity."
            % (max_perplexity, self.__class__.__name__)
        )

    self.perplexities = new_perplexities
    self.effective_perplexities_ = effective_perplexities

    with utils.Timer(
        "Perplexity changed. Recomputing affinity matrix...", self.verbose
    ):
        self.P = self._calculate_P(
            self.__neighbors[:, :k_neighbors],
            self.__distances[:, :k_neighbors],
            self.effective_perplexities_,
            symmetrize=self.symmetrize,
            n_jobs=self.n_jobs,
        )
def query(self, query, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors in existing embedding using HNSWlib "
        f"approximate search...",
        self.verbose,
    )
    timer.__enter__()

    # Set ef parameter for (ideal) precision/recall
    self.index.set_ef(min(2 * k, self.index.get_current_count()))

    # Query for kNN
    indices, distances = self.index.knn_query(query, k=k, num_threads=self.n_jobs)

    # Stop timer
    timer.__exit__()

    return indices, distances
def __init__(
    self,
    data,
    perplexity=30,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
    k_neighbors="auto",
):
    self.n_samples = data.shape[0]

    if k_neighbors == "auto":
        _k_neighbors = min(self.n_samples - 1, int(3 * perplexity))
    else:
        _k_neighbors = k_neighbors

    self.perplexity = self.check_perplexity(perplexity, _k_neighbors)
    self.verbose = verbose

    if _k_neighbors > int(3 * self.perplexity):
        log.warning(
            "The k_neighbors value is over 3 times larger than the "
            "perplexity value. This may result in an unnecessary slowdown."
        )

    self.knn_index, self.__neighbors, self.__distances = build_knn_index(
        data, method, _k_neighbors, metric, metric_params, n_jobs,
        random_state, verbose,
    )

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        self.P = joint_probabilities_nn(
            self.__neighbors,
            self.__distances,
            [self.perplexity],
            symmetrize=symmetrize,
            n_jobs=n_jobs,
        )

    self.n_jobs = n_jobs
def query(self, query, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors in existing embedding using exact search...",
        self.verbose,
    )
    timer.__enter__()

    # The nearest neighbor ranking for cosine distance is the same as for
    # euclidean distance on normalized data
    if self.metric == "cosine":
        effective_data = query.copy()
        effective_data = (
            effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
        )
    else:
        effective_data = query

    distances, indices = self.index.kneighbors(effective_data, n_neighbors=k)

    # If using cosine distance, the computed distances will be wrong and
    # need to be recomputed
    if self.metric == "cosine":
        if self.__data is None:
            raise RuntimeError(
                "The original data was unavailable when querying cosine "
                "distance. Did you change the distance metric after "
                "building the index? Please rebuild the index using cosine "
                "similarity."
            )
        distances = np.vstack(
            [
                cdist(np.atleast_2d(x), self.__data[idx], metric="cosine")
                for x, idx in zip(query, indices)
            ]
        )

    timer.__exit__()

    return indices, distances
def build(self, data, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors using NN descent approximate search with "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    # These values were taken from UMAP, which we assume to be sensible defaults
    n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20))
    n_iters = max(5, int(round(np.log2(data.shape[0]))))

    # Numba takes a while to load up, so there's little point in loading it
    # unless we're actually going to use it
    import pynndescent

    # UMAP uses the "alternative" algorithm, but that sometimes causes
    # memory corruption, so use the standard one, which seems to work fine
    self.index = pynndescent.NNDescent(
        data,
        n_neighbors=15,
        metric=self.metric,
        metric_kwds=self.metric_params,
        random_state=self.random_state,
        n_trees=n_trees,
        n_iters=n_iters,
        algorithm="standard",
        max_candidates=60,
        n_jobs=self.n_jobs,
    )

    indices, distances = self.index.query(data, k=k + 1)

    timer.__exit__()

    return indices[:, 1:], distances[:, 1:]
def spectral(
    A, n_components=2, tol=1e-4, max_iter=None, random_state=None, verbose=False
):
    """Initialize an embedding using the spectral embedding of the KNN graph.

    Specifically, we initialize data points by computing the diffusion map on
    the random walk transition matrix of the weighted graph given by the
    affinity matrix.

    Parameters
    ----------
    A: Union[sp.csr_matrix, sp.csc_matrix, ...]
        The graph adjacency matrix.
    n_components: int
        The dimension of the embedding space.
    tol: float
        See scipy.sparse.linalg.eigsh documentation.
    max_iter: float
        See scipy.sparse.linalg.eigsh documentation.
    random_state: Any
        Unused, but kept for consistency between initialization schemes.
    verbose: bool

    Returns
    -------
    initialization: np.ndarray

    """
    if A.ndim != 2:
        raise ValueError("The graph adjacency matrix must be a 2-dimensional matrix.")
    if A.shape[0] != A.shape[1]:
        raise ValueError("The graph adjacency matrix must be a square matrix.")

    timer = utils.Timer("Calculating spectral initialization...", verbose)
    timer.__enter__()

    D = sp.diags(np.ravel(np.sum(A, axis=1)))

    # Find the leading eigenvectors
    k = n_components + 1
    v0 = np.ones(A.shape[0]) / np.sqrt(A.shape[0])
    eigvals, eigvecs = sp.linalg.eigsh(
        A, M=D, k=k, tol=tol, maxiter=max_iter, which="LM", v0=v0
    )

    # Sort the eigenvalues in decreasing order, keeping the eigenvectors and
    # eigenvalues aligned
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]

    # In diffusion maps, we multiply the eigenvectors by their eigenvalues
    eigvecs *= eigvals

    # Drop the leading eigenvector
    embedding = eigvecs[:, 1:]

    rescale(embedding, inplace=True)

    timer.__exit__()

    return embedding
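# --- Illustrative sketch (not part of the library) ---
# Using spectral() above on an affinity matrix, mirroring the benchmark script
# at the end of this file (openTSNE public API assumed).
import numpy as np
import openTSNE

x = np.random.default_rng(0).normal(size=(300, 20))
aff = openTSNE.affinity.PerplexityBasedNN(x, perplexity=30, n_jobs=1)
init = openTSNE.initialization.spectral(aff.P, n_components=2)
assert init.shape == (300, 2)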
def __init__(
    self,
    data=None,
    perplexity=30,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
    k_neighbors="auto",
    knn_index=None,
):
    # This can't work if neither data nor the knn index are specified
    if data is None and knn_index is None:
        raise ValueError(
            "At least one of the parameters `data` or `knn_index` must be specified!"
        )
    # This can't work if both data and the knn index are specified
    if data is not None and knn_index is not None:
        raise ValueError(
            "Both `data` and `knn_index` were specified! Please pass only one."
        )

    # Find the nearest neighbors
    if knn_index is None:
        n_samples = data.shape[0]

        if k_neighbors == "auto":
            _k_neighbors = min(n_samples - 1, int(3 * perplexity))
        else:
            _k_neighbors = k_neighbors

        effective_perplexity = self.check_perplexity(perplexity, _k_neighbors)
        if _k_neighbors > int(3 * effective_perplexity):
            log.warning(
                "The k_neighbors value is over 3 times larger than the "
                "perplexity value. This may result in an unnecessary slowdown."
            )

        self.knn_index = get_knn_index(
            data, method, _k_neighbors, metric, metric_params, n_jobs,
            random_state, verbose,
        )
    else:
        self.knn_index = knn_index
        effective_perplexity = self.check_perplexity(perplexity, self.knn_index.k)
        log.info("KNN index provided. Ignoring KNN-related parameters.")

    self.__neighbors, self.__distances = self.knn_index.build()

    with utils.Timer("Calculating affinity matrix...", verbose):
        self.P = joint_probabilities_nn(
            self.__neighbors,
            self.__distances,
            [effective_perplexity],
            symmetrize=symmetrize,
            n_jobs=n_jobs,
        )

    self.perplexity = perplexity
    self.effective_perplexity_ = effective_perplexity
    self.symmetrize = symmetrize
    self.n_jobs = n_jobs
    self.verbose = verbose
def build(self, data, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors using NN descent approximate search with "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    # These values were taken from UMAP, which we assume to be sensible defaults
    n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20))
    n_iters = max(5, int(round(np.log2(data.shape[0]))))

    # Numba takes a while to load up, so there's little point in loading it
    # unless we're actually going to use it
    import pynndescent

    # We will only use query() when k > 15
    if k <= 15:
        n_neighbors_build = k + 1
    else:
        n_neighbors_build = 15

    self.index = pynndescent.NNDescent(
        data,
        n_neighbors=n_neighbors_build,
        metric=self.metric,
        metric_kwds=self.metric_params,
        random_state=self.random_state,
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        n_jobs=self.n_jobs,
        verbose=self.verbose > 1,
    )

    # -1 in indices means that pynndescent failed
    indices, distances = self.index.neighbor_graph
    mask = np.sum(indices == -1, axis=1) > 0

    if k > 15:
        indices, distances = self.index.query(data, k=k + 1)

    # As a workaround, we let the failed points group together
    if np.sum(mask) > 0:
        if self.verbose:
            opt = np.get_printoptions()
            np.set_printoptions(threshold=np.inf)
            warnings.warn(
                f"`pynndescent` failed to find neighbors for some of the points. "
                f"As a workaround, openTSNE considers all such points similar to "
                f"each other, so they will likely form a cluster in the embedding. "
                f"The indices of the failed points are:\n{np.where(mask)[0]}"
            )
            np.set_printoptions(**opt)
        else:
            warnings.warn(
                f"`pynndescent` failed to find neighbors for some of the points. "
                f"As a workaround, openTSNE considers all such points similar to "
                f"each other, so they will likely form a cluster in the embedding. "
                f"Run with verbose=True to see the indices of the failed points."
            )

        distances[mask] = 1
        rs = check_random_state(self.random_state)
        fake_indices = rs.choice(
            np.sum(mask), size=np.sum(mask) * indices.shape[1], replace=True
        )
        fake_indices = np.where(mask)[0][fake_indices]
        indices[mask] = np.reshape(fake_indices, (np.sum(mask), indices.shape[1]))

    timer.__exit__()

    return indices[:, 1:], distances[:, 1:]
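# --- Illustrative sketch (not part of the library) ---
# The pynndescent failure workaround from build() above, isolated on toy
# arrays: rows containing -1 get distance 1 and are rewired to point at other
# failed rows, so the failed points end up clustering together.
import numpy as np

rng = np.random.default_rng(0)
indices = np.array([[1, 2], [-1, -1], [0, 3], [-1, 2]])
distances = rng.random(indices.shape)

mask = np.sum(indices == -1, axis=1) > 0
distances[mask] = 1
fake = rng.choice(np.sum(mask), size=np.sum(mask) * indices.shape[1], replace=True)
indices[mask] = np.where(mask)[0][fake].reshape(np.sum(mask), indices.shape[1])
print(indices)  # the failed rows now reference only indices 1 and 3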
def __init__(
    self,
    data=None,
    sigma=None,
    k=30,
    method="auto",
    metric="euclidean",
    metric_params=None,
    symmetrize=True,
    n_jobs=1,
    random_state=None,
    verbose=False,
    knn_index=None,
):
    # Sigma must be specified, but has its default set to None, so that the
    # parameter order makes more sense
    if sigma is None:
        raise ValueError("`sigma` must be specified!")

    # This can't work if neither data nor the knn index are specified
    if data is None and knn_index is None:
        raise ValueError(
            "At least one of the parameters `data` or `knn_index` must be specified!"
        )
    # This can't work if both data and the knn index are specified
    if data is not None and knn_index is not None:
        raise ValueError(
            "Both `data` and `knn_index` were specified! Please pass only one."
        )

    # Find the nearest neighbors
    if knn_index is None:
        if k >= data.shape[0]:
            raise ValueError(
                "`k` (%d) cannot be larger than N-1 (%d)." % (k, data.shape[0])
            )

        self.knn_index = get_knn_index(
            data, method, k, metric, metric_params, n_jobs, random_state, verbose
        )
    else:
        self.knn_index = knn_index
        log.info("KNN index provided. Ignoring KNN-related parameters.")

    neighbors, distances = self.knn_index.build()

    with utils.Timer("Calculating affinity matrix...", verbose):
        # Compute asymmetric pairwise input similarities
        conditional_P = np.exp(-(distances ** 2) / (2 * sigma ** 2))
        conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

        n_samples = self.knn_index.n_samples
        P = sp.csr_matrix(
            (
                conditional_P.ravel(),
                neighbors.ravel(),
                range(0, n_samples * k + 1, k),
            ),
            shape=(n_samples, n_samples),
        )

        # Symmetrize the probability matrix
        if symmetrize:
            P = (P + P.T) / 2

        # Convert weights to probabilities
        P /= np.sum(P)

    self.sigma = sigma
    self.P = P
    self.n_jobs = n_jobs
    self.verbose = verbose
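# --- Illustrative sketch (not part of the library) ---
# Hedged usage of the fixed-bandwidth affinity class above; the constructor
# signature matches openTSNE's `affinity.FixedSigmaNN` (class name assumed).
import numpy as np
import openTSNE

x = np.random.default_rng(0).normal(size=(400, 10))
aff = openTSNE.affinity.FixedSigmaNN(x, sigma=1.0, k=30, n_jobs=1)
print(aff.P.shape)  # (400, 400); sparse, row- and then globally-normalized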
def to_new(self, data, perplexities=None, return_distances=False):
    """Compute the affinities of new samples to the initial samples.

    This is necessary for embedding new data points into an existing
    embedding.

    Please see the :ref:`parameter-guide` for more information.

    Parameters
    ----------
    data: np.ndarray
        The data points to be added to the existing embedding.
    perplexities: List[float]
        A list of perplexity values, which will be used in the multiscale
        Gaussian kernel. Perplexity can be thought of as the continuous
        :math:`k` number of nearest neighbors, for which t-SNE will attempt
        to preserve distances.
    return_distances: bool
        If needed, the function can return the indices of the nearest
        neighbors and their corresponding distances.

    Returns
    -------
    P: array_like
        An :math:`N \\times M` affinity matrix expressing interactions
        between :math:`N` new data points and the initial :math:`M` data
        samples.
    indices: np.ndarray
        Returned if ``return_distances=True``. The indices of the :math:`k`
        nearest neighbors in the existing embedding for every new data
        point.
    distances: np.ndarray
        Returned if ``return_distances=True``. The distances to the
        :math:`k` nearest neighbors in the existing embedding for every new
        data point.

    """
    perplexities = perplexities if perplexities is not None else self.perplexities
    effective_perplexities = self.check_perplexities(perplexities, self.n_samples)

    max_perplexity = np.max(effective_perplexities)
    k_neighbors = min(self.n_samples - 1, int(3 * max_perplexity))

    neighbors, distances = self.knn_index.query(data, k_neighbors)

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        P = self._calculate_P(
            neighbors,
            distances,
            effective_perplexities,
            symmetrize=False,
            normalization="point-wise",
            n_reference_samples=self.n_samples,
            n_jobs=self.n_jobs,
        )

    if return_distances:
        return P, neighbors, distances

    return P
def to_new(self, data, k=None, sigma=None, return_distances=False):
    """Compute the affinities of new samples to the initial samples.

    This is necessary for embedding new data points into an existing
    embedding.

    Parameters
    ----------
    data: np.ndarray
        The data points to be added to the existing embedding.
    k: int
        The number of nearest neighbors to consider for each kernel.
    sigma: float
        The bandwidth to use for the Gaussian kernels in the ambient space.
    return_distances: bool
        If needed, the function can return the indices of the nearest
        neighbors and their corresponding distances.

    Returns
    -------
    P: array_like
        An :math:`N \\times M` affinity matrix expressing interactions
        between :math:`N` new data points and the initial :math:`M` data
        samples.
    indices: np.ndarray
        Returned if ``return_distances=True``. The indices of the :math:`k`
        nearest neighbors in the existing embedding for every new data
        point.
    distances: np.ndarray
        Returned if ``return_distances=True``. The distances to the
        :math:`k` nearest neighbors in the existing embedding for every new
        data point.

    """
    n_samples = data.shape[0]
    n_reference_samples = self.n_samples

    if k is None:
        k = self.knn_index.k
    elif k >= n_reference_samples:
        raise ValueError(
            "`k` (%d) cannot be larger than the number of reference "
            "samples (%d)." % (k, self.n_samples)
        )
    if sigma is None:
        sigma = self.sigma

    # Find nearest neighbors and the distances to the new points
    neighbors, distances = self.knn_index.query(data, k)

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        # Compute asymmetric pairwise input similarities
        conditional_P = np.exp(-(distances ** 2) / (2 * sigma ** 2))

        # Convert weights to probabilities
        conditional_P /= np.sum(conditional_P, axis=1)[:, np.newaxis]

        P = sp.csr_matrix(
            (
                conditional_P.ravel(),
                neighbors.ravel(),
                range(0, n_samples * k + 1, k),
            ),
            shape=(n_samples, n_reference_samples),
        )

    if return_distances:
        return P, neighbors, distances

    return P
import gzip
import pickle
from os import path

import openTSNE
from openTSNE import utils

with utils.Timer("Loading data...", verbose=True):
    with gzip.open(path.join("data", "macosko_2015.pkl.gz"), "rb") as f:
        data = pickle.load(f)

x = data["pca_50"]
y, cluster_ids = data["CellType1"], data["CellType2"]

# import sys; sys.path.append("FIt-SNE")
# from fast_tsne import fast_tsne
#
# with Timer("Running fast_tsne..."):
#     fast_tsne(x, nthreads=1)

affinities = openTSNE.affinity.PerplexityBasedNN(
    x,
    perplexity=30,
    metric="cosine",
    method="approx",
    n_jobs=-1,
    random_state=0,
    verbose=True,
)

init = openTSNE.initialization.spectral(affinities.P, verbose=True)
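# A possible continuation of the benchmark above (openTSNE's advanced API is
# assumed here): turn the affinities and the spectral initialization into an
# embedding and run the usual two optimization phases.
embedding = openTSNE.TSNEEmbedding(
    init, affinities, negative_gradient_method="fft", n_jobs=-1, verbose=True
)
embedding = embedding.optimize(n_iter=250, exaggeration=12, momentum=0.5)
embedding = embedding.optimize(n_iter=500, momentum=0.8)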
def to_new(self, data, perplexity=None, return_distances=False, k_neighbors="auto"):
    """Compute the affinities of new samples to the initial samples.

    This is necessary for embedding new data points into an existing
    embedding.

    Please see the :ref:`parameter-guide` for more information.

    Parameters
    ----------
    data: np.ndarray
        The data points to be added to the existing embedding.
    perplexity: float
        Perplexity can be thought of as the continuous :math:`k` number of
        nearest neighbors, for which t-SNE will attempt to preserve
        distances.
    return_distances: bool
        If needed, the function can return the indices of the nearest
        neighbors and their corresponding distances.
    k_neighbors: int or ``auto``
        The number of neighbors to query the kNN graph for. If ``auto``
        (default), it is set to three times the perplexity.

    Returns
    -------
    P: array_like
        An :math:`N \\times M` affinity matrix expressing interactions
        between :math:`N` new data points and the initial :math:`M` data
        samples.
    indices: np.ndarray
        Returned if ``return_distances=True``. The indices of the :math:`k`
        nearest neighbors in the existing embedding for every new data
        point.
    distances: np.ndarray
        Returned if ``return_distances=True``. The distances to the
        :math:`k` nearest neighbors in the existing embedding for every new
        data point.

    """
    perplexity = perplexity if perplexity is not None else self.perplexity

    if k_neighbors == "auto":
        _k_neighbors = min(self.n_samples, int(3 * perplexity))
    else:
        _k_neighbors = k_neighbors

    effective_perplexity = self.check_perplexity(perplexity, _k_neighbors)

    neighbors, distances = self.knn_index.query(data, _k_neighbors)

    with utils.Timer("Calculating affinity matrix...", self.verbose):
        P = joint_probabilities_nn(
            neighbors,
            distances,
            [effective_perplexity],
            symmetrize=False,
            normalization="point-wise",
            n_reference_samples=self.n_samples,
            n_jobs=self.n_jobs,
        )

    if return_distances:
        return P, neighbors, distances

    return P
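# --- Illustrative sketch (not part of the library) ---
# Hypothetical use of to_new() above to attach new samples to an existing
# affinity model (openTSNE public API assumed).
import numpy as np
import openTSNE

rng = np.random.default_rng(0)
x_train = rng.normal(size=(500, 10))
x_new = rng.normal(size=(50, 10))

aff = openTSNE.affinity.PerplexityBasedNN(x_train, perplexity=30, n_jobs=1)
P_new = aff.to_new(x_new)  # sparse (50, 500) affinities to the training set
P_new, idx, dist = aff.to_new(x_new, return_distances=True)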