def test_gradient_bh_multithread_match_sequential():
    # check that the bh gradient with different num_threads gives the same
    # results

    n_features = 10
    n_samples = 30
    n_components = 2
    degrees_of_freedom = 1

    angle = 3
    perplexity = 5

    random_state = check_random_state(0)
    data = random_state.randn(n_samples, n_features).astype(np.float32)
    params = random_state.randn(n_samples, n_components)

    n_neighbors = n_samples - 1
    distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
        n_neighbors=n_neighbors, mode='distance')
    P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
    kl_sequential, grad_sequential = _kl_divergence_bh(
        params, P_bh, degrees_of_freedom, n_samples, n_components,
        angle=angle, skip_num_points=0, verbose=0, num_threads=1)

    for num_threads in [2, 4]:
        kl_multithread, grad_multithread = _kl_divergence_bh(
            params, P_bh, degrees_of_freedom, n_samples, n_components,
            angle=angle, skip_num_points=0, verbose=0,
            num_threads=num_threads)

        assert_allclose(kl_multithread, kl_sequential, rtol=1e-6)
        assert_allclose(grad_multithread, grad_sequential)

def test_binary_perplexity_stability():
    # Binary perplexity search should be stable.
    # The binary_search_perplexity had a bug wherein the P array
    # was uninitialized, leading to sporadically failing tests.
    n_neighbors = 10
    n_samples = 100
    random_state = check_random_state(0)
    data = random_state.randn(n_samples, 5)
    nn = NearestNeighbors().fit(data)
    distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
                                         mode='distance')
    distances = distance_graph.data.astype(np.float32, copy=False)
    distances = distances.reshape(n_samples, n_neighbors)
    last_P = None
    desired_perplexity = 3
    for _ in range(100):
        P = _binary_search_perplexity(distances.copy(), desired_perplexity,
                                      verbose=0)
        P1 = _joint_probabilities_nn(distance_graph, desired_perplexity,
                                     verbose=0)
        # Convert the sparse matrix to a dense one for testing
        P1 = P1.toarray()

        if last_P is None:
            last_P = P
            last_P1 = P1
        else:
            assert_array_almost_equal(P, last_P, decimal=4)
            assert_array_almost_equal(P1, last_P1, decimal=4)

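# Illustrative aside (not part of the original test suite): the perplexity
# that _binary_search_perplexity targets is 2 ** H(P_i), the exponentiated
# Shannon entropy of each row of conditional probabilities. The helper below
# is a hypothetical sketch of how the achieved per-row perplexity of a
# returned P matrix could be inspected; it assumes each row of P sums to one.
def _row_perplexities_sketch(P_rows):
    # P_rows: (n_samples, n_neighbors) conditional probabilities per row.
    P_safe = np.maximum(P_rows, 1e-12)  # guard against log2(0)
    entropy = -np.sum(P_rows * np.log2(P_safe), axis=1)
    return 2.0 ** entropy  # each entry should be close to desired_perplexity
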
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        data = random_state.randn(n_samples, n_features)
        distances = pairwise_distances(data)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
                                              n_samples, n_components)

        n_neighbors = n_samples - 1
        distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
            n_neighbors=n_neighbors, mode='distance')
        P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
                                           n_samples, n_components,
                                           angle=angle, skip_num_points=0,
                                           verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)

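# Illustrative sketch (not part of the original tests): the analytic gradient
# returned by _kl_divergence above can also be sanity-checked against central
# finite differences of the exact KL divergence. The helper name, sizes and
# tolerances below are hypothetical.
def _finite_difference_gradient_sketch(eps=1e-5):
    random_state = check_random_state(0)
    n_samples, n_features, n_components = 20, 5, 2
    perplexity = 5
    degrees_of_freedom = float(n_components - 1.0)

    data = random_state.randn(n_samples, n_features)
    P = _joint_probabilities(pairwise_distances(data), perplexity, verbose=0)
    params = random_state.randn(n_samples * n_components)

    def kl(flat_params):
        return _kl_divergence(flat_params, P, degrees_of_freedom,
                              n_samples, n_components)[0]

    _, grad = _kl_divergence(params, P, degrees_of_freedom,
                             n_samples, n_components)
    # Compare a few analytic gradient entries with central differences.
    for i in range(5):
        shift = np.zeros_like(params)
        shift[i] = eps
        approx = (kl(params + shift) - kl(params - shift)) / (2 * eps)
        assert_almost_equal(grad[i], approx, decimal=5)
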
def _fit(self, X, skip_num_points=0):
    """Private function to fit the model using X as training data."""

    if self.method not in ['barnes_hut', 'exact']:
        raise ValueError("'method' must be 'barnes_hut' or 'exact'")
    if self.angle < 0.0 or self.angle > 1.0:
        raise ValueError("'angle' must be between 0.0 - 1.0")
    if hasattr(self, 'square_distances'):
        if self.square_distances not in [True, 'legacy']:
            raise ValueError(
                "'square_distances' must be True or 'legacy'.")
        if self.metric != "euclidean" and self.square_distances is not True:
            warnings.warn(
                ("'square_distances' has been introduced in 0.24 "
                 "to help phase out legacy squaring behavior. The "
                 "'legacy' setting will be removed in 0.26, and the "
                 "default setting will be changed to True. In 0.28, "
                 "'square_distances' will be removed altogether, "
                 "and distances will be squared by default. Set "
                 "'square_distances'=True to silence this warning."),
                FutureWarning)
    if self.method == 'barnes_hut':
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr'],
                                    ensure_min_samples=2,
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr'],
                            ensure_min_samples=2,
                            dtype=[np.float32, np.float64])
    else:
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=[np.float32, np.float64])
    if self.metric == "precomputed":
        if isinstance(self.init, str) and self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be "
                             "used with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")

        check_non_negative(
            X, "TSNE.fit(). With metric='precomputed', X "
               "should contain positive distances.")

        if self.method == "exact" and issparse(X):
            raise TypeError(
                'TSNE with method="exact" does not accept sparse '
                'precomputed distance matrix. Use method="barnes_hut" '
                'or provide the dense distance matrix.')

    if self.method == 'barnes_hut' and self.n_components > 3:
        raise ValueError("'n_components' should be inferior to 4 for the "
                         "barnes_hut algorithm as it relies on "
                         "quad-tree or oct-tree.")
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError(
            "early_exaggeration must be at least 1, but is {}".format(
                self.early_exaggeration))

    if self.n_iter < 250:
        raise ValueError("n_iter should be at least 250")

    n_samples = X.shape[0]

    neighbors_nn = None
    if self.method == "exact":
        # Retrieve the distance matrix, either using the precomputed one or
        # computing it.
        if self.metric == "precomputed":
            distances = X
        else:
            if self.verbose:
                print("[t-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                # Euclidean is squared here, rather than using **= 2,
                # because euclidean_distances already calculates
                # squared distances, and returns np.sqrt(dist) for
                # squared=False.
                # Also, Euclidean is slower for n_jobs>1, so don't set here
                distances = pairwise_distances(X, metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X, metric=self.metric,
                                               n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

        if self.metric != "euclidean" and \
                getattr(self, 'square_distances', True) is True:
            distances **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be non-negative"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "than or equal to one")
    else:
        # Compute the number of nearest neighbors to find.
        # LvdM uses 3 * perplexity as the number of neighbors.
        # In the event that we have very small # of points,
        # set the neighbors to n - 1.
        n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

        if self.verbose:
            print("[t-SNE] Computing {} nearest neighbors...".format(
                n_neighbors))

        # Find the nearest neighbors for every point
        knn = NearestNeighbors(algorithm='auto',
                               n_jobs=self.n_jobs,
                               n_neighbors=n_neighbors,
                               metric=self.metric)
        t0 = time()
        knn.fit(X)
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                n_samples, duration))

        t0 = time()
        distances_nn = knn.kneighbors_graph(mode='distance')
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Computed neighbors for {} samples "
                  "in {:.3f}s...".format(n_samples, duration))

        # Free the memory used by the ball_tree
        del knn

        if getattr(self, 'square_distances', True) is True or \
                self.metric == "euclidean":
            # knn returns the euclidean distance but we need it squared
            # to be consistent with the 'exact' method. Note that the
            # method was derived using the euclidean method as in the
            # input space. Not sure of the implication of using a different
            # metric.
            distances_nn.data **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                    self.verbose)

    if isinstance(self.init, np.ndarray):
        X_embedded = self.init
    elif self.init == 'pca':
        pca = PCA(n_components=self.n_components, svd_solver='randomized',
                  random_state=random_state)
        X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
    elif self.init == 'random':
        # The embedding is initialized with iid samples from Gaussians with
        # standard deviation 1e-4.
        X_embedded = 1e-4 * random_state.randn(
            n_samples, self.n_components).astype(np.float32)
    else:
        raise ValueError("'init' must be 'pca', 'random', or "
                         "a numpy array")

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # degrees_of_freedom = n_components - 1 comes from
    # "Learning a Parametric Embedding by Preserving Local Structure"
    # Laurens van der Maaten, 2009.
    degrees_of_freedom = max(self.n_components - 1, 1)

    return self._tsne(P, degrees_of_freedom, n_samples,
                      X_embedded=X_embedded,
                      neighbors=neighbors_nn,
                      skip_num_points=skip_num_points)
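

# Illustrative usage sketch (not part of the original module): how _fit() is
# typically reached through the public estimator API. This assumes the TSNE
# class defined in this module exposes the standard fit/fit_transform
# interface; the data and parameter values below are arbitrary.
def _tsne_usage_sketch():
    rng = np.random.RandomState(0)
    X = rng.randn(200, 20).astype(np.float32)

    tsne = TSNE(n_components=2, perplexity=30.0, init='pca',
                method='barnes_hut', random_state=0)
    # fit_transform() validates the parameters above and delegates to _fit(),
    # which builds P and the initial embedding before calling _tsne().
    X_embedded = tsne.fit_transform(X)
    assert X_embedded.shape == (200, 2)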