def test_can_pass_callbacks_to_tsne_object(self):
    callback = MagicMock()
    callback2 = MagicMock()
    # We don't want individual callbacks to be iterable
    del callback.__iter__
    del callback2.__iter__

    # Should be able to pass a single callback
    TSNE(callbacks=callback, callbacks_every_iters=1,
         early_exaggeration_iter=0, n_iter=1).fit(self.x)
    self.assertEqual(callback.call_count, 1)

    # Should be able to pass a list of callbacks
    callback.reset_mock()
    TSNE(callbacks=[callback], callbacks_every_iters=1,
         early_exaggeration_iter=0, n_iter=1).fit(self.x)
    self.assertEqual(callback.call_count, 1)

    # Should be able to change the callbacks on the object
    callback.reset_mock()
    tsne = TSNE(callbacks=callback, callbacks_every_iters=1,
                early_exaggeration_iter=0, n_iter=1)
    tsne.callbacks = callback2
    tsne.fit(self.x)
    callback.assert_not_called()
    self.assertEqual(callback2.call_count, 1)

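# A minimal sketch of a custom callback, assuming the protocol exercised by the
# test above: any callable (or list of callables) passed via `callbacks` is
# invoked every `callbacks_every_iters` iterations with the iteration number,
# the current error, and the current embedding. Check the installed version's
# documentation before relying on this exact signature.
class ErrorRecorder:
    def __init__(self):
        self.history = []

    def __call__(self, iteration, error, embedding):
        # Store (iteration, error) pairs so convergence can be inspected later
        self.history.append((iteration, error))

# Hypothetical usage: TSNE(callbacks=ErrorRecorder(), callbacks_every_iters=10)
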
def test_raises_error_on_unrecognized_metric(self):
    """Unknown distance metric should raise error"""
    tsne = TSNE(metric='imaginary', neighbors='exact')
    with self.assertRaises(ValueError):
        tsne.prepare_initial(self.x)

    tsne = TSNE(metric='imaginary', neighbors='approx')
    with self.assertRaises(ValueError):
        tsne.prepare_initial(self.x)

def test_unfitted_pca_model(self):
    """Using PCA initialization in `transform` should fail when the initial
    embedding was not initialized with PCA."""
    tsne = TSNE(initialization='random')
    embedding = tsne.fit(self.x)

    # Transforming with `pca` init must fail on an embedding that did not use
    # `pca` init, since no PCA model was fitted
    with self.assertRaises(AssertionError):
        embedding.transform(self.x_test, initialization='pca')

def test_low_variance(self):
    """Low variance of the initial embedding is very important for the
    convergence of tSNE."""
    # Cycle through various initializations
    initializations = ['random', 'pca']
    allowed = 1e-3

    for init in initializations:
        tsne = TSNE(initialization=init, perplexity=2)
        embedding = tsne.prepare_initial(self.x)
        np.testing.assert_array_less(
            np.var(embedding, axis=0), allowed,
            'using the `%s` initialization' % init)

def check_error_approx():
    x, y = get_mouse_60k(1500)

    tsne = TSNE(
        perplexity=20, learning_rate=100, early_exaggeration=12, n_jobs=4,
        theta=0.5, initialization='pca', metric='euclidean', n_components=2,
        n_iter=750, early_exaggeration_iter=250, neighbors='exact',
        negative_gradient_method='bh', min_num_intervals=10,
        ints_in_interval=2, late_exaggeration_iter=0, late_exaggeration=4,
        callbacks=ErrorLogger(),
    )
    embedding = tsne.prepare_initial(x, initialization='random')

    errors = ErrorApproximations(embedding.affinities.P)
    logger = ErrorLogger()
    embedding.optimize(
        250, exaggeration=12, callbacks=[errors, logger],
        callbacks_every_iters=5, inplace=True,
    )
    embedding.optimize(
        750, exaggeration=None, callbacks=[errors, logger],
        callbacks_every_iters=5, inplace=True,
    )

    errors.report()
    plot(embedding, y)

    x = list(range(len(errors.exact_errors)))
    plt.semilogy(x, errors.exact_errors, label='Exact')
    plt.semilogy(x, errors.bh_errors, label='BH')
    plt.semilogy(x, errors.fft_errors, label='FFT')
    plt.legend()
    plt.show()

def transform(n_jobs=4, grad='bh', neighbors='approx'):
    # iris = datasets.load_iris()
    # x, y = iris['data'], iris['target']
    x, y = get_mnist(20000)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)

    tsne = TSNE(
        n_components=2, perplexity=30, learning_rate=100,
        early_exaggeration=12, n_jobs=n_jobs, theta=0.5,
        initialization='random', metric='euclidean', n_iter=750,
        early_exaggeration_iter=250, neighbors=neighbors,
        negative_gradient_method=grad, min_num_intervals=10,
        ints_in_interval=2, late_exaggeration_iter=0, late_exaggeration=4,
        callbacks=[ErrorLogger()],
    )

    start = time.time()
    embedding = tsne.fit(x_train)
    print('tsne train', time.time() - start)

    plt.subplot(121)
    plot(embedding, y_train, show=False, ms=3)

    start = time.time()
    partial_embedding = embedding.transform(x_test, perplexity=20)
    # partial_embedding = embedding.get_partial_embedding_for(
    #     x_test, perplexity=10, initialization='random')
    # partial_embedding.optimize(200, exaggeration=2, inplace=True, momentum=0.1)
    print('tsne transform', time.time() - start)

    plt.subplot(122)
    plot(embedding, y_train, show=False, ms=3, alpha=0.25)
    # `set_color_cycle` was removed in matplotlib 3.0; reset the color cycle
    # with `set_prop_cycle` so the train and test points share the same colors
    plt.gca().set_prop_cycle(None)
    plot(partial_embedding, y_test, show=False, ms=3, alpha=0.8)
    plt.show()

def test_embedding_optimize(self, param_name, param_value, gradient_descent):
    # type: (str, Any, MagicMock) -> None
    # Make sure mock still conforms to signature
    gradient_descent.return_value = (1, MagicMock())

    # `optimize` requires us to specify the `n_iter`
    params = {'n_iter': 50, param_name: param_value}

    tsne = TSNE()
    embedding = tsne.prepare_initial(self.x)
    embedding.optimize(**params, inplace=True)

    self.assertEqual(1, gradient_descent.call_count)
    check_call_contains_kwargs(gradient_descent.mock_calls[0], params)

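# A rough sketch of what a helper like `check_call_contains_kwargs` might do,
# assuming it only needs to verify that every expected keyword argument was
# passed, with the expected value, in a recorded mock call. The helper that
# actually ships with the test suite may be implemented differently.
def check_call_contains_kwargs_sketch(mock_call, expected_kwargs):
    # Entries of `mock_calls` unpack into (name, args, kwargs) triples
    _, _, kwargs = mock_call
    for name, value in expected_kwargs.items():
        assert name in kwargs, '`%s` was not passed to the call' % name
        assert kwargs[name] == value, '`%s` had an unexpected value' % name
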
def test_embedding_transform(self, param_name, param_value, gradient_descent):
    # type: (str, Any, MagicMock) -> None
    # Make sure mock still conforms to signature
    gradient_descent.return_value = (1, MagicMock())

    # Perform initial embedding - this is tested above
    tsne = TSNE()
    embedding = tsne.fit(self.x)
    gradient_descent.reset_mock()

    embedding.transform(self.x_test, **{param_name: param_value})

    # Early exaggeration training loop
    if param_name in ('early_exaggeration_iter', 'early_exaggeration'):
        call_idx = 0
    # Main training loop
    elif param_name in ('n_iter', 'final_momentum'):
        call_idx = 1
    # If general parameter, should be applied to every call
    else:
        call_idx = 0

    self.assertEqual(2, gradient_descent.call_count)
    check_call_contains_kwargs(
        gradient_descent.mock_calls[call_idx],
        {param_name: param_value},
    )

def setUpClass(cls):
    cls.tsne = TSNE(early_exaggeration_iter=20, n_iter=100)
    # Set up two modalities, if we want to visually inspect test results
    cls.x = np.vstack((
        np.random.normal(+1, 1, (100, 4)),
        np.random.normal(-1, 1, (100, 4)),
    ))
    cls.x_test = np.random.normal(0, 1, (25, 4))

def run_graph():
    graph = nx.read_edgelist(join(DATA_DIR, 'dolphins.edges'))
    affinities = NxGraphAffinities(graph)

    tsne = TSNE()
    y_coords = tsne.generate_initial_coordinates(
        affinities.P, initialization='random')

    embedding = TSNEEmbedding(
        y_coords, affinities,
        {'callbacks': None, 'negative_gradient_method': 'bh', 'dof': 1,
         'momentum': 0, 'learning_rate': 100},
    )
    # Optimize in place so the plotted coordinates reflect the optimization
    embedding.optimize(1000, inplace=True)

    plt.plot(embedding[:, 0], embedding[:, 1], 'o')
    plt.show()

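# A hedged sketch of a graph-based affinity class such as `NxGraphAffinities`
# (a hypothetical stand-in; the real class used by `run_graph` may differ).
# The only thing the snippet above relies on is an affinity matrix `P` whose
# entries form a probability distribution, built here from the symmetrized,
# globally normalized adjacency matrix of the graph.
import networkx as nx
import scipy.sparse as sp


class NxGraphAffinitiesSketch:
    def __init__(self, graph):
        adjacency = nx.adjacency_matrix(graph).astype(float)
        # Symmetrize, then normalize so all affinities sum to one, mirroring
        # how perplexity-based affinities are normalized in t-SNE
        adjacency = (adjacency + adjacency.T) / 2
        self.P = sp.csr_matrix(adjacency / adjacency.sum())
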
def test_same_partial_embedding_on_fixed_random_state(self):
    tsne = TSNE(random_state=1, initialization='random')
    embedding = tsne.fit(self.x)

    partial1 = embedding.prepare_partial(self.x_test, initialization='random')
    partial2 = embedding.prepare_partial(self.x_test, initialization='random')

    np.testing.assert_array_equal(
        partial1, partial2,
        'Same random state produced different partial embeddings')

def test_mismatching_embedding_dimensions_simple_api(self):
    # Fit
    tsne = TSNE(n_components=2, initialization=self.x[:10, :2])
    with self.assertRaises(ValueError, msg='fit::incorrect number of points'):
        tsne.fit(self.x[:25])

    with self.assertRaises(ValueError, msg='fit::incorrect number of dimensions'):
        TSNE(n_components=2, initialization=self.x[:10, :4])

    # Transform
    tsne = TSNE(n_components=2, initialization='random')
    embedding = tsne.fit(self.x)

    with self.assertRaises(ValueError, msg='transform::incorrect number of points'):
        embedding.transform(X=self.x[:5], initialization=self.x[:10, :2])

    with self.assertRaises(
            ValueError, msg='transform::incorrect number of dimensions'):
        embedding.transform(X=self.x, initialization=self.x[:, :4])

def test_iris(self):
    iris = datasets.load_iris()
    x, y = iris['data'], iris['target']

    # Evaluate tSNE optimization using a KNN classifier
    knn = KNeighborsClassifier(n_neighbors=10)

    tsne = TSNE(perplexity=30, initialization='random', random_state=0)
    # Prepare a random initialization
    embedding = tsne.prepare_initial(x)

    # KNN should do poorly on a random initialization
    knn.fit(embedding, y)
    predictions = knn.predict(embedding)
    self.assertTrue(accuracy_score(predictions, y) < .5)

    # Optimize the embedding for a small number of steps so tests run fast
    embedding.optimize(50, inplace=True)

    # Similar points should be grouped together, therefore KNN should do well
    knn.fit(embedding, y)
    predictions = knn.predict(embedding)
    self.assertTrue(accuracy_score(predictions, y) > .95)

def test_same_results_on_fixed_random_state_pca_init(self):
    """Results should be exactly the same if we provide a random state."""
    tsne1 = TSNE(random_state=1, initialization='pca')
    embedding1 = tsne1.fit(self.x)

    tsne2 = TSNE(random_state=1, initialization='pca')
    embedding2 = tsne2.fit(self.x)

    np.testing.assert_array_equal(
        embedding1, embedding2,
        'Same random state produced different embeddings')

def test_nndescent_distances(self, param_name, metric, nndescent: MagicMock):
    """Distance metrics should be properly passed down to NN descent"""
    assert param_name == 'metric'
    tsne = TSNE(metric=metric, neighbors='approx')

    # We don't care about what happens later, just that the NN method is
    # properly called
    nndescent.side_effect = InterruptedError()
    try:
        # Haversine distance only supports two dimensions
        tsne.prepare_initial(self.x[:, :2])
    except InterruptedError:
        pass

    self.assertEqual(nndescent.call_count, 1)
    check_call_contains_kwargs(nndescent.mock_calls[0], {'metric': metric})

def test_nndescent_mahalanobis_distance(self, nndescent: MagicMock):
    """Distance metrics and additional params should be correctly passed
    down to NN descent"""
    metric = 'mahalanobis'
    # The Mahalanobis `V` parameter expects the feature covariance matrix,
    # hence `rowvar=False`
    C = np.cov(self.x, rowvar=False)
    tsne = TSNE(metric=metric, metric_params={'V': C}, neighbors='approx')

    # We don't care about what happens later, just that the NN method is
    # properly called
    nndescent.side_effect = InterruptedError()
    try:
        tsne.prepare_initial(self.x)
    except InterruptedError:
        pass

    self.assertEqual(nndescent.call_count, 1)
    check_call_contains_kwargs(nndescent.mock_calls[0], {'metric': metric})

def test_constructor(self, param_name, param_value, gradient_descent):
    # type: (str, Any, MagicMock) -> None
    # Make sure mock still conforms to signature
    gradient_descent.return_value = (1, MagicMock())

    # Early exaggeration training loop
    if param_name == 'early_exaggeration_iter':
        check_param_name = 'n_iter'
        call_idx = 0
    elif param_name == 'early_exaggeration':
        check_param_name = 'exaggeration'
        call_idx = 0
    elif param_name == 'initial_momentum':
        check_param_name = 'momentum'
        call_idx = 0
    # Main training loop
    elif param_name == 'n_iter':
        check_param_name = param_name
        call_idx = 1
    elif param_name == 'final_momentum':
        check_param_name = 'momentum'
        call_idx = 1
    # Late exaggeration training loop
    elif param_name == 'late_exaggeration_iter':
        check_param_name = 'n_iter'
        call_idx = 2
    elif param_name == 'late_exaggeration':
        check_param_name = 'exaggeration'
        call_idx = 2
    # If general parameter, should be applied to every call
    else:
        check_param_name = param_name
        call_idx = 0

    TSNE(**{param_name: param_value}).fit(self.x)

    self.assertEqual(3, gradient_descent.call_count)
    check_call_contains_kwargs(gradient_descent.mock_calls[call_idx],
                               {check_param_name: param_value})

def run(perplexity=30, learning_rate=100, n_jobs=4):
    x, y = get_mouse_60k()
    # x, y = get_fashion_mnist()

    angle = 0.5
    ee = 12
    metric = 'euclidean'

    print(x.shape)

    start = time.time()
    tsne = TSNE(
        perplexity=perplexity, learning_rate=learning_rate,
        early_exaggeration=ee, n_jobs=n_jobs, theta=angle,
        initialization='random', metric=metric, n_components=2, n_iter=750,
        early_exaggeration_iter=250, neighbors='approx',
        negative_gradient_method='fft', min_num_intervals=10,
        ints_in_interval=1, late_exaggeration_iter=0, late_exaggeration=2.,
        callbacks=ErrorLogger(),
    )
    # x = PCA(n_components=50).fit_transform(x)
    embedding = tsne.fit(x)
    print('-' * 80)
    print('tsne', time.time() - start)
    plt.title('tsne')
    plot(embedding, y)

    # The early returns below short-circuit the remaining benchmarks; remove
    # them to also compare against FIt-SNE, MulticoreTSNE and scikit-learn
    return

    x = np.ascontiguousarray(x.astype(np.float64))
    from fitsne import FItSNE

    start = time.time()
    embedding = FItSNE(
        x, 2, perplexity=perplexity, stop_lying_iter=250,
        ann_not_vptree=True, early_exag_coeff=ee, nthreads=n_jobs,
        theta=angle,
    )
    print('-' * 80)
    print('fft interp %.4f' % (time.time() - start))
    plt.title('fft interp')
    plot(embedding, y)
    plt.show()

    return

    init = PCA(n_components=2).fit_transform(x)

    start = time.time()
    embedding = MulticoreTSNE(
        early_exaggeration=ee, learning_rate=learning_rate,
        perplexity=perplexity, n_jobs=n_jobs, cheat_metric=False,
        angle=angle, init=init, metric=metric, verbose=True,
    ).fit_transform(x)
    print('-' * 80)
    print('mctsne', time.time() - start)
    plt.title('mctsne')
    plot(embedding, y)
    plt.show()

    start = time.time()
    embedding = SKLTSNE(
        early_exaggeration=ee, learning_rate=learning_rate, angle=angle,
        perplexity=perplexity, init='pca', metric=metric,
    ).fit_transform(x)
    print('-' * 80)
    print('sklearn', time.time() - start)
    plt.title('sklearn')
    plot(embedding, y)
    plt.show()

def setUpClass(cls):
    cls.tsne = TSNE()
    cls.x = np.random.randn(100, 4)
    cls.x_test = np.random.randn(25, 4)

def setUpClass(cls):
    cls.tsne = TSNE()
    random_state = np.random.RandomState(42)
    cls.x = random_state.randn(100, 4)
    cls.x_test = random_state.randn(25, 4)

def test_fitted_pca_model(self):
    """Using PCA initialization in `transform` should work when the initial
    embedding was initialized with PCA."""
    tsne = TSNE(initialization='pca')
    embedding = tsne.fit(self.x)
    # Should not raise, since a PCA model was fitted for the initial embedding
    embedding.transform(self.x_test, initialization='pca')