def test_tsne_pickle(tmpdir, datatype, nrows, ncols):
    """Pickle a TSNE model before and after fitting and verify that both
    the hyper-parameters and the embedding survive the round trip.
    """
    iris = load_iris()
    iris_selection = np.random.RandomState(42).choice(
        [True, False], 150, replace=True, p=[0.75, 0.25])
    X = iris.data[iris_selection]

    model = cuml.manifold.TSNE(n_components=2, random_state=199)

    # Snapshot the original params before pickling. "handle" is a GPU
    # resource recreated on unpickling, so it is excluded.
    original_params = {k: v for k, v in model.__dict__.items()
                       if k != "handle"}

    # Pickle the model
    model_pickle = pickle_save_load(tmpdir, model)
    pickled_params = {k: v for k, v in model_pickle.__dict__.items()
                      if k != "handle"}

    # Confirm the restored params match the originals, key for key.
    # (The previous version compared the pickled dict against itself,
    # which was trivially true and tested nothing.)
    assert set(pickled_params.keys()) == set(original_params.keys())
    for key, value in original_params.items():
        assert pickled_params[key] == value

    # Transform data
    model.fit(X)
    trust_before = trustworthiness(X, model.Y, 10)

    # Save model + embeddings, then score the restored embedding.
    model = pickle_save_load(tmpdir, model)
    trust_after = trustworthiness(X, model.Y.to_pandas(), 10)

    assert trust_before == trust_after
def test_trustworthiness_not_euclidean_metric():
    """Trustworthiness computed with a non-euclidean metric must agree
    with the 'precomputed' code path fed the same pairwise distances."""
    rng = check_random_state(0)
    points = rng.randn(100, 2)
    direct = trustworthiness(points, points, metric='cosine')
    via_precomputed = trustworthiness(
        pairwise_distances(points, metric='cosine'),
        points, metric='precomputed')
    assert direct == via_precomputed
def test_umap_pickle(tmpdir, datatype, model, nrows, ncols):
    """Pickle a fitted UMAP model and verify the restored embedding is
    identical and its transform stays nearly as trustworthy.
    """
    iris = load_iris()
    iris_selection = np.random.RandomState(42).choice(
        [True, False], 150, replace=True, p=[0.75, 0.25])
    X_train = iris.data[iris_selection]

    cu_before_pickle_transform = model.fit_transform(X_train)
    cu_before_embed = model.arr_embed
    cu_trust_before = trustworthiness(X_train, cu_before_pickle_transform, 10)

    cu_after_pickle_model = pickle_save_load(tmpdir, model)
    cu_after_pickle_transform = cu_after_pickle_model.transform(X_train)
    # Read the embedding from the *unpickled* model. The previous
    # version read `model.arr_embed` again, so the equality check below
    # compared an object with itself and could never fail.
    cu_after_embed = cu_after_pickle_model.arr_embed
    cu_trust_after = trustworthiness(X_train, cu_after_pickle_transform, 10)

    assert array_equal(cu_before_embed, cu_after_embed)
    assert cu_trust_after >= cu_trust_before - 0.2
def test_umap_fit_transform_trust(name):
    """cuML UMAP should reach trustworthiness comparable to umap-learn
    on several standard datasets (or blobs as a fallback)."""
    if name == 'iris':
        bunch = datasets.load_iris()
        data, labels = bunch.data, bunch.target
    elif name == 'digits':
        bunch = datasets.load_digits(n_class=5)
        data, labels = bunch.data, bunch.target
    elif name == 'wine':
        bunch = datasets.load_wine()
        data, labels = bunch.data, bunch.target
    else:
        data, labels = make_blobs(n_samples=5000, n_features=10,
                                  centers=10, random_state=42)

    reference = umap.UMAP(n_neighbors=10, min_dist=0.01)
    candidate = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)

    ref_embedding = reference.fit_transform(data)
    cu_embedding = candidate.fit_transform(data, convert_dtype=True)

    ref_trust = trustworthiness(data, ref_embedding, 10)
    cu_trust = trustworthiness(data, cu_embedding, 10)
    assert array_equal(ref_trust, cu_trust, 1e-1, with_sign=True)
def test_tsne_knn_graph_used(name, type_knn_graph):
    """TSNE must honour a user-supplied KNN graph: distances computed
    from garbage data should noticeably hurt trustworthiness, in CSR,
    COO and CSC form alike.
    """
    # getattr replaces the eval() the original used to pick the loader.
    X = getattr(datasets, "load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)
    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    tsne = TSNE()

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y)
    print("Trust = ", trust_normal)

    # Distances computed against all-ones data are meaningless.
    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(X_garbage, mode="distance")

    # The garbage graph must degrade the embedding regardless of the
    # sparse format it is supplied in.
    for garbage_graph in (knn_graph_garbage,
                          knn_graph_garbage.tocoo(),
                          knn_graph_garbage.tocsc()):
        Y = tsne.fit_transform(X, True, garbage_graph)
        trust_garbage = trustworthiness(X, Y)
        print("Trust = ", trust_garbage)
        assert (trust_normal - trust_garbage) > 0.15
def test_trustworthiness_not_euclidean_metric():
    # A metric other than 'euclidean'/'precomputed' must yield the same
    # score whether distances are computed internally or supplied.
    rng = check_random_state(0)
    sample = rng.randn(100, 2)
    cosine_score = trustworthiness(sample, sample, metric='cosine')
    precomputed_score = trustworthiness(
        pairwise_distances(sample, metric='cosine'),
        sample,
        metric='precomputed')
    assert_equal(cosine_score, precomputed_score)
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) cuDF DataFrames are passed input
    (2) Numpy arrays are passed in
    (3) Params are changed in the TSNE class
    (4) The class gets re-used across time
    (5) Trustworthiness is checked
    (6) Tests NAN in TSNE output for learning rate explosions
    (7) Tests verbosity
    """
    # getattr replaces the eval() the original used to pick the loader.
    X = getattr(datasets, "load_{}".format(name))().data
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X))

    def _check(Y):
        # Embedding must be NaN-free and reasonably trustworthy.
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbose=0, learning_rate=2 + i)

        # cuDF input
        _check(tsne.fit_transform(X_cudf).to_pandas().values)

        # Reuse the same estimator on a numpy array
        _check(tsne.fit_transform(X))

        # Again, with verbosity on and different seed / learning rate
        tsne = TSNE(2, random_state=i + 2, verbose=1,
                    learning_rate=2 + i + 2)
        _check(tsne.fit_transform(X_cudf).to_pandas().values)

        # Reuse
        _check(tsne.fit_transform(X))
def run_training(corruption_chance, perplexity, batch_size):
    """Train PTSNE and VPTSNE on randomly corrupted copies of the data
    and report 1-NN accuracy and trustworthiness for both.

    Returns (knn_score, tw, knn_score_vptsne, tw_vptsne).

    NOTE(review): relies on module-level globals (train_data, test_data,
    train_labels, test_labels, non_corrupted_train_data,
    non_corrupted_test_data and the network-builder config names) —
    confirm they are defined elsewhere in this module.
    """
    global train_data, test_data
    # Zero out each feature value with probability `corruption_chance`.
    corrupt = lambda x: 0 if np.random.uniform() <= corruption_chance else x
    train_data = np.vectorize(corrupt)(train_data)
    test_data = np.vectorize(corrupt)(test_data)
    def hook(args):
        # Training callback: abort on a NaN loss, or on a non-positive
        # loss when the model being trained is the plain PTSNE.
        print(args)
        if np.isnan(args[2]):
            raise Exception
        if isinstance(args[0], PTSNE) and args[2] <= 0.0:
            raise Exception
    vae = VAE(
        [n_input_dimensions],
        get_gaussian_network_builder(vae_encoder_layers, n_latent_dimensions),
        gaussian_prior_supplier,
        gaussian_supplier,
        get_bernoulli_network_builder(vae_decoder_layers, n_input_dimensions),
        bernoulli_supplier)
    ptsne = PTSNE(
        [n_input_dimensions],
        get_feed_forward_network_builder(vptsne_layers, batch_normalization=False),
        perplexity=perplexity)
    vptsne = VPTSNE(
        vae,
        get_feed_forward_network_builder(vptsne_layers, batch_normalization=False),
        perplexity=perplexity)
    # Fit both models; VPTSNE additionally pre-trains its VAE.
    ptsne.fit(train_data, n_iters=1500, batch_size=batch_size, hook_fn=hook)
    vptsne.fit(train_data, n_iters=1500, n_vae_iters=10000, batch_size=batch_size, vae_batch_size=1000, hook_fn=hook)
    # 1-NN classification accuracy in each embedded space.
    knn_score = KNC(n_neighbors=1).fit(
        ptsne.transform(train_data), train_labels).score(
        ptsne.transform(test_data), test_labels)
    knn_score_vptsne = KNC(n_neighbors=1).fit(
        vptsne.transform(train_data), train_labels).score(
        vptsne.transform(test_data), test_labels)
    # Trustworthiness of the test-set embeddings.
    tw = trustworthiness(
        test_data, ptsne.transform(test_data), n_neighbors=12)
    tw_vptsne = trustworthiness(
        test_data, vptsne.transform(test_data), n_neighbors=12)
    # Restore the uncorrupted data for the next invocation.
    train_data = np.copy(non_corrupted_train_data)
    test_data = np.copy(non_corrupted_test_data)
    return knn_score, tw, knn_score_vptsne, tw_vptsne
def test_supervised_umap_trustworthiness_against_umap_learn():
    """Supervised cuML UMAP should match umap-learn's trustworthiness
    on iris to within a small tolerance."""
    iris = datasets.load_iris()
    data = iris.data
    cu_embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                          verbose=False).fit_transform(data, iris.target,
                                                       convert_dtype=True)
    skl_embedding = umap.UMAP(n_neighbors=10, min_dist=0.01,
                              verbose=False).fit_transform(data, iris.target)
    cu_trust = trustworthiness(iris.data, cu_embedding, 10)
    skl_trust = trustworthiness(iris.data, skl_embedding, 10)
    assert (skl_trust - 0.009) <= cu_trust <= (skl_trust + 0.009)
def test_tsne_knn_graph_used(dataset, type_knn_graph, method):
    """A garbage user-supplied KNN graph must measurably degrade the
    TSNE embedding, proving the supplied graph is actually used."""
    X = dataset.data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')
    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Baseline: embedding driven by the true KNN graph.
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)

    # Distances computed against all-ones data are meaningless.
    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(
        X_garbage, mode="distance").astype('float32')
    if type_knn_graph == 'cuml':
        knn_graph_garbage = cupyx.scipy.sparse.csr_matrix(knn_graph_garbage)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Run the garbage graph three times (as the original did) to guard
    # against a single lucky embedding.
    for _ in range(3):
        Y = tsne.fit_transform(X, True, knn_graph_garbage)
        trust_garbage = trustworthiness(X, Y,
                                        n_neighbors=DEFAULT_N_NEIGHBORS)
        assert (trust_normal - trust_garbage) > 0.15
def test_umap_trustworthiness_on_iris():
    # A seeded UMAP run should embed iris very faithfully.
    points = iris.data
    embedded = UMAP(n_neighbors=10, min_dist=0.01,
                    random_state=42).fit_transform(points)
    score = trustworthiness(iris.data, embedded, 10)
    assert score >= 0.97, \
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            score)
def check_embedding(X, Y, score=0.76):
    """Compares TSNE embedding trustworthiness, NAN and verbosity"""
    trust = trustworthiness(X, Y)
    print("Trust = ", trust)
    assert trust > score
    # No NaNs allowed (catches learning-rate explosions).
    assert np.sum(np.isnan(Y)) == 0
def test_tsne_transform_on_digits_sparse(input_type):
    """Sparse digits input (scipy or cupy CSR) should still produce a
    trustworthy TSNE embedding.
    """
    # The stray no-op `datasets` expression statement was removed.
    digits = datasets.load_digits()

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    # Pick the sparse backend matching the requested input type.
    sp_prefix = cupyx.scipy.sparse if input_type == 'cupy' else scipy.sparse

    fitter = TSNE(2, n_neighbors=15,
                  random_state=1,
                  learning_rate=500,
                  angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[~digits_selection]))

    embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        # Bring the embedding back to host memory for scoring.
        embedding = embedding.get()

    trust = trustworthiness(digits.data[~digits_selection], embedding, 15)
    assert trust >= 0.85
def test_umap_trustworthiness_on_sphere_iris():
    points = iris.data
    sphere_embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        output_metric="haversine",
    ).fit_transform(points)
    # trustworthiness has no haversine support, so map the spherical
    # coordinates onto a radius-3 sphere in R^3 and score with cosine.
    r = 3
    theta = sphere_embedding[:, 0]
    phi = sphere_embedding[:, 1]
    projected_embedding = np.vstack(
        [
            r * np.sin(theta) * np.cos(phi),
            r * np.sin(theta) * np.sin(phi),
            r * np.cos(theta),
        ]
    ).T
    score = trustworthiness(iris.data, projected_embedding, 10,
                            metric="cosine")
    assert_greater_equal(
        score,
        0.80,
        "Insufficiently trustworthy spherical embedding for iris dataset: {}".format(
            score
        ),
    )
def test_supervised_umap_trustworthiness_on_iris():
    # Supplying labels to fit_transform should keep trustworthiness high.
    bunch = datasets.load_iris()
    embedded = UMAP(n_neighbors=10, min_dist=0.01,
                    verbose=True).fit_transform(bunch.data, bunch.target)
    assert trustworthiness(bunch.data, embedded, 10) >= 0.97
def test_tsne_fit_transform_on_digits_sparse(input_type, method):
    """Sparse digits (scipy or cupy CSR) must embed to a trustworthy
    layout for every TSNE method under test."""
    digits = test_datasets['digits'].data

    # Pick the sparse backend matching the requested input type.
    sp_prefix = cupyx.scipy.sparse if input_type == 'cupy' else scipy.sparse

    fitter = TSNE(n_components=2,
                  random_state=1,
                  method=method,
                  min_grad_norm=1e-12,
                  n_neighbors=DEFAULT_N_NEIGHBORS,
                  learning_rate_method="none",
                  perplexity=DEFAULT_PERPLEXITY)

    sparse_digits = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits)).astype('float32')
    embedded = fitter.fit_transform(sparse_digits, convert_dtype=True)

    if input_type == 'cupy':
        # Bring the embedding back to host memory for scoring.
        embedded = embedded.get()

    score = trustworthiness(digits, embedded,
                            n_neighbors=DEFAULT_N_NEIGHBORS)
    assert score >= 0.85
def test_umap_trustworthiness_on_iris():
    # Unsupervised cuML UMAP on iris should preserve neighborhoods well.
    bunch = datasets.load_iris()
    embedded = cuUMAP(n_neighbors=10,
                      min_dist=0.01).fit_transform(bunch.data,
                                                   convert_dtype=True)
    assert trustworthiness(bunch.data, embedded, 10) >= 0.97
def compute_metrics(original_data, embedding_dict, labels):
    """Compute 1-NN generalization error, trustworthiness and cost for
    every embedding in ``embedding_dict``, keyed by ``key[0]``."""
    metric_dict = {
        '1NNgeneralization_error': {},
        'trustworthiness': {},
        'cost_function_value': {}
    }
    for key, embedding in embedding_dict.items():
        # 1NN generalization error
        labels_predict = predict_labels(embedding, labels)
        metric_dict['1NNgeneralization_error'][key[0]] = \
            compute_generalization_error(labels, labels_predict)

        # trustworthiness (12) — skipped for very large datasets since
        # it requires the full pairwise-distance computation.
        if len(labels) < 20000:
            metric_dict['trustworthiness'][key[0]] = trustworthiness(
                original_data, embedding[:, 0:2], n_neighbors=12)

        # cost function value (column 2 holds per-point costs)
        metric_dict['cost_function_value'][key[0]] = sum(embedding[:, 2])
    return metric_dict
def test_umap_fit_transform_trustworthiness_with_consistency_enabled():
    # Fixing random_state (reproducible mode) must not degrade quality.
    bunch = datasets.load_iris()
    reducer = cuUMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    embedded = reducer.fit_transform(bunch.data, convert_dtype=True)
    assert trustworthiness(bunch.data, embedded, 10) >= 0.97
def test_umap_trustworthiness_on_iris_random_init():
    # Random (non-spectral) initialization gets a slightly looser bound.
    bunch = datasets.load_iris()
    embedded = UMAP(
        n_neighbors=10, min_dist=0.01, init="random"
    ).fit_transform(bunch.data)
    assert trustworthiness(bunch.data, embedded, 10) >= 0.95
def test_umap_trustworthiness_on_iris():
    # Score the pre-fitted module-level iris model's stored embedding.
    score = trustworthiness(iris.data, iris_model.embedding_, 10)
    assert_greater_equal(
        score,
        0.97,
        "Insufficiently trustworthy embedding for"
        "iris dataset: {}".format(score),
    )
def test_preserve_trustworthiness_approximately_with_precomputed_distances():
    # Nearest neighbors should survive embedding of a precomputed
    # squared-euclidean distance matrix.
    rng = check_random_state(0)
    sample = rng.randn(100, 2)
    dist_matrix = squareform(pdist(sample), "sqeuclidean")
    embedder = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
                    metric="precomputed", random_state=0, verbose=0)
    embedded = embedder.fit_transform(dist_matrix)
    score = trustworthiness(dist_matrix, embedded,
                            n_neighbors=1, precomputed=True)
    assert_almost_equal(score, 1.0, decimal=1)
def validate_embedding(X, Y, score=0.74, n_neighbors=DEFAULT_N_NEIGHBORS):
    """Compares TSNE embedding trustworthiness, NAN and verbosity"""
    trust = trustworthiness(X, Y, n_neighbors=n_neighbors)
    print("Trust=%s" % trust)
    assert trust > score
    # No NaNs allowed (catches learning-rate explosions).
    assert np.sum(np.isnan(Y)) == 0
def test_supervised_umap_trustworthiness_on_iris():
    # Supervised fit (labels provided) should stay within tolerance of
    # the unsupervised quality bar.
    bunch = datasets.load_iris()
    embedded = cuUMAP(n_neighbors=10, min_dist=0.01,
                      verbose=False).fit_transform(bunch.data, bunch.target,
                                                   convert_dtype=True)
    assert trustworthiness(bunch.data, embedded, 10) >= \
        0.97 - TRUST_TOLERANCE_THRESH
def test_fit_csr_matrix():
    # X can be a sparse matrix.
    rng = check_random_state(0)
    dense = rng.randn(100, 2)
    # Knock out ~50 random entries so the CSR form is genuinely sparse.
    dense[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0
    sparse_input = sp.csr_matrix(dense)
    embedder = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                    random_state=0, method="exact")
    embedded = embedder.fit_transform(sparse_input)
    assert_almost_equal(
        trustworthiness(sparse_input, embedded, n_neighbors=1),
        1.0, decimal=1)
def test_trustworthiness():
    # Test trustworthiness score.
    rng = check_random_state(0)

    # An affine transformation preserves every neighborhood exactly.
    sample = rng.randn(100, 2)
    assert_equal(trustworthiness(sample, 5.0 + sample / 10.0), 1.0)

    # A random shuffle destroys most neighborhoods.
    sample = np.arange(100).reshape(-1, 1)
    shuffled = sample.copy()
    rng.shuffle(shuffled)
    assert_less(trustworthiness(sample, shuffled), 0.6)

    # A hand-crafted scramble with a known low score.
    sample = np.arange(5).reshape(-1, 1)
    scrambled = np.array([[0], [2], [4], [1], [3]])
    assert_almost_equal(
        trustworthiness(sample, scrambled, n_neighbors=1), 0.2)
def test_umap_sparse_trustworthiness():
    # UMAP fitted directly on sparse input, scored against the
    # densified data, should still be trustworthy.
    embedded = UMAP(n_neighbors=10).fit_transform(sparse_test_data[:100])
    score = trustworthiness(sparse_test_data[:100].toarray(), embedded, 10)
    assert_greater_equal(
        score,
        0.91,
        "Insufficiently trustworthy embedding for"
        "sparse test dataset: {}".format(score),
    )
def test_umap_trustworthiness_on_iris():
    bunch = datasets.load_iris()
    embedded = UMAP(n_neighbors=10, min_dist=0.01).fit_transform(bunch.data)
    score = trustworthiness(bunch.data, embedded, 10)
    # We are doing a spectral embedding but not a
    # multi-component layout (which is marked experimental).
    # As a result, our score drops by 0.006.
    assert score >= 0.964
def test_preserve_trustworthiness_approximately():
    """Nearest neighbors should be preserved approximately."""
    rng = check_random_state(0)
    sample = rng.randn(100, 2)
    for initialization in ('random', 'pca'):
        embedded = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                        init=initialization,
                        random_state=0).fit_transform(sample)
        assert_almost_equal(
            trustworthiness(sample, embedded, n_neighbors=1),
            1.0, decimal=1)
def test_preserve_trustworthiness_approximately(method, init):
    # Nearest neighbors should be preserved approximately.
    rng = check_random_state(0)
    dims = 2
    sample = rng.randn(50, dims).astype(np.float32)
    embedded = TSNE(n_components=dims, init=init, random_state=0,
                    method=method, n_iter=700).fit_transform(sample)
    assert trustworthiness(sample, embedded, n_neighbors=1) > 0.85
def test_semisupervised_umap_trustworthiness_on_iris():
    # Mask a third of the labels (-1 = unlabeled) and check the
    # semi-supervised embedding is still trustworthy.
    bunch = datasets.load_iris()
    partial_labels = bunch.target.copy()
    partial_labels[25:75] = -1
    embedded = cuUMAP(n_neighbors=10, min_dist=0.01,
                      verbose=False).fit_transform(bunch.data, partial_labels)
    assert trustworthiness(bunch.data, embedded, 10) >= \
        0.97 - TRUST_TOLERANCE_THRESH
def assert_model(pickled_model, X_train):
    """Check a round-tripped UMAP model against the stored reference
    results in the module-level ``result`` dict."""
    restored_embed = pickled_model.embedding_.to_output('numpy')
    k = pickled_model.n_neighbors
    assert array_equal(result["umap_embedding"], restored_embed)
    trust_after = trustworthiness(X_train,
                                  pickled_model.transform(X_train), k)
    assert trust_after >= result["umap"] - 0.2
def test_preserve_trustworthiness_approximately_with_precomputed_distances():
    # Nearest neighbors should be preserved approximately, across
    # several seeds, when feeding precomputed distances.
    rng = check_random_state(0)
    for seed in range(3):
        sample = rng.randn(100, 2)
        dists = squareform(pdist(sample), "sqeuclidean")
        embedded = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
                        early_exaggeration=2.0, metric="precomputed",
                        random_state=seed, verbose=0).fit_transform(dists)
        score = trustworthiness(dists, embedded, n_neighbors=1,
                                metric="precomputed")
        assert score > .95
def test_preserve_trustworthiness_approximately():
    # Nearest neighbors should be preserved approximately, for every
    # combination of init strategy and solver method.
    rng = check_random_state(0)
    dims = 2
    sample = rng.randn(50, dims).astype(np.float32)
    for initialization in ('random', 'pca'):
        for approach in ['exact', 'barnes_hut']:
            embedded = TSNE(n_components=dims, init=initialization,
                            random_state=0,
                            method=approach).fit_transform(sample)
            assert_greater(
                trustworthiness(sample, embedded, n_neighbors=1), 0.9)
def tsne(D, medoids_df, dest_dir, fn):
    # Reproducing braincode/calculate_cluster_medoids_tSNE
    print('2D TSNE embedding plotting')
    embedder = TSNE(n_components=2,
                    perplexity=5,
                    early_exaggeration=1.0,
                    learning_rate=10.0,
                    metric='precomputed',
                    verbose=True,
                    random_state=0)
    medoids2D = pd.DataFrame(embedder.fit_transform(D),
                             index=medoids_df.index)
    # Report embedding quality on the precomputed distances.
    print('Trusty TSNE: %.2f' % trustworthiness(
        D.values, medoids2D.values, n_neighbors=5, precomputed=True))
    fig, ax = plt.subplots(nrows=1, ncols=1)
    cluster_scatter_plot(medoids2D[0], medoids2D[1],
                         labels=map(str, medoids2D.index), ax=ax)
    plt.savefig(op.join(dest_dir, fn + '.singletons.tsne.png'))
def test_preserve_trustworthiness_approximately():
    # Nearest neighbors should be preserved approximately. Barnes-Hut
    # estimates P_ij from only ~3*perplexity nearest neighbors, so the
    # perplexity is raised to cover a large fraction of the data.
    rng = check_random_state(0)
    dims = 2
    sample = rng.randn(100, dims).astype(np.float32)
    for initialization in ('random', 'pca'):
        for approach in ['exact', 'barnes_hut']:
            embedded = TSNE(n_components=dims, perplexity=50,
                            learning_rate=100.0, init=initialization,
                            random_state=0,
                            method=approach).fit_transform(sample)
            assert_almost_equal(
                trustworthiness(sample, embedded, n_neighbors=1),
                1.0, decimal=1)