def test_umap_transform_on_iris_modified_dtype(iris, iris_selection):
    """Transform still works when the stored embedding is cast to float64."""
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    # Exercise dtype handling: the learned embedding is widened to float64
    # before transforming new points.
    model.embedding_ = model.embedding_.astype(np.float64)

    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)

    score = trustworthiness(held_out, projected, 10)
    assert_greater_equal(
        score,
        0.8,
        "Insufficiently trustworthy transform for iris dataset: {}".format(score),
    )
def test_umap_transform_on_iris(iris, iris_selection):
    """Transforming held-out iris rows produces a trustworthy embedding.

    Fits UMAP on a subset of iris, transforms the remaining rows, and
    checks trustworthiness against a 0.85 threshold.
    """
    data = iris.data[iris_selection]
    fitter = UMAP(
        n_neighbors=10, min_dist=0.01, n_epochs=200, random_state=42
    ).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.85,
        # Fixed: the original adjacent literals ("for" "iris ...") concatenated
        # to "foriris dataset"; a trailing space restores the intended message.
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust),
    )
def test_umap_trustworthiness_on_sphere_iris(iris):
    """A spherical (haversine output metric) iris embedding is trustworthy."""
    data = iris.data
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        output_metric="haversine",
    ).fit_transform(data)

    # trustworthiness() does not support haversine, so lift the angular
    # coordinates onto a radius-3 sphere in R^3 and compare with cosine.
    radius = 3
    theta = embedding[:, 0]
    phi = embedding[:, 1]
    projected_embedding = np.vstack(
        [
            radius * np.sin(theta) * np.cos(phi),
            radius * np.sin(theta) * np.sin(phi),
            radius * np.cos(theta),
        ]
    ).T

    trust = trustworthiness(iris.data, projected_embedding, 10, metric="cosine")
    assert_greater_equal(
        trust,
        0.80,
        "Insufficiently trustworthy spherical embedding for iris dataset: {}".format(
            trust
        ),
    )
def test_multi_component_layout():
    """Multi-component embedding keeps cluster centroids roughly in place.

    Builds 5 well-separated 2D blobs, embeds them with UMAP, re-clusters
    the embedding with KMeans, and compares the L2-normalized centroids of
    the true versus recovered clusters.
    """
    data, labels = make_blobs(
        100, 2, centers=5, cluster_std=0.5, center_box=[-20, 20], random_state=42
    )

    n_clusters = labels.max() + 1
    true_centroids = np.empty((n_clusters, data.shape[1]), dtype=np.float64)
    for label in range(n_clusters):
        true_centroids[label] = data[labels == label].mean(axis=0)
    true_centroids = normalize(true_centroids, norm="l2")

    embedding = UMAP(n_neighbors=4).fit_transform(data)
    embed_labels = KMeans(n_clusters=5).fit_predict(embedding)

    # Centroids are computed in the ORIGINAL data space, grouped by the
    # clusters recovered from the embedding.
    embed_centroids = np.empty((n_clusters, data.shape[1]), dtype=np.float64)
    for label in range(embed_labels.max() + 1):
        embed_centroids[label] = data[embed_labels == label].mean(axis=0)
    embed_centroids = normalize(embed_centroids, norm="l2")

    error = np.sum((true_centroids - embed_centroids) ** 2)
    # Fixed typo in the failure message: "to far" -> "too far".
    assert_less(error, 15.0, msg="Multi component embedding too far astray")
def test_umap_transform_embedding_stability(iris, iris_selection):
    """Transforming new data must not mutate the fitted embedding.

    Regression test for issue #217: calling transform() on new data with the
    same number of rows as the training data used to corrupt the stored
    embedding matrix of the fitted transformer.
    """
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    baseline = model.embedding_.copy()

    # The critical case: new data with exactly as many rows as the fit data.
    _ = model.transform(np.random.random(train.shape))
    assert_array_equal(
        baseline,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Direct reproduction from issue #217.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))
    umap = UMAP()
    u1 = umap.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, umap.embedding_)
    _ = umap.transform(b)
    assert_array_equal(u1_orig, umap.embedding_)
def test_umap_sparse_trustworthiness(sparse_test_data):
    """Embedding a sparse input matrix yields a trustworthy result."""
    embedding = UMAP(n_neighbors=10).fit_transform(sparse_test_data[:100])
    trust = trustworthiness(sparse_test_data[:100].toarray(), embedding, 10)
    assert_greater_equal(
        trust,
        0.89,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "forsparse test dataset".
        "Insufficiently trustworthy embedding for "
        "sparse test dataset: {}".format(trust),
    )
def test_supervised_umap_trustworthiness():
    """Label-supervised UMAP on blobs produces a highly trustworthy embedding."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "forblobs dataset".
        "Insufficiently trustworthy embedding for "
        "blobs dataset: {}".format(trust),
    )
def test_umap_trustworthiness_random_init(nn_data):
    """Random initialization still yields an acceptably trustworthy embedding."""
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, init="random"
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "fornn dataset".
        "Insufficiently trustworthy embedding for "
        "nn dataset: {}".format(trust),
    )
def test_umap_sparse_transform_on_iris(iris, iris_selection):
    """transform() on sparse held-out iris rows is trustworthy.

    Fits on a sparse CSR subset (forcing the approximate algorithm) and
    transforms the sparse complement.
    """
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.80,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "foriris dataset".
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust),
    )
def test_initialized_umap_trustworthiness_on_iris(iris):
    """A user-supplied initial layout still yields a trustworthy embedding."""
    data = iris.data
    # Initialize the layout from two of the original feature columns.
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        init=data[:, 2:],
        n_epochs=200,
        random_state=42,
    ).fit_transform(data)
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "foriris dataset".
        "Insufficiently trustworthy embedding for "
        "iris dataset: {}".format(trust),
    )
def test_umap_trustworthiness_fast_approx(nn_data):
    """The forced approximate algorithm still produces a trustworthy embedding."""
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "fornn dataset".
        "Insufficiently trustworthy embedding for "
        "nn dataset: {}".format(trust),
    )
def test_discrete_metric_supervised_umap_trustworthiness():
    """Ordinal target metric supervision yields a trustworthy embedding."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        # Fixed: trailing space added so the concatenated literals no longer
        # produce "forblobs dataset".
        "Insufficiently trustworthy embedding for "
        "blobs dataset: {}".format(trust),
    )
def test_bad_transform_data(nn_data):
    """transform() raises ValueError on data it cannot embed.

    NOTE(review): the nn_data fixture parameter is unused here; kept for
    interface compatibility — confirm whether it can be dropped.
    """
    # Fit on a single all-ones row; transforming a different row must fail.
    model = UMAP().fit([[1, 1, 1, 1]])
    assert_raises(ValueError, model.transform, [[0, 0, 0, 0]])
def test_blobs_cluster():
    """Well-separated blobs remain perfectly clusterable after embedding."""
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    embedding = UMAP().fit_transform(data)
    predicted = KMeans(5).fit_predict(embedding)
    # Adjusted Rand index of 1.0 means the clustering is recovered exactly.
    assert_equal(adjusted_rand_score(labels, predicted), 1.0)
def test_repeated_points_large_n(repetition_dense):
    """With unique=True, the effective neighbor count shrinks to fit the data.

    NOTE(review): the expected value 3 presumably reflects the number of
    distinct rows in the repetition_dense fixture — confirm against fixture.
    """
    model = UMAP(n_neighbors=5, unique=True).fit(repetition_dense)
    assert_equal(model._n_neighbors, 3)
def test_repeated_points_small_dense_binary(binary_repeats):
    """Duplicate binary rows map to identical embedding coordinates."""
    model = UMAP(n_neighbors=3, unique=True).fit(binary_repeats)
    # Sanity check: the first two input rows are duplicates of each other...
    assert_equal(np.unique(binary_repeats[0:2], axis=0).shape[0], 1)
    # ...so they must land on a single shared embedding point.
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)
def test_repeated_points_large_dense_binary(binary_repeats):
    """Duplicate rows share an embedding point under the approximate algorithm."""
    model = UMAP(
        n_neighbors=3, unique=True, force_approximation_algorithm=True
    ).fit(binary_repeats)
    # The first two input rows are duplicates, so their embeddings coincide.
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)
def test_repeated_points_small_sparse_spatial(sparse_spatial_data_repeats):
    """Duplicate sparse spatial rows map to a single embedding point."""
    model = UMAP(n_neighbors=3, unique=True).fit(sparse_spatial_data_repeats)
    # The first two rows of the fixture are duplicates, so their embedded
    # coordinates must be identical.
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)
def supervised_iris_model(iris):
    """Build a UMAP model fit on iris with label supervision (fixed seed)."""
    model = UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200, random_state=42)
    return model.fit(iris.data, iris.target)
def iris_model(iris):
    """Build an unsupervised UMAP model fit on the iris data (fixed seed)."""
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    return model.fit(iris.data)
joint_umap_obj_uni = JUMAP(init='random') joint_umap_uni = joint_umap_obj_uni.fit_transform(X=data, method='uniform') # Save data np.savetxt(data_name + "_uniformjumap" + "_noise" + str(noise_level) + ".csv", joint_umap_uni, delimiter=",") # Run concat print("Run concat") concat = np.concatenate( (normalize_matrix(expr_mat_log_t), normalize_matrix(adt_mat), normalize_matrix(expr_mat_shuffle)), axis=1) expr_reduced = PCA(n_components=100).fit_transform(concat) concat_umap = UMAP(init='random').fit_transform(expr_reduced) # save data np.savetxt(data_name + "_concatFroumap" + "_noise" + str(noise_level) + ".csv", concat_umap, delimiter=",") print("Run concat") concat = np.concatenate((normalize_matrix( expr_mat_log_t, "max"), normalize_matrix( adt_mat, "max"), normalize_matrix(expr_mat_shuffle, "max")), axis=1) expr_reduced = PCA(n_components=100).fit_transform(concat) concat_umap = UMAP(init='random').fit_transform(expr_reduced) # save data np.savetxt(data_name + "_concatMaxumap" + "_noise" + str(noise_level) + ".csv",