Example #1
0
def test_umap_transform_on_iris_modified_dtype(iris, iris_selection):
    """Transforming held-out iris rows should remain trustworthy even
    after the fitted embedding is up-cast to float64."""
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    # Up-cast the learned embedding before transforming new data.
    model.embedding_ = model.embedding_.astype(np.float64)

    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)

    score = trustworthiness(held_out, projected, 10)
    assert_greater_equal(
        score,
        0.8,
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            score),
    )
Example #2
0
def test_umap_transform_on_iris(iris, iris_selection):
    """Fit UMAP on a subset of iris and check that transforming the
    held-out rows yields a sufficiently trustworthy embedding."""
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200,
                  random_state=42).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.85,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...transform foriris dataset".
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust),
    )
Example #3
0
def test_umap_trustworthiness_on_sphere_iris(iris):
    """Embed iris with a haversine output metric, then check the
    trustworthiness of the corresponding 3D spherical coordinates."""
    data = iris.data
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        output_metric="haversine",
    ).fit_transform(data)
    # trustworthiness doesn't support haversine, so convert the angular
    # embedding to points on a radius-3 sphere and use cosine distance.
    radius = 3
    theta = embedding[:, 0]
    phi = embedding[:, 1]
    projected_embedding = np.vstack([
        radius * np.sin(theta) * np.cos(phi),
        radius * np.sin(theta) * np.sin(phi),
        radius * np.cos(theta),
    ]).T
    score = trustworthiness(iris.data,
                            projected_embedding,
                            10,
                            metric="cosine")
    assert_greater_equal(
        score,
        0.80,
        "Insufficiently trustworthy spherical embedding for iris dataset: {}".
        format(score),
    )
Example #4
0
def test_multi_component_layout():
    """Embed well-separated blobs and check that KMeans clusters of the
    embedding recover centroids close to the true blob centroids."""
    data, labels = make_blobs(100,
                              2,
                              centers=5,
                              cluster_std=0.5,
                              center_box=[-20, 20],
                              random_state=42)

    true_centroids = np.empty((labels.max() + 1, data.shape[1]),
                              dtype=np.float64)

    for label in range(labels.max() + 1):
        true_centroids[label] = data[labels == label].mean(axis=0)

    true_centroids = normalize(true_centroids, norm="l2")

    embedding = UMAP(n_neighbors=4).fit_transform(data)
    embed_centroids = np.empty((labels.max() + 1, data.shape[1]),
                               dtype=np.float64)
    embed_labels = KMeans(n_clusters=5).fit_predict(embedding)

    # Centroids are computed in the ORIGINAL data space, grouped by the
    # cluster labels found in embedding space.
    for label in range(embed_labels.max() + 1):
        embed_centroids[label] = data[embed_labels == label].mean(axis=0)

    embed_centroids = normalize(embed_centroids, norm="l2")

    error = np.sum((true_centroids - embed_centroids)**2)

    # Fixed typo in the failure message: "to" -> "too".
    assert_less(error, 15.0, msg="Multi component embedding too far astray")
Example #5
0
def test_umap_transform_embedding_stability(iris, iris_selection):
    """Test that transforming data does not alter the learned embeddings

    Issue #217 describes how using transform to embed new data using a
    trained UMAP transformer causes the fitting embedding matrix to change
    in cases when the new data has the same number of rows as the original
    training data.
    """

    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    baseline = model.embedding_.copy()

    # The important point is that the new data has the same number of rows
    # as the original fit data.
    same_shape_data = np.random.random(train.shape)
    _ = model.transform(same_shape_data)

    assert_array_equal(
        baseline,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Example from issue #217
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    umap = UMAP()
    first_embedding = umap.fit_transform(a[:, :5])
    snapshot = first_embedding.copy()
    assert_array_equal(snapshot, umap.embedding_)

    _ = umap.transform(b)
    assert_array_equal(snapshot, umap.embedding_)
Example #6
0
def test_umap_sparse_trustworthiness(sparse_test_data):
    """Embedding sparse input should remain trustworthy versus the dense
    version of the same data."""
    embedding = UMAP(n_neighbors=10).fit_transform(sparse_test_data[:100])
    trust = trustworthiness(sparse_test_data[:100].toarray(), embedding, 10)
    assert_greater_equal(
        trust,
        0.89,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding forsparse test dataset".
        "Insufficiently trustworthy embedding for "
        "sparse test dataset: {}".format(trust),
    )
Example #7
0
def test_supervised_umap_trustworthiness():
    """Supervised UMAP on blob data should produce a highly trustworthy
    embedding."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(n_neighbors=10, min_dist=0.01,
                     random_state=42).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding forblobs dataset".
        "Insufficiently trustworthy embedding for "
        "blobs dataset: {}".format(trust),
    )
Example #8
0
def test_umap_trustworthiness_random_init(nn_data):
    """UMAP with random initialization should still produce a reasonably
    trustworthy embedding."""
    data = nn_data[:50]
    embedding = UMAP(n_neighbors=10,
                     min_dist=0.01,
                     random_state=42,
                     init="random").fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding fornn dataset".
        "Insufficiently trustworthy embedding for "
        "nn dataset: {}".format(trust),
    )
Example #9
0
def test_umap_sparse_transform_on_iris(iris, iris_selection):
    """Fit UMAP on sparse iris data and check that transforming sparse
    held-out rows yields a sufficiently trustworthy embedding."""
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.80,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...transform foriris dataset".
        "Insufficiently trustworthy transform for "
        "iris dataset: {}".format(trust),
    )
Example #10
0
def test_initialized_umap_trustworthiness_on_iris(iris):
    """UMAP initialized from two data columns should produce a highly
    trustworthy iris embedding."""
    data = iris.data
    embedding = UMAP(n_neighbors=10,
                     min_dist=0.01,
                     init=data[:, 2:],
                     n_epochs=200,
                     random_state=42).fit_transform(data)
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding foriris dataset".
        "Insufficiently trustworthy embedding for "
        "iris dataset: {}".format(trust),
    )
Example #11
0
def test_umap_trustworthiness_fast_approx(nn_data):
    """Forcing the approximate nearest-neighbor algorithm should still
    produce a reasonably trustworthy embedding."""
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding fornn dataset".
        "Insufficiently trustworthy embedding for "
        "nn dataset: {}".format(trust),
    )
Example #12
0
def test_discrete_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with an ordinal target metric should produce a
    highly trustworthy embedding of blob data."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        # Trailing space added: the adjacent literals previously
        # concatenated to "...embedding forblobs dataset".
        "Insufficiently trustworthy embedding for "
        "blobs dataset: {}".format(trust),
    )
Example #13
0
def test_bad_transform_data(nn_data):
    """transform() on this minimally-fitted model must raise ValueError."""
    model = UMAP().fit([[1, 1, 1, 1]])
    assert_raises(ValueError, model.transform, [[0, 0, 0, 0]])
Example #14
0
def test_blobs_cluster():
    """KMeans on the UMAP embedding of well-separated blobs should
    recover the blob labels perfectly (adjusted Rand score of 1.0)."""
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    embedding = UMAP().fit_transform(data)
    predicted = KMeans(5).fit_predict(embedding)
    assert_equal(adjusted_rand_score(labels, predicted), 1.0)
def test_repeated_points_large_n(repetition_dense):
    # With unique=True the effective neighbor count is reduced below the
    # requested 5 — presumably to fit the deduplicated data; the expected
    # value for this fixture is 3.
    fitted = UMAP(n_neighbors=5, unique=True).fit(repetition_dense)
    assert_equal(fitted._n_neighbors, 3)
def test_repeated_points_small_dense_binary(binary_repeats):
    """Duplicate input rows must map to identical embedding rows when
    unique=True."""
    fitted = UMAP(n_neighbors=3, unique=True).fit(binary_repeats)
    # The first two fixture rows are duplicates of each other...
    assert_equal(np.unique(binary_repeats[0:2], axis=0).shape[0], 1)
    # ...so their embeddings must coincide as well.
    assert_equal(np.unique(fitted.embedding_[0:2], axis=0).shape[0], 1)
def test_repeated_points_large_dense_binary(binary_repeats):
    """Duplicate rows must still share an embedding when the approximate
    algorithm is forced with unique=True."""
    fitted = UMAP(n_neighbors=3,
                  unique=True,
                  force_approximation_algorithm=True).fit(binary_repeats)
    # The first two fixture rows are duplicates, so their embeddings
    # must coincide.
    assert_equal(np.unique(fitted.embedding_[0:2], axis=0).shape[0], 1)
def test_repeated_points_small_sparse_spatial(sparse_spatial_data_repeats):
    """Duplicate sparse rows must map to identical embedding rows when
    unique=True."""
    fitted = UMAP(n_neighbors=3, unique=True).fit(sparse_spatial_data_repeats)
    assert_equal(np.unique(fitted.embedding_[0:2], axis=0).shape[0], 1)
Example #19
0
def supervised_iris_model(iris):
    """Return a UMAP model fit on iris data, supervised by its targets."""
    model = UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200,
                 random_state=42)
    return model.fit(iris.data, iris.target)
Example #20
0
def iris_model(iris):
    """Return an unsupervised UMAP model fit on the iris data."""
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    return model.fit(iris.data)
Example #21
0
    joint_umap_obj_uni = JUMAP(init='random')
    joint_umap_uni = joint_umap_obj_uni.fit_transform(X=data, method='uniform')
    # Save data
    np.savetxt(data_name + "_uniformjumap" + "_noise" + str(noise_level) +
               ".csv",
               joint_umap_uni,
               delimiter=",")

    # Run concat
    print("Run concat")
    concat = np.concatenate(
        (normalize_matrix(expr_mat_log_t), normalize_matrix(adt_mat),
         normalize_matrix(expr_mat_shuffle)),
        axis=1)
    expr_reduced = PCA(n_components=100).fit_transform(concat)
    concat_umap = UMAP(init='random').fit_transform(expr_reduced)
    # save data
    np.savetxt(data_name + "_concatFroumap" + "_noise" + str(noise_level) +
               ".csv",
               concat_umap,
               delimiter=",")
    print("Run concat")
    concat = np.concatenate((normalize_matrix(
        expr_mat_log_t, "max"), normalize_matrix(
            adt_mat, "max"), normalize_matrix(expr_mat_shuffle, "max")),
                            axis=1)
    expr_reduced = PCA(n_components=100).fit_transform(concat)
    concat_umap = UMAP(init='random').fit_transform(expr_reduced)
    # save data
    np.savetxt(data_name + "_concatMaxumap" + "_noise" + str(noise_level) +
               ".csv",