Esempio n. 1
0
def test_umap_transform_on_iris():
    """Fit UMAP on part of iris and check ``transform`` on the held-out rows.

    Each sample is assigned to the fitting subset with probability 0.75;
    the remaining rows are embedded with ``transform`` and that embedding
    must reach a trustworthiness of at least 0.90 (third argument 10).
    """
    iris = datasets.load_iris()

    # Draw the split from a dedicated, seeded generator: the test becomes
    # reproducible and the global NumPy RNG state is left untouched.
    rng = np.random.RandomState(42)
    iris_selection = rng.choice(
        [True, False], len(iris.data), replace=True, p=[0.75, 0.25])
    data = iris.data[iris_selection]

    fitter = UMAP(n_neighbors=10, min_dist=0.01, verbose=True)
    fitter.fit(data)

    # Rows that were not used for fitting.
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert trust >= 0.90
Esempio n. 2
0
def test_supervised_umap_trustworthiness_on_iris():
    """Embed iris with its targets supplied and require trust >= 0.97."""
    iris = datasets.load_iris()
    features = iris.data

    # The iris targets are handed to fit_transform as the second argument.
    model = UMAP(n_neighbors=10, min_dist=0.01, verbose=True)
    projected = model.fit_transform(features, iris.target)

    score = trustworthiness(iris.data, projected, 10)
    assert score >= 0.97
Esempio n. 3
0
def test_blobs_cluster():
    """Embed five 10-feature blobs and require KMeans to recover them."""
    points, truth = datasets.make_blobs(
        n_samples=500, n_features=10, centers=5)

    projected = UMAP(verbose=True).fit_transform(points)
    predicted = KMeans(5).fit_predict(projected)

    # Only a perfect agreement between the two labelings is accepted.
    ari = adjusted_rand_score(truth, predicted)
    assert ari == 1.0
Esempio n. 4
0
def test_umap_trustworthiness_on_iris_random_init():
    """Embed iris with ``init="random"`` and require trust >= 0.95."""
    iris = datasets.load_iris()
    features = iris.data

    model = UMAP(n_neighbors=10, min_dist=0.01, init="random")
    projected = model.fit_transform(features)

    score = trustworthiness(iris.data, projected, 10)
    assert score >= 0.95
Esempio n. 5
0
def test_umap_trustworthiness_on_iris():
    """Embed iris with default initialisation and require trust >= 0.964."""
    iris = datasets.load_iris()
    features = iris.data

    model = UMAP(n_neighbors=10, min_dist=0.01)
    projected = model.fit_transform(features)

    score = trustworthiness(iris.data, projected, 10)

    # A spectral embedding is used here, but without the multi-component
    # layout (which is marked experimental). Because of that the score
    # comes out 0.006 lower, hence the threshold below.
    assert score >= 0.964
Esempio n. 6
0
def test_umap_data_formats(input_type, should_downcast):
    """Check the container type returned by ``fit_transform``.

    With ``input_type == 'dataframe'`` the digits data is wrapped in a
    ``cudf.DataFrame`` and the result must be exactly a ``cudf.DataFrame``;
    for any other ``input_type`` the raw array is used and the result must
    be exactly an ``np.ndarray``. The data is float64 when
    ``should_downcast`` is truthy and float32 otherwise.
    """
    # For now, FAISS based nearest_neighbors only supports single precision
    dtype = np.float64 if should_downcast else np.float32

    digits = datasets.load_digits(n_class=9)
    samples = digits["data"].astype(dtype)

    model = UMAP(n_neighbors=3, n_components=2,
                 should_downcast=should_downcast, verbose=True)

    # Pick the input container and the output type expected for it.
    if input_type == 'dataframe':
        samples = cudf.DataFrame.from_pandas(pd.DataFrame(samples))
        expected_type = cudf.DataFrame
    else:
        expected_type = np.ndarray

    embeds = model.fit_transform(samples)

    assert type(embeds) == expected_type
Esempio n. 7
0
 def create_umap(self, features):
     """Run UMAP on ``self.features`` with hyper-parameters read from FLAGS.

     NOTE(review): the ``features`` argument is never read; the embedding
     is computed from ``self.features``. Confirm which one is intended.

     Returns the result of ``UMAP(...).fit_transform(self.features)``.
     """
     # move flags to constructor?
     umap_kwargs = {
         "n_neighbors": FLAGS.umap_n_neighbors,
         "n_components": FLAGS.umap_n_components,
         "n_epochs": FLAGS.umap_n_epochs,
         "learning_rate": FLAGS.umap_learning_rate,
         "init": FLAGS.umap_init,
         "min_dist": FLAGS.umap_min_dist,
         "spread": FLAGS.umap_spread,
         "set_op_mix_ratio": FLAGS.umap_set_op_mix,
         "local_connectivity": FLAGS.umap_local_connectivity,
         "repulsion_strength": FLAGS.umap_repulsion_strength,
         "negative_sample_rate": FLAGS.umap_negative_sample_rate,
         "transform_queue_size": FLAGS.umap_transform_queue_size,
         # a and b are passed explicitly as None.
         "a": None,
         "b": None,
         "verbose": FLAGS.umap_verbose,
     }
     embedding = UMAP(**umap_kwargs).fit_transform(self.features)
     return embedding
Esempio n. 8
0
def test_umap_downcast_fails(input_type):
    """``fit`` must raise for two float64 inputs.

    1. Double-precision data while ``should_downcast`` is False.
    2. A float64 value equal to the float32 maximum while
       ``should_downcast`` is True.

    When ``input_type == 'dataframe'`` the data is wrapped in a
    ``cudf.DataFrame`` before being passed to ``fit``.
    """

    def _as_input(array):
        # Wrap in a cudf DataFrame only for the 'dataframe' parametrisation.
        if input_type == 'dataframe':
            return cudf.DataFrame.from_pandas(pd.DataFrame(array))
        return array

    # Test fit() fails with double precision when should_downcast set to False
    doubles = np.array(
        [[1.0, 1.0], [50.0, 1.0], [51.0, 1.0]],
        dtype=np.float64,
    )
    strict_model = UMAP(should_downcast=False, verbose=True)
    # Convert outside the raises-block so only fit() may raise inside it.
    doubles = _as_input(doubles)

    with pytest.raises(Exception):
        strict_model.fit(doubles, should_downcast=False)

    # Test fit() fails when downcast corrupted data
    extreme = np.array([[np.finfo(np.float32).max]], dtype=np.float64)
    downcasting_model = UMAP(should_downcast=True)
    extreme = _as_input(extreme)

    with pytest.raises(Exception):
        downcasting_model.fit(extreme, should_downcast=True)
Esempio n. 9
0
def test_simplicial_set_embedding(n_rows, n_features, n_neighbors,
                                  n_components):
    """Compare ``cu_simplicial_set_embedding`` with the reference version.

    Blob data with 30 centers is generated, a fuzzy simplicial set is built
    with both the reference and the ``cu_`` implementation, each graph is
    embedded with matching settings, and the two embeddings are checked
    with ``correctness_dense`` (rtol=0.1, threshold=0.95).
    """
    # Data generation settings.
    n_clusters = 30
    random_state = 42

    # Input / output metric configuration.
    metric = 'euclidean'
    metric_kwds = {}
    output_metric = 'euclidean'
    output_metric_kwds = {}

    # Embedding optimisation settings shared by both implementations.
    initial_alpha = 1.0
    a, b = UMAP.find_ab_params(1.0, 0.1)
    gamma = 0
    negative_sample_rate = 5
    n_epochs = 500
    init = 'random'

    # densMAP options; these are only passed to the reference call.
    densmap = False
    densmap_kwds = {}
    output_dens = False

    blobs, _ = make_blobs(n_samples=n_rows,
                          centers=n_clusters,
                          n_features=n_features,
                          random_state=random_state)
    # NOTE(review): .get() presumably copies a device array to host memory
    # - confirm against the make_blobs implementation in use.
    X = blobs.get()

    # Reference pipeline: only the first element of each result is used.
    ref_fss_graph = ref_fuzzy_simplicial_set(
        X, n_neighbors, random_state, metric)[0]
    ref_rng = np.random.RandomState(random_state)
    ref_metric = dist.named_distances_with_gradients[metric]
    ref_result = ref_simplicial_set_embedding(
        X, ref_fss_graph, n_components, initial_alpha, a, b, gamma,
        negative_sample_rate, n_epochs, init, ref_rng, ref_metric,
        metric_kwds, densmap, densmap_kwds, output_dens,
        output_metric=output_metric,
        output_metric_kwds=output_metric_kwds)
    ref_embedding = ref_result[0]

    # cu_ pipeline: takes the seed and the metric name directly.
    cu_fss_graph = cu_fuzzy_simplicial_set(
        X, n_neighbors, random_state, metric)
    cu_embedding = cu_simplicial_set_embedding(
        X, cu_fss_graph, n_components, initial_alpha, a, b, gamma,
        negative_sample_rate, n_epochs, init, random_state, metric,
        metric_kwds,
        output_metric=output_metric,
        output_metric_kwds=output_metric_kwds)

    # The reference result is wrapped with cp.array before the comparison.
    assert correctness_dense(cp.array(ref_embedding),
                             cu_embedding,
                             rtol=0.1,
                             threshold=0.95)