def test_umap_transform_on_iris():
    """Fit UMAP on a random subset of iris and transform the held-out rows.

    Each of the 150 rows is assigned to the fitting set with probability
    0.75; the remaining rows are projected with ``transform`` and that
    projection must reach a trustworthiness of at least 0.90.

    NOTE(review): the split uses the global NumPy RNG without a seed, so the
    fitting/held-out partition differs between runs.
    """
    iris = datasets.load_iris()

    # Boolean mask: True -> row is used for fitting, False -> row is held out.
    fit_mask = np.random.choice(
        [True, False], 150, replace=True, p=[0.75, 0.25])
    held_out = iris.data[~fit_mask]

    model = UMAP(n_neighbors=10, min_dist=0.01, verbose=True)
    model.fit(iris.data[fit_mask])

    projected = model.transform(held_out)
    assert trustworthiness(held_out, projected, 10) >= 0.90
def test_supervised_umap_trustworthiness_on_iris():
    """Check supervised ``fit_transform`` on iris.

    The iris targets are passed as the second argument to ``fit_transform``
    and the resulting embedding must score a trustworthiness of at least
    0.97 (10 neighbors).
    """
    iris = datasets.load_iris()
    model = UMAP(n_neighbors=10, min_dist=0.01, verbose=True)
    embedding = model.fit_transform(iris.data, iris.target)
    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.97
def test_blobs_cluster():
    """Embed 5 generated blobs and require KMeans to recover them exactly.

    500 samples with 10 features are generated around 5 centers, embedded
    with a default ``UMAP``, clustered with ``KMeans(5)``, and the adjusted
    Rand score against the generated labels must equal 1.0.
    """
    points, true_labels = datasets.make_blobs(
        n_samples=500, n_features=10, centers=5)
    embedding = UMAP(verbose=True).fit_transform(points)
    predicted_labels = KMeans(5).fit_predict(embedding)
    assert adjusted_rand_score(true_labels, predicted_labels) == 1.0
def test_umap_trustworthiness_on_iris_random_init():
    """Check that UMAP with ``init="random"`` embeds iris faithfully.

    The embedding must score a trustworthiness of at least 0.95
    (10 neighbors).
    """
    iris = datasets.load_iris()
    model = UMAP(n_neighbors=10, min_dist=0.01, init="random")
    embedding = model.fit_transform(iris.data)
    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.95
def test_umap_trustworthiness_on_iris():
    """Check that UMAP with its default initialisation embeds iris faithfully.

    The embedding must score a trustworthiness of at least 0.964
    (10 neighbors).
    """
    iris = datasets.load_iris()
    model = UMAP(n_neighbors=10, min_dist=0.01)
    embedding = model.fit_transform(iris.data)
    score = trustworthiness(iris.data, embedding, 10)

    # We are doing a spectral embedding but not a
    # multi-component layout (which is marked experimental).
    # As a result, our score drops by 0.006.
    assert score >= 0.964
def test_umap_data_formats(input_type, should_downcast):
    """Check that ``fit_transform`` returns the same container kind it was given.

    Parameters
    ----------
    input_type : str
        ``'dataframe'`` feeds a cuDF DataFrame and expects one back; any
        other value feeds a NumPy array and expects a NumPy array back.
    should_downcast : bool
        Forwarded to the ``UMAP`` constructor. It also selects the input
        precision: float64 when True, float32 otherwise.
    """
    # For now, FAISS based nearest_neighbors only supports single precision
    if should_downcast:
        dtype = np.float64
    else:
        dtype = np.float32

    digits = datasets.load_digits(n_class=9)
    X = digits["data"].astype(dtype)

    umap = UMAP(n_neighbors=3, n_components=2,
                should_downcast=should_downcast, verbose=True)

    # Pick the input container and the exact output type it should produce.
    if input_type == 'dataframe':
        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
        expected_type = cudf.DataFrame
    else:
        expected_type = np.ndarray

    embeds = umap.fit_transform(X)
    assert type(embeds) == expected_type
def create_umap(self, features):
    """Run UMAP configured from the ``FLAGS.umap_*`` values.

    Returns whatever ``UMAP(...).fit_transform(self.features)`` returns.

    NOTE(review): the ``features`` argument is not used; the data that gets
    embedded is ``self.features``. Confirm whether that is intended.
    """
    # move flags to constructor?
    umap_options = dict(
        n_neighbors=FLAGS.umap_n_neighbors,
        n_components=FLAGS.umap_n_components,
        n_epochs=FLAGS.umap_n_epochs,
        learning_rate=FLAGS.umap_learning_rate,
        init=FLAGS.umap_init,
        min_dist=FLAGS.umap_min_dist,
        spread=FLAGS.umap_spread,
        set_op_mix_ratio=FLAGS.umap_set_op_mix,
        local_connectivity=FLAGS.umap_local_connectivity,
        repulsion_strength=FLAGS.umap_repulsion_strength,
        negative_sample_rate=FLAGS.umap_negative_sample_rate,
        transform_queue_size=FLAGS.umap_transform_queue_size,
        a=None,
        b=None,
        verbose=FLAGS.umap_verbose,
    )
    reducer = UMAP(**umap_options)
    return reducer.fit_transform(self.features)
def test_umap_downcast_fails(input_type):
    """Check that ``fit`` raises for float64 input in two downcast scenarios.

    1. Double-precision input with ``should_downcast=False``.
    2. Double-precision input holding the float32 maximum with
       ``should_downcast=True``.

    Parameters
    ----------
    input_type : str
        ``'dataframe'`` wraps the array in a cuDF DataFrame before fitting;
        any other value passes the NumPy array as is.
    """
    def _as_input(array):
        # Convert to a cuDF DataFrame only for the dataframe variant.
        if input_type == 'dataframe':
            return cudf.DataFrame.from_pandas(pd.DataFrame(array))
        return array

    # Test fit() fails with double precision when should_downcast set to False
    X = np.array([[1.0, 1.0], [50.0, 1.0], [51.0, 1.0]], dtype=np.float64)
    umap = UMAP(should_downcast=False, verbose=True)
    X = _as_input(X)
    with pytest.raises(Exception):
        umap.fit(X, should_downcast=False)

    # Test fit() fails when downcast corrupted data
    X = np.array([[np.finfo(np.float32).max]], dtype=np.float64)
    umap = UMAP(should_downcast=True)
    X = _as_input(X)
    with pytest.raises(Exception):
        umap.fit(X, should_downcast=True)
def test_simplicial_set_embedding(n_rows, n_features, n_neighbors,
                                  n_components):
    """Compare ``cu_simplicial_set_embedding`` against ``ref_simplicial_set_embedding``.

    Both embeddings are computed from the same generated blobs (30 centers,
    seed 42) with identical optimisation settings and compared with
    ``correctness_dense`` (rtol=0.1, threshold=0.95).

    Parameters
    ----------
    n_rows, n_features
        Size of the generated dataset.
    n_neighbors
        Neighbor count used to build both fuzzy simplicial sets.
    n_components
        Dimensionality of the embeddings.
    """
    # Dataset settings.
    n_clusters = 30
    random_state = 42

    # Optimisation settings shared by both implementations.
    metric = 'euclidean'
    metric_kwds = {}
    initial_alpha = 1.0
    a, b = UMAP.find_ab_params(1.0, 0.1)
    gamma = 0
    negative_sample_rate = 5
    n_epochs = 500
    init = 'random'
    output_options = dict(output_metric='euclidean', output_metric_kwds={})

    # densMAP-related arguments, passed only to the ref_ call.
    densmap = False
    densmap_kwds = {}
    output_dens = False

    X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
                      n_features=n_features, random_state=random_state)
    # presumably copies the generated data to host memory - TODO confirm
    X = X.get()

    # Positional arguments that follow (X, graph) in both embedding calls.
    shared_args = (n_components, initial_alpha, a, b, gamma,
                   negative_sample_rate, n_epochs, init)

    ref_fss_graph = ref_fuzzy_simplicial_set(
        X, n_neighbors, random_state, metric)[0]
    ref_result = ref_simplicial_set_embedding(
        X, ref_fss_graph, *shared_args,
        np.random.RandomState(random_state),
        dist.named_distances_with_gradients[metric],
        metric_kwds, densmap, densmap_kwds, output_dens,
        **output_options)
    ref_embedding = ref_result[0]

    cu_fss_graph = cu_fuzzy_simplicial_set(
        X, n_neighbors, random_state, metric)
    cu_embedding = cu_simplicial_set_embedding(
        X, cu_fss_graph, *shared_args,
        random_state, metric, metric_kwds,
        **output_options)

    ref_embedding = cp.array(ref_embedding)
    assert correctness_dense(ref_embedding, cu_embedding,
                             rtol=0.1, threshold=0.95)