def assert_model(pickled_model, X_train):
    """Check a restored model against the values recorded in ``result``.

    The model's stored embedding must equal ``result["umap_embedding"]``,
    and the trustworthiness of re-transforming ``X_train`` may be at most
    0.2 below the score stored in ``result["umap"]``.
    """
    restored_embedding = pickled_model.embedding_
    k = pickled_model.n_neighbors
    assert array_equal(result["umap_embedding"], restored_embedding)

    retransformed = pickled_model.transform(X_train)
    trust_after = trustworthiness(X_train, retransformed, n_neighbors=k)
    # Allow a 0.2 drop relative to the reference score.
    assert trust_after >= result["umap"] - 0.2
def test_initialized_umap_trustworthiness_on_iris(iris):
    """UMAP seeded with an explicit initial layout embeds iris trustworthily.

    The iris columns from index 2 onward are passed as ``init``; the
    resulting embedding must reach a trustworthiness of at least 0.97.
    """
    data = iris.data
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        init=data[:, 2:],
        n_epochs=200,
        random_state=42,
    ).fit_transform(data)
    trust = trustworthiness(iris.data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert_greater_equal(
        trust,
        0.97,
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust),
    )
def test_semisupervised_umap_trustworthiness_on_iris():
    """Semi-supervised cuUMAP fit on iris with part of the labels masked.

    Targets at indices 25 through 74 are overwritten with -1 before
    fitting; the embedding must still score a trustworthiness >= 0.97.
    """
    dataset = datasets.load_iris()
    features = dataset.data
    masked_target = dataset.target.copy()
    masked_target[25:75] = -1

    reducer = cuUMAP(n_neighbors=10, random_state=0, min_dist=0.01)
    embedding = reducer.fit_transform(
        features, masked_target, convert_dtype=True)

    score = trustworthiness(dataset.data, embedding, n_neighbors=10)
    assert score >= 0.97
def test_umap_transform_on_iris(iris, iris_subset_model, iris_selection):
    """Transforming the unselected iris rows yields a trustworthy embedding.

    Uses the ``iris_subset_model`` fixture as-is (presumably already fitted
    on the selected rows -- confirm against the fixture) and requires a
    trustworthiness of at least 0.85 on the rows where ``iris_selection``
    is False.
    """
    fitter = iris_subset_model
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.85, (
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            trust)
    )
def test_supervised_umap_trustworthiness():
    """Supervised UMAP on generated blobs reaches trustworthiness >= 0.95.

    The blob labels are passed as the target to ``fit_transform``.
    """
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.95, (
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust)
    )
def test_densmap_trustworthiness_random_init(nn_data):  # pragma: no cover
    """densMAP with random initialisation reaches trustworthiness >= 0.75.

    Only the first 50 rows of ``nn_data`` are embedded.
    """
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        init="random",
        densmap=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert_greater_equal(
        trust,
        0.75,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust),
    )
def test_umap_trustworthiness_random_init(nn_data):
    """UMAP with random initialisation reaches trustworthiness >= 0.75.

    Only the first 50 rows of ``nn_data`` are embedded.
    """
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        init="random",
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.75, (
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust)
    )
def test_semisupervised_umap_trustworthiness():
    """Semi-supervised UMAP on blobs reaches trustworthiness >= 0.97.

    Labels at indices 10 through 29 are overwritten with -1 before they are
    passed as the target to ``fit_transform``.
    """
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels[10:30] = -1
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert_greater_equal(
        trust,
        0.97,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust),
    )
def test_composite_trustworthiness_on_iris(iris):
    """Composite models built from two iris column subsets stay trustworthy.

    One model is fitted on the columns before index 2 and one on the
    columns from index 2 onward. Both the ``+`` and the ``*`` combination
    must produce an embedding with trustworthiness >= 0.82 against the
    full iris data.
    """
    iris_model1 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
    ).fit(iris.data[:, :2])
    iris_model2 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
    ).fit(iris.data[:, 2:])

    # Messages are single literals so the space before the dataset name
    # is preserved.
    embedding = (iris_model1 + iris_model2).embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust),
    )

    embedding = (iris_model1 * iris_model2).embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust),
    )
def test_umap_trustworthiness_fast_approx(nn_data):
    """UMAP with ``force_approximation_algorithm=True`` stays trustworthy.

    The first 50 rows of ``nn_data`` are embedded and must reach a
    trustworthiness of at least 0.8.
    """
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.8, (
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust)
    )
def test_umap_transform_trustworthiness_with_consistency_enabled():
    """Fit cuUMAP on a random half of iris and transform the other half.

    A boolean mask drawn with a seeded ``RandomState`` (p=0.5 each) splits
    the rows; the transformed rows must score a trustworthiness >= 0.92.
    """
    features = datasets.load_iris().data
    rng = np.random.RandomState(42)
    fit_mask = rng.choice(
        [True, False], features.shape[0], replace=True, p=[0.5, 0.5])
    train_rows = features[fit_mask]
    held_out_rows = features[~fit_mask]

    reducer = cuUMAP(
        n_neighbors=10, min_dist=0.01, init="random", random_state=42)
    reducer.fit(train_rows, convert_dtype=True)
    projected = reducer.transform(held_out_rows, convert_dtype=True)

    score = trustworthiness(held_out_rows, projected, n_neighbors=10)
    assert score >= 0.92
def create_mod():
    """Fit a model on iris and record its reference values in ``result``.

    Stores the fitted embedding under ``result["umap_embedding"]`` and the
    trustworthiness of the ``fit_transform`` output under
    ``result["umap"]``.

    Returns
    -------
    tuple
        The fitted model and the training data.
    """
    X_train = load_iris().data
    estimator_cls = umap_model[keys]
    model = estimator_cls(output_type="numpy")
    embedded = model.fit_transform(X_train)

    result["umap_embedding"] = model.embedding_
    k = model.n_neighbors
    result["umap"] = trustworthiness(X_train, embedded, n_neighbors=k)
    return model, X_train
def test_composite_trustworthiness_random_init(nn_data):  # pragma: no cover
    """Composites of two randomly initialised models stay trustworthy.

    Two models (``n_neighbors`` 10 and 30) are fitted on the first 50 rows
    of ``nn_data``; both their ``*`` and ``+`` combinations must reach a
    trustworthiness of at least 0.82.
    """
    data = nn_data[:50]
    model1 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=50,
        init="random",
    ).fit(data)
    model2 = UMAP(
        n_neighbors=30,
        min_dist=0.01,
        random_state=42,
        n_epochs=50,
        init="random",
    ).fit(data)

    # Messages are single literals so the space before the dataset name
    # is preserved.
    model3 = model1 * model2
    trust = trustworthiness(data, model3.embedding_, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust),
    )

    model4 = model1 + model2
    trust = trustworthiness(data, model4.embedding_, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust),
    )
def test_umap_transform_on_iris_modified_dtype(iris, iris_subset_model, iris_selection):
    """Transform still works after the stored embedding is cast to float64.

    The fixture model's ``embedding_`` is replaced in place by a float64
    copy before the unselected iris rows are transformed; the result must
    score a trustworthiness of at least 0.8.
    """
    model = iris_subset_model
    # NOTE(review): this overwrites an attribute on the fixture object;
    # whether the fixture is shared between tests is not visible here.
    model.embedding_ = model.embedding_.astype(np.float64)

    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)
    score = trustworthiness(held_out, projected, 10)
    assert score >= 0.8, (
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            score)
    )
def test_densmap_trustworthiness_on_iris_supervised(iris):
    """Supervised densMAP on iris reaches trustworthiness >= 0.97.

    The model is fitted with ``iris.target`` as ``y`` and the stored
    ``embedding_`` is scored against the iris data.
    """
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data, y=iris.target)
    embedding = densmap_iris_model.embedding_
    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust)
    )
def test_densmap_trustworthiness(nn_data):
    """densMAP with ``output_dens=True`` reaches trustworthiness >= 0.75.

    ``fit_transform`` is unpacked into three values here; only the first
    (the embedding) is scored. The three-way unpacking also fails the test
    if a different number of values comes back.
    """
    data = nn_data[:50]
    # The two extra outputs are not inspected, hence the underscore names.
    embedding, _rad_h, _rad_l = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        densmap=True,
        output_dens=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.75, (
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust)
    )
def test_sparse_precomputed_metric_umap_trustworthiness():
    """UMAP on a sparse precomputed distance matrix stays trustworthy.

    Pairwise distances of generated blobs are wrapped in a CSR matrix and
    fitted with ``metric="precomputed"``; the embedding is scored against
    the original blob coordinates and must reach at least 0.75.
    """
    # Only the coordinates are needed; the blob labels are discarded.
    data, _ = make_blobs(50, cluster_std=0.5, random_state=42)
    dmat = scipy.sparse.csr_matrix(pairwise_distances(data))
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        metric="precomputed",
    ).fit_transform(dmat)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    # The message names the blobs data actually used and is a single
    # literal so the space before the dataset name is preserved.
    assert trust >= 0.75, (
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust)
    )
def test_umap_transform_on_iris(iris, iris_selection):
    """Fit on the selected iris rows, transform the rest, check trust.

    The rows where ``iris_selection`` is False are transformed by a model
    fitted on the remaining rows; the result must score at least 0.85.
    """
    data = iris.data[iris_selection]
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
    ).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert_greater_equal(
        trust,
        0.85,
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            trust),
    )
def test_string_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with ``target_metric="string"`` stays trustworthy.

    The integer blob labels are mapped onto the strings "this", "that" and
    "other" before being passed as the target; the embedding must reach a
    trustworthiness of at least 0.95.
    """
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    # Index a lookup array with the integer labels to get string targets.
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.95, (
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust)
    )
def test_count_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with ``target_metric="count"`` stays trustworthy.

    The blob labels are remapped to ``labels ** 2 + 2 * labels`` before
    being passed as the target; the embedding must reach a trustworthiness
    of at least 0.95.
    """
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = (labels ** 2) + 2 * labels
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="count",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.95, (
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust)
    )
def test_umap_transform_on_iris_modified_dtype(iris, iris_selection):
    """Transform still works after the fitted embedding is cast to float64.

    A model is fitted on the selected iris rows, its ``embedding_`` is
    replaced by a float64 copy, and the unselected rows are transformed.
    The result must score a trustworthiness of at least 0.8.
    """
    train_rows = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    model = model.fit(train_rows)
    model.embedding_ = model.embedding_.astype(np.float64)

    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)
    score = trustworthiness(held_out, projected, 10)

    failure_message = (
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            score)
    )
    assert_greater_equal(score, 0.8, failure_message)
def test_densmap_trustworthiness_on_iris(iris):
    """densMAP on iris is trustworthy and rejects (inverse) transforms.

    The fitted embedding must reach a trustworthiness of at least 0.97.
    In addition, ``transform`` is expected to raise ``NotImplementedError``
    and ``inverse_transform`` to raise ``ValueError`` on this model.
    """
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data)
    embedding = densmap_iris_model.embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust)
    )

    with pytest.raises(NotImplementedError):
        densmap_iris_model.transform(iris.data[:10])

    with pytest.raises(ValueError):
        densmap_iris_model.inverse_transform(embedding[:10])
def test_composite_trustworthiness(nn_data, iris_model):
    """Composite models are trustworthy; mismatched models cannot combine.

    Two models are fitted on the first 50 rows of ``nn_data`` (the second
    initialised from the first model's embedding). Their ``*`` and ``+``
    combinations must reach a trustworthiness of at least 0.82. Combining
    ``model1`` with ``iris_model`` via ``+``, ``*`` or ``-`` must raise
    ``ValueError``.
    """
    data = nn_data[:50]
    model1 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=50,
    ).fit(data)
    model2 = UMAP(
        n_neighbors=30,
        min_dist=0.01,
        random_state=42,
        n_epochs=50,
        init=model1.embedding_,
    ).fit(data)

    # Messages are single literals so the space before the dataset name
    # is preserved.
    model3 = model1 * model2
    trust = trustworthiness(data, model3.embedding_, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust),
    )

    model4 = model1 + model2
    trust = trustworthiness(data, model4.embedding_, 10)
    assert_greater_equal(
        trust,
        0.82,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(
            trust),
    )

    # Only the raised exception matters; the results are never used.
    with pytest.raises(ValueError):
        _ = model1 + iris_model

    with pytest.raises(ValueError):
        _ = model1 * iris_model

    with pytest.raises(ValueError):
        _ = model1 - iris_model
def test_discrete_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with ``target_metric="ordinal"`` stays trustworthy.

    The blob labels are passed as the target; the embedding must reach a
    trustworthiness of at least 0.95.
    """
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(
            trust),
    )
def assert_model(pickled_model, X):
    """Walk the restored model's attributes, then fit it and record results.

    After the attribute walk, the model is fitted on ``X`` and three
    entries are written to ``result``: the return value of ``fit`` under
    ``"fit_model"``, the input under ``"data"``, and the trustworthiness
    of the fitted embedding under ``"trust"``.
    """
    model_params = pickled_model.__dict__

    # Confirm params in model are identical.
    # NOTE(review): each value is compared with the entry read back from
    # the same dict, so this cannot detect a difference from the original
    # model -- confirm what the intended reference was.
    unchecked = set(model_params)
    for name, value in model_params.items():
        assert model_params[name] == value
        unchecked.discard(name)

    # Every attribute must have been visited.
    assert not unchecked

    # Fit on the supplied data and stash the outcome for later checks.
    result["fit_model"] = pickled_model.fit(X)
    result["data"] = X
    result["trust"] = trustworthiness(
        X, pickled_model.embedding_, n_neighbors=10)
def test_umap_transform_on_iris(target_metric):
    """Fit cuUMAP on a random iris subset and transform the remainder.

    A seeded boolean mask of length 150 (p=0.75 for True) picks the rows
    to fit on. The transformed remainder must contain no NaN values and
    score a trustworthiness of at least 0.85.
    """
    dataset = datasets.load_iris()
    rng = np.random.RandomState(42)
    fit_mask = rng.choice([True, False], 150, replace=True, p=[0.75, 0.25])
    train_rows = dataset.data[fit_mask]

    fitter = cuUMAP(
        n_neighbors=10,
        init="random",
        n_epochs=800,
        min_dist=0.01,
        random_state=42,
        target_metric=target_metric,
    )
    fitter.fit(train_rows, convert_dtype=True)

    held_out = dataset.data[~fit_mask]
    projected = fitter.transform(held_out, convert_dtype=True)
    assert not np.isnan(projected).any()

    score = trustworthiness(held_out, projected, n_neighbors=10)
    assert score >= 0.85
def test_umap_transform_on_iris_w_pynndescent(iris, iris_selection):
    """Transform stays trustworthy with the approximation algorithm forced.

    A model fitted on the selected iris rows with
    ``force_approximation_algorithm=True`` transforms the unselected rows;
    the result must score a trustworthiness of at least 0.85.
    """
    data = iris.data[iris_selection]
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=100,
        random_state=42,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.85, (
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            trust)
    )
def test_contrastive_trustworthiness_on_iris(iris):
    """The ``-`` combination of two iris models stays trustworthy.

    One model is fitted on the columns before index 2 and one on the
    columns from index 2 onward; the embedding of their difference must
    score at least 0.75 against the full iris data.
    """
    iris_model1 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
    ).fit(iris.data[:, :2])
    iris_model2 = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
    ).fit(iris.data[:, 2:])

    embedding = (iris_model1 - iris_model2).embedding_
    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.75, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(
            trust)
    )
def test_precomputed_sparse_transform_on_iris(iris, iris_selection):
    """Fit and transform with sparse precomputed distance matrices.

    The model is fitted on the CSR pairwise-distance matrix of the selected
    iris rows. The unselected rows are transformed via the CSR matrix of
    their distances to the fitted rows, and the result must score a
    trustworthiness of at least 0.85.
    """
    data = iris.data[iris_selection]
    distance_matrix = sparse.csr_matrix(squareform(pdist(data)))

    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        metric='precomputed',
    ).fit(distance_matrix)

    new_data = iris.data[~iris_selection]
    # Distances from each new row to every row the model was fitted on.
    new_distance_matrix = sparse.csr_matrix(cdist(new_data, data))
    embedding = fitter.transform(new_distance_matrix)

    trust = trustworthiness(new_data, embedding, 10)
    # Single literal so the message keeps the space before the dataset name.
    assert trust >= 0.85, (
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            trust)
    )
def test_umap_transform_on_digits_sparse(target_metric, input_type, xform_method):
    """Embed sparse digits data with cuUMAP and check trustworthiness.

    A seeded boolean mask of length 1797 (p=0.75 for True) splits the
    digits rows. Inputs are CSR matrices from ``cupyx.scipy.sparse`` when
    ``input_type`` is ``'cupy'`` and from ``scipy.sparse`` otherwise. With
    ``xform_method == 'fit'`` the model is fitted on the selected rows and
    transforms the rest; otherwise ``fit_transform`` runs directly on the
    unselected rows. The embedding of the unselected rows must score a
    trustworthiness of at least 0.96.
    """
    digits = datasets.load_digits()
    rng = np.random.RandomState(42)
    digits_selection = rng.choice(
        [True, False], 1797, replace=True, p=[0.75, 0.25])

    use_cupy = input_type == 'cupy'
    sp_prefix = cupyx.scipy.sparse if use_cupy else scipy.sparse

    # Go through a SciPy CSR matrix first, then hand it to the chosen
    # sparse backend.
    data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[digits_selection]))

    fitter = cuUMAP(
        n_neighbors=15,
        verbose=logger.level_info,
        init="random",
        n_epochs=0,
        min_dist=0.01,
        random_state=42,
        target_metric=target_metric,
    )

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[~digits_selection]))

    if xform_method != 'fit':
        embedding = fitter.fit_transform(new_data, convert_dtype=True)
    else:
        fitter.fit(data, convert_dtype=True)
        embedding = fitter.transform(new_data, convert_dtype=True)

    if use_cupy:
        embedding = embedding.get()

    score = trustworthiness(
        digits.data[~digits_selection], embedding, n_neighbors=15)
    assert score >= 0.96