Beispiel #1
0
def test_umap_downcast_fails(input_type, nrows, n_feats):
    """Expect cuUMAP.fit to raise in two dtype-conversion scenarios.

    Scenario 1 fits blob data with ``should_downcast=False`` and
    ``convert_dtype=False``.  Scenario 2 fits a 1x1 float64 array holding
    the float32 maximum with conversion enabled.  When ``input_type`` is
    ``'dataframe'`` the data is wrapped in a cudf DataFrame first.
    """
    as_dataframe = input_type == 'dataframe'

    blobs, _ = datasets.make_blobs(n_samples=nrows,
                                   n_features=n_feats,
                                   random_state=0)

    # Scenario 1: fit() must fail on double precision input when
    # should_downcast is False.
    strict_model = cuUMAP(should_downcast=False, verbose=False)

    if as_dataframe:
        blobs = cudf.DataFrame.from_pandas(pd.DataFrame(blobs))

    with pytest.raises(Exception):
        strict_model.fit(blobs, should_downcast=False, convert_dtype=False)

    # Scenario 2: fit() must fail when the downcast corrupts the data.
    float32_max = np.finfo(np.float32).max
    extreme = np.array([[float32_max]], dtype=np.float64)

    downcasting_model = cuUMAP(should_downcast=True)
    if as_dataframe:
        extreme = cudf.DataFrame.from_pandas(pd.DataFrame(extreme))

    with pytest.raises(Exception):
        downcasting_model.fit(extreme, convert_dtype=True)
Beispiel #2
0
def test_umap_fit_transform_against_fit_and_transform():
    """Compare fit_transform output with transform output via joblib hashes.

    With a default model the two embeddings of the same input must hash
    differently.  With ``hash_input=True`` they must hash identically,
    while transforming a different input (``data[1:]``) must differ again.
    """
    data, _ = make_blobs(n_samples=500,
                         n_features=20,
                         centers=10,
                         random_state=42)

    # First test the default option does not hash the input
    default_model = cuUMAP(verbose=False)

    fitted = default_model.fit_transform(data, convert_dtype=True)
    retransformed = default_model.transform(data, convert_dtype=True)

    assert joblib.hash(fitted) != joblib.hash(retransformed)

    # Next, test explicitly enabling feature hashes the input
    hashing_model = cuUMAP(hash_input=True, verbose=False)

    fitted = hashing_model.fit_transform(data, convert_dtype=True)
    retransformed = hashing_model.transform(data, convert_dtype=True)

    assert joblib.hash(fitted) == joblib.hash(retransformed)

    shifted = hashing_model.transform(data[1:], convert_dtype=True)
    assert joblib.hash(fitted) != joblib.hash(shifted)
Beispiel #3
0
 def transform_embed(knn_graph=None):
     """Fit a seeded cuUMAP on ``data`` and return ``transform(data)``.

     ``knn_graph`` is forwarded unchanged to both ``fit`` and ``transform``.
     ``data`` and ``n_neighbors`` come from the enclosing scope.
     """
     estimator = cuUMAP(random_state=42,
                        init='random',
                        n_neighbors=n_neighbors)
     estimator.fit(data, knn_graph=knn_graph, convert_dtype=True)
     embedded = estimator.transform(data,
                                    knn_graph=knn_graph,
                                    convert_dtype=True)
     return embedded
Beispiel #4
0
 def get_embedding(n_components, random_state):
     """Fit cuUMAP on ``fit_data`` and return the embedding of ``transform_data``.

     Both datasets come from the enclosing scope; the model uses random
     initialisation with the given ``n_components`` and ``random_state``.
     """
     umap_model = cuUMAP(verbose=False,
                         init="random",
                         n_components=n_components,
                         random_state=random_state)
     umap_model.fit(fit_data, convert_dtype=True)
     projected = umap_model.transform(transform_data, convert_dtype=True)
     return projected
Beispiel #5
0
def test_umap_trustworthiness_on_iris():
    """Embed iris with cuUMAP and require trustworthiness of at least 0.97."""
    iris = datasets.load_iris()
    features = iris.data
    reducer = cuUMAP(n_neighbors=10, min_dist=0.01)
    embedding = reducer.fit_transform(features, convert_dtype=True)
    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.97
Beispiel #6
0
def test_umap_transform_on_digits(target_metric):
    """Fit cuUMAP on a random ~75% of digits and transform the remainder.

    The held-out embedding must reach a trustworthiness of at least 0.96
    (15 neighbours) against the held-out rows.
    """
    digits = datasets.load_digits()

    # Seeded boolean mask: True rows are used for fitting.
    rng = np.random.RandomState(42)
    digits_selection = rng.choice([True, False],
                                  1797,
                                  replace=True,
                                  p=[0.75, 0.25])
    holdout_mask = ~digits_selection
    train_rows = digits.data[digits_selection]

    fitter = cuUMAP(n_neighbors=15,
                    verbose=logger.level_debug,
                    init="random",
                    n_epochs=0,
                    min_dist=0.01,
                    random_state=42,
                    target_metric=target_metric)
    fitter.fit(train_rows, convert_dtype=True)

    holdout_rows = digits.data[holdout_mask]
    embedding = fitter.transform(holdout_rows, convert_dtype=True)

    score = trustworthiness(digits.data[holdout_mask],
                            embedding,
                            n_neighbors=15)
    assert score >= 0.96
Beispiel #7
0
def test_umap_fit_transform_trustworthiness_with_consistency_enabled():
    """Embed iris with a seeded cuUMAP and require trustworthiness >= 0.97."""
    iris = datasets.load_iris()
    features = iris.data
    seeded_model = cuUMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    embedding = seeded_model.fit_transform(features, convert_dtype=True)
    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.97
Beispiel #8
0
def test_umap_fit_transform_score(nrows, n_feats):
    """Compare cuUMAP and umap-learn embeddings of blob data.

    Both embeddings must be NaN-free.  For fewer than 500000 rows, the
    adjusted Rand scores of a 10-cluster KMeans on each embedding are
    compared with ``array_equal`` at a tolerance of 1e-2.
    """
    data, labels = make_blobs(n_samples=nrows,
                              n_features=n_feats,
                              centers=10,
                              random_state=42)

    # NOTE(review): min_dist differs between the two models (0.1 vs 0.01);
    # kept as-is, confirm whether this is intentional.
    reference_model = umap.UMAP(n_neighbors=10, min_dist=0.1)
    gpu_model = cuUMAP(n_neighbors=10, min_dist=0.01)

    reference_embedding = reference_model.fit_transform(data)
    gpu_embedding = gpu_model.fit_transform(data, convert_dtype=True)

    assert not np.isnan(reference_embedding).any()
    assert not np.isnan(gpu_embedding).any()

    # Clustering comparison is skipped for very large inputs.
    if nrows >= 500000:
        return

    gpu_clusters = KMeans(10).fit_predict(gpu_embedding)
    gpu_score = adjusted_rand_score(labels, gpu_clusters)
    reference_clusters = KMeans(10).fit_predict(reference_embedding)
    reference_score = adjusted_rand_score(labels, reference_clusters)

    assert array_equal(reference_score, gpu_score, 1e-2, with_sign=True)
Beispiel #9
0
def test_umap_fit_transform_trust(name):
    """Compare trustworthiness of cuUMAP and umap-learn on a named dataset.

    ``name`` selects iris, digits (5 classes) or wine; any other value
    falls back to generated blobs.  The two trustworthiness scores are
    compared with ``array_equal`` at a tolerance of 1e-1.
    """
    loaders = {
        'iris': datasets.load_iris,
        'digits': lambda: datasets.load_digits(n_class=5),
        'wine': datasets.load_wine,
    }

    if name in loaders:
        bunch = loaders[name]()
        data, labels = bunch.data, bunch.target
    else:
        data, labels = make_blobs(n_samples=5000,
                                  n_features=10,
                                  centers=10,
                                  random_state=42)

    reference_model = umap.UMAP(n_neighbors=10, min_dist=0.01)
    gpu_model = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    reference_embedding = reference_model.fit_transform(data)
    gpu_embedding = gpu_model.fit_transform(data, convert_dtype=True)

    reference_trust = trustworthiness(data, reference_embedding, 10)
    gpu_trust = trustworthiness(data, gpu_embedding, 10)

    assert array_equal(reference_trust, gpu_trust, 1e-1, with_sign=True)
Beispiel #10
0
def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    """Check that cuUMAP.fit_transform returns the container type it was given.

    A cudf DataFrame input must produce a cudf DataFrame; any other
    ``input_type`` passes the NumPy array directly and must produce a
    NumPy ndarray.  ``name == 'digits'`` uses the 9-class digits dataset
    cast to float64 when ``should_downcast`` is set and float32 otherwise;
    any other ``name`` uses generated blobs.
    """
    dtype = np.float64 if should_downcast else np.float32

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        # NOTE(review): ``dtype`` is not applied to the blob data here;
        # left unchanged, confirm whether that is intended.
        X, _ = datasets.make_blobs(n_samples=nrows,
                                   n_features=n_feats,
                                   random_state=0)

    reducer = cuUMAP(n_neighbors=3, n_components=2, verbose=False)

    if input_type == 'dataframe':
        X_pd = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        embeds = reducer.fit_transform(X_cudf, convert_dtype=True)
        # isinstance is the idiomatic type check and also accepts subclasses.
        assert isinstance(embeds, cudf.DataFrame)
    else:
        embeds = reducer.fit_transform(X)
        assert isinstance(embeds, np.ndarray)
Beispiel #11
0
def test_supervised_umap_trustworthiness_on_iris():
    """Embed iris with its targets as ``y`` and check trustworthiness.

    The score must be at least ``0.97 - TRUST_TOLERANCE_THRESH``.
    """
    iris = datasets.load_iris()
    features = iris.data
    reducer = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    embedding = reducer.fit_transform(features, iris.target)
    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.97 - TRUST_TOLERANCE_THRESH
Beispiel #12
0
def test_supervised_umap_trustworthiness_on_iris():
    """Embed iris with its targets as ``y`` using a seeded cuUMAP.

    The trustworthiness (10 neighbours) must be at least 0.97.
    """
    iris = datasets.load_iris()
    features = iris.data
    reducer = cuUMAP(n_neighbors=10, random_state=0, min_dist=0.01)
    embedding = reducer.fit_transform(features,
                                      iris.target,
                                      convert_dtype=True)
    score = trustworthiness(iris.data, embedding, n_neighbors=10)
    assert score >= 0.97
Beispiel #13
0
def test_blobs_cluster(nrows, n_feats):
    """Embed 5-centre blobs with cuUMAP and check KMeans recovers the labels.

    For fewer than 500000 rows, a 5-cluster KMeans on the embedding must
    reach an adjusted Rand score of exactly 1.0.
    """
    data, labels = datasets.make_blobs(n_samples=nrows,
                                       n_features=n_feats,
                                       centers=5,
                                       random_state=0)
    reducer = cuUMAP(verbose=False)
    embedding = reducer.fit_transform(data, convert_dtype=True)

    # The clustering check is skipped for very large inputs.
    if nrows >= 500000:
        return

    predicted = KMeans(5).fit_predict(embedding)
    score = adjusted_rand_score(labels, predicted)
    assert score == 1.0
Beispiel #14
0
    def compare_exp_decay_params(a=None, b=None, min_dist=0.1, spread=1.0):
        """Assert cuUMAP and umap-learn agree on the ``a``/``b`` parameters.

        Both models are built with the same ``a``, ``b``, ``min_dist`` and
        ``spread``.  The cuML values are read from ``__getstate__()``; the
        umap-learn values are read from ``_a``/``_b`` after fitting on a
        1x1 zero array.  Each pair must agree within 1e-6.
        """
        cuml_model = cuUMAP(a=a, b=b, min_dist=min_dist, spread=spread)
        state = cuml_model.__getstate__()
        cuml_a, cuml_b = state['a'], state['b']

        skl_model = umap.UMAP(a=a, b=b, min_dist=min_dist, spread=spread)
        skl_model.fit(np.zeros((1, 1)))
        sklearn_a, sklearn_b = skl_model._a, skl_model._b

        # Compare the absolute *difference*.  The previous form,
        # abs(x) - abs(y) < tol, passed whenever the cuML value was smaller
        # than the reference by any amount.
        assert abs(cuml_a - sklearn_a) < 1e-6
        assert abs(cuml_b - sklearn_b) < 1e-6
Beispiel #15
0
def test_semisupervised_umap_trustworthiness_on_iris():
    """Embed iris with targets where rows 25..74 are set to -1.

    The trustworthiness must be at least ``0.97 - TRUST_TOLERANCE_THRESH``.
    """
    iris = datasets.load_iris()
    features = iris.data

    # Work on a copy so the loaded dataset's targets are not modified.
    partial_target = iris.target.copy()
    partial_target[25:75] = -1

    reducer = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    embedding = reducer.fit_transform(features,
                                      partial_target,
                                      convert_dtype=True)

    score = trustworthiness(iris.data, embedding, 10)
    assert score >= 0.97 - TRUST_TOLERANCE_THRESH
Beispiel #16
0
def test_umap_trustworthiness_on_iris():
    """Embed iris with cuUMAP and check trustworthiness against 0.964.

    The threshold is further relaxed by ``TRUST_TOLERANCE_THRESH``.
    """
    iris = datasets.load_iris()
    features = iris.data
    reducer = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    embedding = reducer.fit_transform(features, convert_dtype=True)
    score = trustworthiness(iris.data, embedding, 10)

    # We are doing a spectral embedding but not a
    # multi-component layout (which is marked experimental).
    # As a result, our score drops by 0.006.
    minimum_score = 0.964 - TRUST_TOLERANCE_THRESH
    assert score >= minimum_score
Beispiel #17
0
def test_umap_transform_trustworthiness_with_consistency_enabled():
    """Fit a seeded cuUMAP on a random half of iris, transform the rest.

    The held-out embedding must reach a trustworthiness of at least 0.92.
    """
    iris = datasets.load_iris()
    features = iris.data

    # Seeded 50/50 boolean mask: True rows fit, False rows are transformed.
    rng = np.random.RandomState(42)
    selection = rng.choice([True, False],
                           features.shape[0],
                           replace=True,
                           p=[0.5, 0.5])
    fit_rows = features[selection]
    holdout_rows = features[~selection]

    reducer = cuUMAP(n_neighbors=10,
                     min_dist=0.01,
                     random_state=42,
                     verbose=False)
    reducer.fit(fit_rows, convert_dtype=True)
    embedding = reducer.transform(holdout_rows, convert_dtype=True)

    score = trustworthiness(holdout_rows, embedding, 10)
    assert score >= 0.92
Beispiel #18
0
def test_umap_transform_on_iris():
    """Fit cuUMAP on a random ~75% of iris and transform the remainder.

    The held-out embedding must reach a trustworthiness of at least 0.89.
    """
    iris = datasets.load_iris()

    # Seeded boolean mask over the 150 rows: True rows are used for fitting.
    rng = np.random.RandomState(42)
    iris_selection = rng.choice([True, False],
                                150,
                                replace=True,
                                p=[0.75, 0.25])
    train_rows = iris.data[iris_selection]

    fitter = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    fitter.fit(train_rows, convert_dtype=True)

    holdout_rows = iris.data[~iris_selection]
    embedding = fitter.transform(holdout_rows, convert_dtype=True)

    score = trustworthiness(holdout_rows, embedding, 10)
    assert score >= 0.89
Beispiel #19
0
def test_supervised_umap_trustworthiness_against_umap_learn():
    """Compare supervised cuUMAP against supervised umap-learn on iris.

    The cuML trustworthiness must lie within +/- 0.009 of the umap-learn
    trustworthiness.
    """
    iris = datasets.load_iris()
    features = iris.data

    gpu_model = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    gpu_embedding = gpu_model.fit_transform(features,
                                            iris.target,
                                            convert_dtype=True)

    reference_model = umap.UMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    reference_embedding = reference_model.fit_transform(features, iris.target)

    gpu_trust = trustworthiness(iris.data, gpu_embedding, 10)
    reference_trust = trustworthiness(iris.data, reference_embedding, 10)

    lower_bound = reference_trust - 0.009
    upper_bound = reference_trust + 0.009
    assert lower_bound <= gpu_trust <= upper_bound
Beispiel #20
0
def test_umap_transform_on_iris(target_metric):
    """Fit a seeded cuUMAP on a random ~75% of iris, transform the rest.

    The held-out embedding must contain no NaNs and reach a
    trustworthiness of at least 0.85 (10 neighbours).
    """
    iris = datasets.load_iris()

    # Seeded boolean mask over the 150 rows: True rows are used for fitting.
    rng = np.random.RandomState(42)
    iris_selection = rng.choice([True, False],
                                150,
                                replace=True,
                                p=[0.75, 0.25])
    train_rows = iris.data[iris_selection]

    fitter = cuUMAP(n_neighbors=10,
                    init="random",
                    n_epochs=800,
                    min_dist=0.01,
                    random_state=42,
                    target_metric=target_metric)
    fitter.fit(train_rows, convert_dtype=True)

    holdout_rows = iris.data[~iris_selection]
    embedding = fitter.transform(holdout_rows, convert_dtype=True)

    assert not np.isnan(embedding).any()

    score = trustworthiness(holdout_rows, embedding, n_neighbors=10)
    assert score >= 0.85
Beispiel #21
0
def test_umap_transform_on_digits_sparse(target_metric, input_type,
                                         xform_method):
    """Run cuUMAP on CSR-encoded digits and check held-out trustworthiness.

    ``input_type == 'cupy'`` builds cupyx sparse matrices, otherwise scipy
    ones.  With ``xform_method == 'fit'`` the model is fitted on the
    selected rows and the held-out rows are transformed; otherwise
    ``fit_transform`` is called on the held-out rows directly.  The
    trustworthiness (15 neighbours) must be at least 0.96.
    """
    digits = datasets.load_digits()

    # Seeded boolean mask: True rows are the fitting split.
    rng = np.random.RandomState(42)
    digits_selection = rng.choice([True, False],
                                  1797,
                                  replace=True,
                                  p=[0.75, 0.25])
    holdout_mask = ~digits_selection

    use_cupy = input_type == 'cupy'
    sp_prefix = cupyx.scipy.sparse if use_cupy else scipy.sparse

    train_csr = scipy.sparse.csr_matrix(digits.data[digits_selection])
    data = sp_prefix.csr_matrix(train_csr)

    fitter = cuUMAP(n_neighbors=15,
                    verbose=logger.level_info,
                    init="random",
                    n_epochs=0,
                    min_dist=0.01,
                    random_state=42,
                    target_metric=target_metric)

    holdout_csr = scipy.sparse.csr_matrix(digits.data[holdout_mask])
    new_data = sp_prefix.csr_matrix(holdout_csr)

    if xform_method == 'fit':
        fitter.fit(data, convert_dtype=True)
        embedding = fitter.transform(new_data, convert_dtype=True)
    else:
        embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if use_cupy:
        # Convert the result via .get() before scoring.
        embedding = embedding.get()

    score = trustworthiness(digits.data[holdout_mask],
                            embedding,
                            n_neighbors=15)
    assert score >= 0.96
Beispiel #22
0
def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    """Check that cuUMAP.fit_transform on a NumPy array returns an ndarray.

    ``name == 'digits'`` uses the 9-class digits dataset cast to float64
    when ``should_downcast`` is set and float32 otherwise; any other
    ``name`` uses generated blobs.  ``input_type`` is accepted but not
    used by this variant.
    """
    dtype = np.float64 if should_downcast else np.float32

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        # NOTE(review): ``dtype`` is not applied to the blob data here;
        # left unchanged, confirm whether that is intended.
        X, _ = datasets.make_blobs(n_samples=nrows,
                                   n_features=n_feats,
                                   random_state=0)

    reducer = cuUMAP(n_neighbors=3, n_components=2, verbose=False)

    embeds = reducer.fit_transform(X)
    # isinstance is the idiomatic type check and also accepts subclasses.
    assert isinstance(embeds, np.ndarray)
Beispiel #23
0
def test_umap_fit_transform_score_default():
    """Compare default cuUMAP and default umap-learn on 500x20 blob data.

    The adjusted Rand scores of a 10-cluster KMeans on each embedding are
    compared with ``array_equal`` at a tolerance of 1e-2.
    """
    data, labels = make_blobs(n_samples=500,
                              n_features=20,
                              centers=10,
                              random_state=42)

    reference_model = umap.UMAP()
    gpu_model = cuUMAP(verbose=False)

    reference_embedding = reference_model.fit_transform(data)
    gpu_embedding = gpu_model.fit_transform(data, convert_dtype=True)

    gpu_clusters = KMeans(10).fit_predict(gpu_embedding)
    gpu_score = adjusted_rand_score(labels, gpu_clusters)
    reference_clusters = KMeans(10).fit_predict(reference_embedding)
    reference_score = adjusted_rand_score(labels, reference_clusters)

    assert array_equal(reference_score, gpu_score, 1e-2, with_sign=True)
Beispiel #24
0
def test_fuzzy_simplicial_set(n_rows,
                              n_features,
                              n_neighbors):
    """Compare the ``graph_`` attribute of cuUMAP and umap-learn.

    Both models are fitted on the same 30-centre blob data.  The two
    graphs are densified and compared with ``correctness_sparse``.
    """
    blob_kwargs = dict(n_samples=n_rows,
                       centers=30,
                       n_features=n_features,
                       random_state=42)
    X, _ = make_blobs(**blob_kwargs)

    gpu_model = cuUMAP(n_neighbors=n_neighbors)
    gpu_model.fit(X)
    gpu_graph = gpu_model.graph_

    reference_model = umap.UMAP(n_neighbors=n_neighbors)
    reference_model.fit(X)
    reference_graph = reference_model.graph_

    gpu_dense = gpu_graph.todense()
    # The reference graph is wrapped in a cupy COO matrix before densifying.
    reference_dense = cp.sparse.coo_matrix(reference_graph).todense()

    assert correctness_sparse(reference_dense,
                              gpu_dense,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
Beispiel #25
0
 def get_embedding(n_components, random_state):
     """Return the cuUMAP ``fit_transform`` embedding of ``data``.

     ``data`` comes from the enclosing scope; the model uses random
     initialisation with the given ``n_components`` and ``random_state``.
     """
     umap_model = cuUMAP(init="random",
                         n_components=n_components,
                         random_state=random_state)
     embedded = umap_model.fit_transform(data, convert_dtype=True)
     return embedded
Beispiel #26
0
def encoder_latent_umaps(workdir, outdir, epochs, n_particles_total, subset, random_seed, use_umap_gpu, random_state, n_epochs_umap, LOG):
    '''
    Calculates UMAP embeddings of subset of particles' selected epochs' latent encodings

    Inputs
        workdir: path to directory containing cryodrgn training results
        outdir: path to base directory to save outputs
        epochs: array of epochs for which to calculate UMAPs
        n_particles_total: int of total number of particles trained
        subset: int, size of subset on which to calculate umap; None (or the string 'None') means all
        random_seed: int, seed for random selection of subset particles; if None a seed is drawn at random and logged
        use_umap_gpu: bool, whether to use the cuML library to GPU accelerate UMAP calculations (if available in env)
        random_state: int, random state seed used by UMAP for reproducibility at slight cost of performance (None means faster but non-reproducible)
        n_epochs_umap: passed as n_epochs to the cuML UMAP; not used by the umap-learn (CPU) path
        LOG: log destination passed through to flog

    Outputs
        pkl of the selected particle indices stored in outdir/ind_subset.pkl
        pkl of each UMAP embedding stored in outdir/umaps/umap.epoch.pkl
        png of all UMAPs stored in outdir/plots/01_encoder_umaps.png

    # apparently running multiple UMAP embeddings (i.e. for each epoch's z.pkl) in parallel on CPU requires difficult backend setup
    # see https://github.com/lmcinnes/umap/issues/707
    # therefore not implemented currently
    '''

    # Accept both a real None and the string 'None' as "use everything".
    if subset is None or subset == 'None':
        n_particles_subset = n_particles_total
        flog('Using full particle stack for UMAP', LOG)
    else:
        if random_seed is None:
            random_seed = random.randint(0, 100000)
        random.seed(random_seed)
        n_particles_subset = min(n_particles_total, int(subset))
        flog(f'Randomly selecting {n_particles_subset} particle subset on which to run UMAP (with random seed {random_seed})', LOG)
    ind_subset = sorted(random.sample(range(0, n_particles_total), k=n_particles_subset))
    utils.save_pkl(ind_subset, outdir + '/ind_subset.pkl')

    for epoch in epochs:
        flog(f'Now calculating UMAP for epoch {epoch} with random_state {random_state}', LOG)
        z = utils.load_pkl(workdir + f'/z.{epoch}.pkl')[ind_subset, :]
        if use_umap_gpu: #using cuML library GPU-accelerated UMAP
            reducer = cuUMAP(random_state=random_state, n_epochs=n_epochs_umap)
        else: #using umap-learn library CPU-bound UMAP
            # NOTE(review): n_epochs_umap is not forwarded on the CPU path; kept as-is, confirm intent.
            reducer = umap.UMAP(random_state=random_state)
        umap_embedding = reducer.fit_transform(z)
        utils.save_pkl(umap_embedding, f'{outdir}/umaps/umap.{epoch}.pkl')

    n_cols = int(np.ceil(len(epochs) ** 0.5))
    n_rows = int(np.ceil(len(epochs) / n_cols))

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 or 1xN grid, so
    # the .flat iteration and the row/column slicing below always work.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2 * n_cols, 2 * n_rows), sharex='all', sharey='all', squeeze=False)
    fig.tight_layout()

    # Last successfully drawn hexbin; stays None if no embedding was plotted.
    toplot = None
    # zip stops at the shorter sequence, so surplus axes in the grid stay empty.
    for ax, epoch in zip(axes.flat, epochs):
        umap_path = f'{outdir}/umaps/umap.{epoch}.pkl'
        try:
            umap_embedding = utils.load_pkl(umap_path)
        except FileNotFoundError:
            flog(f'Could not find file {umap_path}', LOG)
            continue
        toplot = ax.hexbin(umap_embedding[:, 0], umap_embedding[:, 1], bins='log', mincnt=1)
        ax.set_title(f'epoch {epoch}')

    # Label only the outer edge: y-labels on the first column, x-labels on the last row.
    for a in axes[:, 0]: a.set_ylabel('UMAP2')
    for a in axes[-1, :]: a.set_xlabel('UMAP1')

    fig.subplots_adjust(right=0.96)
    if toplot is not None:
        cbar_ax = fig.add_axes([0.98, 0.15, 0.02, 0.7])
        cbar = fig.colorbar(toplot, cax=cbar_ax)
        cbar.ax.set_ylabel('particle density', rotation=90)
    else:
        flog('No UMAP embeddings could be loaded; skipping colorbar', LOG)

    plt.subplots_adjust(wspace=0.1)
    plt.subplots_adjust(hspace=0.3)
    plt.savefig(f'{outdir}/plots/01_encoder_umaps.png', dpi=300, format='png', transparent=True, bbox_inches='tight')
    flog(f'Saved UMAP distribution plot to {outdir}/plots/01_encoder_umaps.png', LOG)
Beispiel #27
0
 def fit_transform_embed(knn_graph=None):
     """Return the seeded cuUMAP ``fit_transform`` embedding of ``data``.

     ``knn_graph`` is forwarded unchanged to ``fit_transform``.  ``data``
     and ``n_neighbors`` come from the enclosing scope.
     """
     estimator = cuUMAP(verbose=False,
                        random_state=42,
                        n_neighbors=n_neighbors)
     embedded = estimator.fit_transform(data,
                                        knn_graph=knn_graph,
                                        convert_dtype=True)
     return embedded