Python UMAP Examples

Programming Language: Python

Namespace/Package Name: umap

Class/Type: UMAP

Examples at hotexamples.com: 33

UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique used in machine learning and data analysis. It can help in visualizing high-dimensional data and identifying patterns in the data.

Here are some examples of how to use the UMAP package in Python:

Example 1: Reducing the dimensionality of the Iris dataset and visualizing the results

from sklearn.datasets import load_iris
import umap
import matplotlib.pyplot as plt

# Load the Iris samples together with their class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Embed the samples with a UMAP estimator built with default arguments.
reducer = umap.UMAP()
embedding = reducer.fit_transform(X)

# Scatter the first two embedding columns, coloured by class label.
first_dim, second_dim = embedding[:, 0], embedding[:, 1]
plt.scatter(first_dim, second_dim, c=y)
plt.show()

In this example, we load the Iris dataset and use UMAP to reduce the dimensionality of the data to two dimensions. We then plot the results using matplotlib and color the points based on their class labels.

Package libraries: umap-learn, scikit-learn, matplotlib

Example 2: Creating a UMAP transformer for use in a scikit-learn pipeline

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import umap

# Split the digits data into train and test parts (default split settings).
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

# Standardise the features, embed them into 30 UMAP components, then classify
# the embedded points with a 3-nearest-neighbour model.
knn = KNeighborsClassifier(n_neighbors=3)
preprocessor = make_pipeline(StandardScaler(), umap.UMAP(n_components=30))
model = make_pipeline(preprocessor, knn)
model.fit(X_train, y_train)

# Report the score of the fitted pipeline on the held-out part.
score = model.score(X_test, y_test)
print("Accuracy:", score)

In this example, we create a scikit-learn pipeline that standardizes the data and then uses a UMAP transformer to reduce its dimensionality before passing it to a K-Nearest Neighbors classifier. We then train the model and print the accuracy on the test set.

Package libraries: umap-learn, scikit-learn

Python UMAP - 33 examples found. These are the top-rated real-world Python examples of umap.UMAP extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

UMAP(30)

fit(30)

fit_transform(30)

transform(30)

inverse_transform(4)

astype(2)

embedding_(2)

compile(1)

get_params(1)

max(1)

min(1)

set_params(1)

to_csv(1)

to_json(1)

tolist(1)

update(1)

Example #1

Show file

File: transform.py Project: alexeyche/alexeyche-junk

 def __init__(
     self,
     n_neighbors=15,
     min_dist=0.1,
     metric="euclidean",
     n_components=2,
     spread=1.0,
     random_state=None
 ):
     """Build the wrapped UMAP estimator and store it on ``self._inst``.

     Every argument is forwarded unchanged to ``UMAP``. ``random_state``
     used to be accepted but silently dropped; it is now forwarded as
     well, so passing a seed actually reaches the estimator.
     """
     self._inst = UMAP(
         n_neighbors=n_neighbors,
         min_dist=min_dist,
         metric=metric,
         n_components=n_components,
         spread=spread,
         random_state=random_state,
     )

Example #2

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_negative_n_neighbours(nn_data):
    """Fitting with ``n_neighbors=-1`` is expected to raise ``ValueError``."""
    estimator = UMAP(n_neighbors=-1)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #3

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_non_integer_n_components(nn_data):
    """Fitting with ``n_components=1.5`` is expected to raise ``ValueError``."""
    estimator = UMAP(n_components=1.5)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #4

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_negative_min_dist(nn_data):
    """Fitting with ``min_dist=-1`` is expected to raise ``ValueError``."""
    estimator = UMAP(min_dist=-1)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #5

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_bad_hellinger_data(nn_data):
    """Fitting the hellinger metric on negated data is expected to raise ``ValueError``."""
    estimator = UMAP(metric="hellinger")
    negated = -nn_data
    assert_raises(ValueError, estimator.fit, negated)

Example #6

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_negative_op(nn_data):
    """Fitting with ``set_op_mix_ratio=-1.0`` is expected to raise ``ValueError``."""
    estimator = UMAP(set_op_mix_ratio=-1.0)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #7

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_unique_and_precomputed(nn_data):
    """Combining ``metric="precomputed"`` with ``unique=True`` is expected to raise ``ValueError`` on fit."""
    estimator = UMAP(metric="precomputed", unique=True)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #8

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_densmap_var_shift(nn_data):
    """With densmap enabled, ``dens_var_shift=-1.0`` is expected to raise ``ValueError`` on fit."""
    estimator = UMAP(densmap=True, dens_var_shift=-1.0)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #9

Show file

def test_bad_transform_data(nn_data):
    """``transform`` on a model fitted to a single row is expected to raise ``ValueError``.

    The ``nn_data`` fixture is requested but not used by the test body.
    """
    single_row = [[1, 1, 1, 1]]
    fitted = UMAP().fit(single_row)
    with pytest.raises(ValueError):
        fitted.transform([[0, 0, 0, 0]])

Example #10

Show file

File: graph_streamlit.py Project: martinoni/streamlit_medical

    'small', 'by', 'this', 'have', 'in', 'obviously', 'ten', 'those', 'vessel',
    'good', 'up', 'will', 'combination', 'rather', 'should', 'if', 'so',
    'plan', 'interesting', 'chat', 'let', 'now', 'imply', 'the', 'image',
    'information', 'get', 'particular', 'test', 'show', 'about', 'strong',
    'seventy', 'would', 'two', 'eighty', 'grey', 'at', 'last', 'always',
    'blood', 'on', 'first', 'light', 'can', 'point', 'family', 'take',
    'between', 'must', 'than', 'dr', 'honest', 'which', 'do', 'seem', 'an',
    'all', 'black', '10', ' ', '  ', '   ', 'johnson', 'gosh', 'when', 'far',
    'mean', 'with', 'absolutely', 'for', 'make', 'as', 'somewhere', 'screen',
    'true', '20', 'correct', 'into', 'specifically', '90', 'dark', 'start',
    'bottom', 'then', 'd', '100', 'out', 'line', 'where', 'pass', 'ct', 'i',
    'round', 'open', 'mrs', 'clog'
]
# One vector per word, taken from the `.vector` attribute of `nlp(word)`.
vectors = array([nlp(word).vector for word in words])

# Sidebar control: how many of the leading words to display (default 30).
st.sidebar.subheader('Embedding Visualization')
n_words = st.sidebar.slider("Number of words", 1, len(words), 30)

# Keep only the first n_words words and their vectors.
words = array(words[:n_words])
vectors = vectors[:n_words, :]

# Standardise the vectors, then embed them into three UMAP components.
scaled_data = StandardScaler().fit_transform(vectors)
reducer = UMAP(n_components=3)
embedding = reducer.fit_transform(scaled_data)

# 3-D scatter of the embedding, with each point labelled by its word.
fig = scatter_3d(
    x=embedding[:, 0],
    y=embedding[:, 1],
    z=embedding[:, 2],
    text=words,
    hover_name=words,
)
st.plotly_chart(fig)

Example #11

Show file

    distmat = nan_to_num(distmat)

# Clamp negative distances to zero, then rescale by the maximum.
distmat[distmat < 0] = 0.0
# NOTE(review): `max` is presumably numpy's max imported elsewhere in the
# file; the builtin would not reduce a 2-D array to a scalar -- confirm.
distmat = distmat / max(distmat)
print(distmat.shape)

from umap import UMAP
from py.utils.safe_pickle import pickle_dump
import os

dirname = "../../../exact_embeddings/" + distname + "_" + dataset
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(dirname, exist_ok=True)

n_neighbors = 40
for n_components in [50, 100, 300, 1000]:
    for min_dist in [1.0, 1.5, 2.0]:
        for spread in [1.0, 2.5]:
            # Combinations with min_dist larger than spread are skipped.
            if min_dist > spread:
                continue

            print(n_components, n_neighbors, min_dist, spread)
            # metric="precomputed": distmat is passed as the distance matrix.
            t = UMAP(
                n_components=n_components,
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                metric="precomputed",
                random_state=42,
                n_epochs=1000,
                spread=spread,
            )
            embeddings = t.fit_transform(distmat)
            print(embeddings.shape)
            # One pickle per parameter combination, named after the parameters.
            pickle_dump(
                embeddings,
                "{}/{}-{}-{}.p".format(dirname, n_components, min_dist, spread))

Example #12

Show file

File: predict_davis2011kinase.py Project: tjustorm/uncertainty

def latent_scatter(var_unk_pred, y_unk_pred, acquisition, **kwargs):
    """Save latent-space scatter plots of observed vs. unknown compounds.

    The feature vectors of the observed and the unknown compounds are
    stacked and projected three ways: PCA (``fbpca``, ``k=3``, first two
    columns plotted), UMAP and t-SNE. For every projection three PNG files
    are written below ``figures/``:

    * ``..._ypred_...``: unknown compounds in blue, observed ones as orange
      crosses (NOTE(review): despite the file name, ``y_unk_pred`` is not
      used for this plot);
    * ``..._var_...``: unknown compounds coloured by the rank of
      ``var_unk_pred``;
    * ``..._acq_...``: unknown compounds coloured by ``-acquisition``.

    Before plotting, the Spearman and Pearson correlations between each
    unknown compound's distance to its nearest observed compound and
    ``var_unk_pred`` are printed.

    Args:
        var_unk_pred: Values for the unknown compounds; presumably one per
            entry of the sorted unique unknown compound indices -- TODO
            confirm against the caller.
        y_unk_pred: Not used by this function.
        acquisition: Array-like that supports unary minus; colours the
            "acq" plot.
        **kwargs: Must contain ``chems``, ``chem2feature``, ``idx_obs``,
            ``idx_unk``, ``regress_type`` and ``prot_target``.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    import os

    chems = kwargs['chems']
    chem2feature = kwargs['chem2feature']
    idx_obs = kwargs['idx_obs']
    idx_unk = kwargs['idx_unk']
    regress_type = kwargs['regress_type']
    prot_target = kwargs['prot_target']

    # idx_* hold pairs; only the first element (the compound index) is used.
    chem_idx_obs = sorted({i for i, _ in idx_obs})
    chem_idx_unk = sorted({i for i, _ in idx_unk})

    feature_obs = np.array([chem2feature[chems[i]] for i in chem_idx_obs])
    feature_unk = np.array([chem2feature[chems[i]] for i in chem_idx_unk])

    # Distance from every unknown compound to its nearest observed compound.
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=1).fit(feature_obs)
    dist = np.ravel(nbrs.kneighbors(feature_unk)[0])
    # Spearman's coefficient is conventionally called rho and Pearson's r;
    # the two labels were swapped in the original messages.
    print('Distance Spearman rho = {}, P = {}'.format(
        *ss.spearmanr(dist, var_unk_pred)))
    print('Distance Pearson r = {}, P = {}'.format(
        *ss.pearsonr(dist, var_unk_pred)))

    # Observed rows first, unknown rows second; labels mark which is which.
    X = np.vstack([feature_obs, feature_unk])
    labels = np.concatenate(
        [np.zeros(len(chem_idx_obs)),
         np.ones(len(chem_idx_unk))])
    is_obs = labels == 0
    is_unk = labels == 1

    from fbpca import pca
    U, s, Vt = pca(
        X,
        k=3,
    )
    X_pca = U * s

    from umap import UMAP
    um = UMAP(
        n_neighbors=15,
        min_dist=0.5,
        n_components=2,
        metric='euclidean',
    )
    X_umap = um.fit_transform(X)

    from MulticoreTSNE import MulticoreTSNE as TSNE
    tsne = TSNE(
        n_components=2,
        n_jobs=20,
    )
    X_tsne = tsne.fit_transform(X)

    suffix = '' if prot_target is None else '_' + prot_target

    # savefig does not create directories, so make sure the target exists.
    os.makedirs('figures', exist_ok=True)

    # Colour overlays for the unknown compounds: (file tag, values, colormap).
    overlays = [
        ('var', ss.rankdata(var_unk_pred), 'coolwarm'),
        ('acq', -acquisition, 'hot'),
    ]

    for name, coords in zip(
        ['pca', 'umap', 'tsne'],
        [X_pca, X_umap, X_tsne],
    ):
        # Unknown compounds as faint blue dots, observed ones as crosses.
        plt.figure()
        sns.scatterplot(
            x=coords[is_unk, 0],
            y=coords[is_unk, 1],
            color='blue',
            alpha=0.1,
        )
        plt.scatter(
            x=coords[is_obs, 0],
            y=coords[is_obs, 1],
            color='orange',
            alpha=1.0,
            marker='x',
            linewidths=10,
        )
        plt.savefig('figures/latent_scatter_{}_ypred_{}{}.png'.format(
            name, regress_type, suffix),
                    dpi=300)
        plt.close()

        # Unknown compounds only, coloured by each overlay in turn.
        for tag, values, cmap in overlays:
            plt.figure()
            plt.scatter(x=coords[is_unk, 0],
                        y=coords[is_unk, 1],
                        c=values,
                        alpha=0.1,
                        cmap=cmap)
            plt.savefig('figures/latent_scatter_{}_{}_{}{}.png'.format(
                name, tag, regress_type, suffix),
                        dpi=300)
            plt.close()

Example #13

Show file

def visualize_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize. The order of the
                selection does not matter; topics are always shown sorted.
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The figure produced by `_plotly_topic_visualization`.

    Usage:

    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    """
    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        # The first entry of the frequency table is skipped here.
        topics = sorted(
            topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [
        " | ".join([word[0] for word in topic_model.get_topic(topic)[:5]])
        for topic in topic_list
    ]

    # Embed c-TF-IDF into 2D
    all_topics = sorted(list(topic_model.get_topics().keys()))
    # Rows must follow topic_list (sorted), not the caller's order: the
    # words, sizes and topic ids below are all in topic_list order, so an
    # unsorted `topics` argument used to pair coordinates with wrong topics.
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2,
                      metric='hellinger').fit_transform(embeddings)

    # Visualize with plotly
    # NOTE(review): the first sorted topic is always dropped from the frame
    # (presumably the outlier topic -1); when the selection does not contain
    # it, a regular topic is dropped instead -- confirm intended behaviour.
    df = pd.DataFrame({
        "x": embeddings[1:, 0],
        "y": embeddings[1:, 1],
        "Topic": topic_list[1:],
        "Words": words[1:],
        "Size": frequencies[1:]
    })
    return _plotly_topic_visualization(df, topic_list, width, height)

Example #14

Show file

File: scale_plot.py Project: wugene/EPIPREX

def plot_embedding(X,
                   labels,
                   classes=None,
                   method='tSNE',
                   cmap='tab20',
                   figsize=(4, 4),
                   markersize=4,
                   marker=None,
                   return_emb=False,
                   save=False,
                   save_emb=False,
                   show_legend=True,
                   show_axis_label=True,
                   **legend_params):
    """Scatter-plot a 2-D embedding of ``X`` coloured by ``labels``.

    If ``X`` does not already have two columns it is reduced to 2-D with the
    reducer selected by ``method``. An unrecognised ``method`` leaves ``X``
    untouched, in which case its first two columns are plotted.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        labels: One label per row of ``X``; any sequence is accepted and is
            converted to an array so that ``labels == c`` is element-wise.
        classes: Labels to draw, in legend order; defaults to the unique
            values of ``labels``.
        method: ``'tSNE'``, ``'UMAP'`` or ``'PCA'``; also used in axis labels.
        cmap: Seaborn palette name. Pass ``None`` to pick ``'tab10'``,
            ``'tab20'`` or ``'husl'`` from the number of classes.
        figsize: Figure size passed to ``plt.figure``.
        markersize: Marker size of the sample points.
        marker: Optional extra rows that are appended to ``X`` before the
            reduction and drawn as black stars at 10x the marker size.
        return_emb: If true, return the (possibly reduced) array.
        save: Path to write the figure to as PDF; if falsy the figure is
            shown instead.
        save_emb: Path to write the array to with ``np.savetxt``; skipped
            if falsy.
        show_legend: Whether to draw the legend.
        show_axis_label: Whether to label the axes ``"<method> dim 1/2"``.
        **legend_params: Overrides for the default legend keyword arguments.

    Returns:
        The plotted array (including marker rows) when ``return_emb`` is
        true, otherwise ``None``.
    """
    # A plain list would make `labels == c` a single bool and silently plot
    # nothing, so normalise to an array up front.
    labels = np.asarray(labels)

    if marker is not None:
        X = np.concatenate([X, marker], axis=0)
    # Rows [0, N) are samples; rows [N, ...) are the appended markers.
    N = len(labels)
    if X.shape[1] != 2:
        if method == 'tSNE':
            from MulticoreTSNE import MulticoreTSNE as TSNE
            X = TSNE(n_components=2, random_state=124,
                     n_jobs=32).fit_transform(X)
        elif method == 'UMAP':
            from umap import UMAP
            X = UMAP(n_neighbors=30, min_dist=0.3,
                     metric='correlation').fit_transform(X)
        elif method == 'PCA':
            from sklearn.decomposition import PCA
            X = PCA(n_components=2, random_state=124).fit_transform(X)

    plt.figure(figsize=figsize)
    if classes is None:
        classes = np.unique(labels)

    # Only choose a palette automatically when the caller passed cmap=None.
    if cmap is None:
        if len(classes) <= 10:
            cmap = 'tab10'
        elif len(classes) <= 20:
            cmap = 'tab20'
        else:
            cmap = 'husl'
    colors = sns.color_palette(cmap, n_colors=len(classes))

    samples = X[:N]
    for i, c in enumerate(classes):
        in_class = labels == c
        plt.scatter(samples[in_class, 0],
                    samples[in_class, 1],
                    s=markersize,
                    color=colors[i],
                    label=c)
    if marker is not None:
        plt.scatter(X[N:, 0],
                    X[N:, 1],
                    s=10 * markersize,
                    color='black',
                    marker='*')

    legend_params_ = {
        'loc': 'center left',
        'bbox_to_anchor': (1.0, 0.45),
        'fontsize': 10,
        'ncol': 1,
        'frameon': False,
        'markerscale': 1.5
    }
    legend_params_.update(**legend_params)
    if show_legend:
        plt.legend(**legend_params_)
    sns.despine(offset=10, trim=True)
    if show_axis_label:
        plt.xlabel(method + ' dim 1', fontsize=12)
        plt.ylabel(method + ' dim 2', fontsize=12)

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()

    if save_emb:
        np.savetxt(save_emb, X)
    if return_emb:
        return X

Example #15

Show file

File: transform.py Project: alexeyche/alexeyche-junk

class TUmap(Transform):
    """
    Transform that embeds a feature pool with UMAP.

    n_neighbors:
        This determines the number of neighboring points used in local approximations
        of manifold structure. Larger values will result in more global structure being
        preserved at the loss of detailed local structure.
        In general this parameter should often be in the range 5 to 50,
        with a choice of 10 to 15 being a sensible default.
    min_dist:
        This controls how tightly the embedding is allowed to compress points together.
        Larger values ensure embedded points are more evenly distributed, while smaller
        values allow the algorithm to optimise more accurately with regard to local structure.
        Sensible values are in the range 0.001 to 0.5, with 0.1 being a reasonable default.
    metric:
        This determines the choice of metric used to measure distance in the input space.
        A wide variety of metrics are already coded, and a user defined function can be passed
        as long as it has been JITd by numba.
    n_components, spread, random_state:
        Forwarded unchanged to the underlying UMAP estimator.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        # random_state used to be accepted but never passed on, so a seed
        # given by the caller had no effect; it is forwarded now.
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            random_state=random_state,
        )

    def transform(self, fp):
        """Fit UMAP on the pool's array and yield one Feature per embedding column.

        This is a generator: nothing (including the fit) runs until the
        result is iterated.
        """
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUmap: Done")

        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a two-column embedding pool.

        efp:
            Pool whose ``array()`` must have exactly two columns.
        split_by:
            Optional feature; its ``data`` colours the points and its
            ``name`` appears in the title.
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)
        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()

Example #16

Show file

File: mnist.py Project: yyht/reval_clustering

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from reval.best_nclust_cv import FindBestClustCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score
import matplotlib.pyplot as plt
from umap import UMAP
from reval.visualization import plot_metrics
from reval.relative_validation import _kuhn_munkres_algorithm

# MNIST dataset with 10 classes
mnist, label = fetch_openml('mnist_784', version=1, return_X_y=True)

# Stratified train and test subsets, each 10% of the data, with a fixed seed
mnist_tr, mnist_ts, label_tr, label_ts = train_test_split(
    mnist,
    label,
    train_size=0.1,
    test_size=0.1,
    random_state=42,
    stratify=label,
)

# Dimensionality reduction with UMAP as pre-processing step: the embedding
# is fitted on the training subset only and then applied to the test subset.
transform = UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=10,
    random_state=42,
)
mnist_tr = transform.fit_transform(mnist_tr)
mnist_ts = transform.transform(mnist_ts)

plt.scatter(mnist_tr[:, 0],

Example #17

Show file

def test_blobs_cluster():
    """KMeans on the UMAP embedding of five blobs must recover the blob labels exactly."""
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    reducer = UMAP(n_epochs=100)
    embedding = reducer.fit_transform(data)
    predicted = KMeans(5).fit_predict(embedding)
    assert adjusted_rand_score(labels, predicted) == 1.0

Example #18

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_densmap_frac(nn_data):
    """With densmap enabled, ``dens_frac`` values of -1.0 and 2.0 are each expected to raise ``ValueError`` on fit."""
    for bad_frac in (-1.0, 2.0):
        estimator = UMAP(densmap=True, dens_frac=bad_frac)
        assert_raises(ValueError, estimator.fit, nn_data)

Example #19

Show file

                for neighbors in n_neighbors:
                    for min_d in min_dist:
                        for k in ks:

                            print(dataset + ', ' + metric + ', ' + scaler +
                                  ', n_components=' + str(components) +
                                  ', n_neighbors=' + str(neighbors) +
                                  ', min_dist=' + str(min_d) + ', k=' + str(k))

                            # Se estandariza usando el scaler correspondiente
                            df = scalers[scaler].fit_transform(
                                datasets[dataset])

                            # Se aplica UMAP
                            um = UMAP(n_components=components,
                                      n_neighbors=neighbors,
                                      min_dist=min_d,
                                      metric=metric)
                            embedding = um.fit_transform(df)

                            # Se calculan las validaciones internas
                            sil = get_silhouette_avg(embedding, k)
                            sse = get_sse(embedding, k)

                            # Se aplica KMeans
                            km = KMeans(n_clusters=k,
                                        random_state=0).fit(embedding)

                            # Se calcula la matriz de confusión
                            tmp = pd.DataFrame({
                                'Generos': metadata.genre,
                                'data': km.labels_

Example #20

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_densmap_bad_output_metric(nn_data):
    """With densmap enabled, ``output_metric="haversine"`` is expected to raise ``ValueError`` on fit."""
    estimator = UMAP(densmap=True, output_metric="haversine")
    assert_raises(ValueError, estimator.fit, nn_data)

Example #21

Show file

# Alternative kept for reference: tag the full corpus instead of X.
#all_docs_tagged = [TaggedDocument(doc, [i]) for i, doc in all_docs.items()]
tagged_docs = [TaggedDocument(tokens, [key]) for key, tokens in X.items()]

# The message says "Train", but the model is loaded from model_path; the
# training calls are kept commented out below.
print("Train Doc2Vec model...")
#model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
#model.build_vocab(all_docs_tagged)
model = Doc2Vec.load(model_path)

# Infer one vector per document in X and collect them in a list.
print("Infer doc vectors...")
docvecs = list(X.progress_apply(lambda x: model.infer_vector(x)))

#docvecs = docvecs.to_numpy()
#print("dim reduction ...")
# First reduction: n_comps components.
dim_reduced_vecs = UMAP(
    metric="cosine",
    set_op_mix_ratio=0,
    n_components=n_comps,
    random_state=42,
).fit_transform(docvecs)

# Second reduction from the same vectors: 2 components.
print("dim reduction 2D ...")
vecs_2d = UMAP(
    metric="cosine",
    set_op_mix_ratio=0,
    n_components=2,
    random_state=42,
).fit_transform(docvecs)

#print("Local outlier factor ...")
#df["predicted"] = LocalOutlierFactor(
#    novelty=False, metric="euclidean", contamination=d["contamination"]).fit_predict(dim_reduced_vecs)

print("HDBSCAN ...")
#dim_reduced_vecs = normalize(dim_reduced_vecs, norm="l2")
clusterer = HDBSCAN(min_cluster_size=min_cluster_size,

Example #22

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_bad_n_jobs(nn_data):
    """``n_jobs`` values of -2 and 0 are each expected to raise ``ValueError`` on fit."""
    for bad_n_jobs in (-2, 0):
        estimator = UMAP(n_jobs=bad_n_jobs)
        assert_raises(ValueError, estimator.fit, nn_data)

Example #23

Show file

File: manuscript_examples.py Project: IIT-LAND/reval_clustering

def example2():
    """Run the relative-validation clustering example on MNIST.

    Steps, in order:

    1. Fetch ``mnist_784``, use the first 60000 rows for training and the
       rest for testing, and embed both into 2-D with a UMAP fitted on the
       training rows.
    2. Select the best number of clusters with ``FindBestClustCV``
       (k-nearest-neighbours classifier plus HDBSCAN), evaluate it on the
       test rows and log stability, cluster counts, AMI and class scores.
    3. Repeat the selection with the silhouette and the Davies-Bouldin
       score via ``select_best`` and log the results.
    4. Show one scatter plot per labelling.

    Returns nothing; all output goes to ``logging`` and to matplotlib
    windows.
    """
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(int)

    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    # Fit the embedding on the training rows only, then project the test rows.
    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = hdbscan.HDBSCAN(min_samples=10,
                        min_cluster_size=200)

    reval = FindBestClustCV(s=s,
                            c=c,
                            nfold=2,
                            nrand=10,
                            n_jobs=N_JOBS)

    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

    plot_metrics(metrics)

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")

    logging.info(f"Best number of clusters during CV: {nclustbest}")
    # Negative labels are excluded from the cluster count.
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')

    # Visualization
    _show_label_scatter(X_tr, y_tr, 'rainbow_r',
                        "Train set true labels (digits dataset)")
    _show_label_scatter(X_tr, kuhn_munkres_algorithm(y_tr, tr_lab), 'tab20',
                        "Train set predicted labels (digits dataset)")
    _show_label_scatter(X_ts, y_ts, 'tab20',
                        "Test set true labels (digits dataset)")
    _show_label_scatter(X_ts, perm_lab, 'tab20',
                        "Test set clustering labels (digits dataset)")

    # Internal measures
    # SILHOUETTE
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, c, davies_bouldin_score,
                                                       select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, c, davies_bouldin_score,
                                                       select='min')

    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Visualization
    _show_label_scatter(X_tr, sil_label_tr, 'tab20',
                        "Train set silhouette labels (digits dataset)")
    # The original built and attached the legend twice for this plot only.
    _show_label_scatter(X_ts, sil_label_ts, 'tab20',
                        "Test set silhouette labels (digits dataset)")
    _show_label_scatter(X_tr, db_label_tr, 'tab20',
                        "Train set Davies-Bouldin labels (digits dataset)")
    _show_label_scatter(X_ts, db_label_ts, 'tab20',
                        "Test set Davies-Bouldin labels (digits dataset)")


def _show_label_scatter(points, point_labels, cmap, title):
    """Show a scatter of the first two columns of ``points`` coloured by ``point_labels``, with a legend and ``title``."""
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(points[:, 0],
                         points[:, 1],
                         c=point_labels,
                         cmap=cmap,
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title(title)
    plt.show()

Example #24

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_too_large_op(nn_data):
    """Fitting with ``set_op_mix_ratio=1.5`` is expected to raise ``ValueError``."""
    estimator = UMAP(set_op_mix_ratio=1.5)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #25

Show file

File: manuscript_examples.py Project: IIT-LAND/reval_clustering

def example3(n_jobs, preprocess=None):
    """
    Select the best classifier/clustering combination for each UCI dataset.

    For every dataset returned by ``build_ucidatasets`` the data is split
    60/40 (stratified), the training part is optionally preprocessed, and
    ``SCParamSelection`` is fitted on it. Progress is written to ``logging``.

    :param n_jobs: forwarded to ``SCParamSelection`` as ``n_jobs``.
    :param preprocess: it can be 'scaled', 'umap', 'scaled+umap'
        (the spelling 'umap+scaled' is accepted as well; both scale first
        and then apply UMAP), default None for raw processing.
        Any other value also leaves the data unprocessed.
    :type preprocess: str
    :return: None; the best parameters per dataset are only collected in a
        local dictionary (see the commented-out dump at the end of the loop).
    """
    # Classifiers
    s = [LogisticRegression(solver='liblinear',
                            random_state=42),
         RandomForestClassifier(n_estimators=100,
                                random_state=42),
         KNeighborsClassifier(n_neighbors=1,
                              metric='euclidean'),
         SVC(C=1,
             random_state=42)]

    # Clustering
    c = [AgglomerativeClustering(),
         KMeans(random_state=42),
         hdbscan.HDBSCAN()]

    scparam = {'s': s,
               'c': c}

    transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)
    scale = StandardScaler()

    # Import benchmark datasets
    uci_data = build_ucidatasets()
    # Run ensemble learning algorithm
    best_results = {}
    for data, name in zip(uci_data, uci_data._fields):
        # The last classifier (SVC) gets gamma = 1 / number of rows of the dataset.
        scparam['s'][-1].gamma = (1 / data['data'].shape[0])
        nclass = len(np.unique(data['target']))
        logging.info(f"Processing dataset {name}")
        logging.info(f"True number of classes: {nclass}\n")
        X_tr, X_ts, y_tr, y_ts = train_test_split(data['data'],
                                                  data['target'],
                                                  test_size=0.40,
                                                  random_state=42,
                                                  stratify=data['target'])
        # The docstring always advertised 'scaled+umap' while the code only
        # matched 'umap+scaled', so the documented value was silently
        # ignored; both spellings are honoured now.
        if preprocess in ('umap+scaled', 'scaled+umap'):
            X_tr = transform.fit_transform(scale.fit_transform(X_tr))
        elif preprocess == 'umap':
            X_tr = transform.fit_transform(X_tr)
        elif preprocess == 'scaled':
            X_tr = scale.fit_transform(X_tr)

        scparam_select = SCParamSelection(sc_params=scparam,
                                          cv=2,
                                          nrand=10,
                                          clust_range=list(range(2, nclass + 3, 1)),
                                          n_jobs=n_jobs,
                                          iter_cv=10,
                                          strat=y_tr)
        scparam_select.fit(X_tr, nclass=nclass)
        best_results[name] = scparam_select.best_param_
        # Uncomment to save the results
        #     pkl.dump(best_results, open('./best_resultUCI_scaledumap.pkl', 'wb'))
        logging.info('*' * 100)
        logging.info('\n\n')

Example #26

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_negative_n_components(nn_data):
    """Fitting with ``n_components=-1`` is expected to raise ``ValueError``."""
    reducer = UMAP(n_components=-1)
    fit_call = reducer.fit
    assert_raises(ValueError, fit_call, nn_data)

Example #27

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_haversine_on_highd(nn_data):
    """Fitting ``nn_data`` with ``metric="haversine"`` is expected to raise.

    NOTE(review): presumably ``nn_data`` has more than two features, which
    is what the test name refers to -- confirm against the fixture.
    """
    assert_raises(ValueError, UMAP(metric="haversine").fit, nn_data)

Example #28

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_too_small_n_neighbours(nn_data):
    """Fitting with ``n_neighbors=0.5`` is expected to raise ``ValueError``."""
    estimator = UMAP(n_neighbors=0.5)
    assert_raises(ValueError, estimator.fit, nn_data)

Example #29

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_haversine_embed_to_highd(nn_data):
    """A 3-component embedding with a haversine output metric must not fit.

    The estimator is built with ``n_components=3`` and
    ``output_metric="haversine"``; calling ``fit`` is expected to raise
    ``ValueError``.
    """
    params = {"n_components": 3, "output_metric": "haversine"}
    assert_raises(ValueError, UMAP(**params).fit, nn_data)

Example #30

Show file

File: umap_transformer.py Project: cmagnusb/machine_learning_examples

from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Load the train/test split through the project's helper.
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: logistic regression fitted on the untransformed features.
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))


# Fit UMAP on the training set only, then project the test set with the
# already-fitted transformer; each step is timed separately.
umapper = UMAP(n_neighbors=5, n_components=10)

t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)

t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

# Same classifier, now trained on the 10-component UMAP embedding.
print("Score with transformation")
model = LogisticRegression()

t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)

print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))

Example #31

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_umap_too_many_neighbors_warns(nn_data):
    """Explicit ``a``/``b`` values are kept when ``n_neighbors`` is 2000.

    The model is fitted on at most the first 100 rows of ``nn_data``.
    Only the stored ``_a`` and ``_b`` attributes are checked; despite the
    test name, no warning is asserted here.
    """
    subset = nn_data[:100, ]
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    model.fit(subset)
    assert_equal(model._a, 1.2)
    assert_equal(model._b, 1.75)

Example #32

Show file

File: test_umap_validation_params.py Project: pifparfait/umap

def test_densmap_lambda(nn_data):
    """With ``densmap=True``, ``dens_lambda=-1.0`` is expected to be rejected."""
    densmap_model = UMAP(densmap=True, dens_lambda=-1.0)
    assert_raises(ValueError, densmap_model.fit, nn_data)

Example #33

Show file

File: model.py Project: nebw/temporal_nmf

    def get_factor_df(self,
                      ids=None,
                      embedding_dim=2,
                      batch_size=128,
                      valid_ages=None):
        """Collect per-day, per-entity factors into one long DataFrame.

        Entities are processed in batches of ``batch_size``. For each batch
        ``self.forward`` is called with every day index, and the fourth
        element of its return value is reshaped to ``(-1, self.num_factors)``.
        Rows are ordered day-major within a batch (all entities of the batch
        for day 0, then day 1, ...).

        Args:
            ids: Optional array indexed by entity position. When given, a
                leading ``"bee_id"`` column is added.
            embedding_dim: Number of ``e_*`` columns to append, or ``None``
                to append none. When equal to ``self.num_embeddings`` the
                absolute embedding weights are used as they are; otherwise
                they are projected with UMAP to ``embedding_dim`` components.
                NOTE(review): the shortcut compares against
                ``self.num_embeddings`` -- confirm that this is the width of
                the weight matrix.
            batch_size: Number of entities per ``forward`` call; must be
                positive.
            valid_ages: Optional 2-D array indexed ``[:, entity]``. When
                given, a leading ``"valid_age"`` column is added.

        Returns:
            A ``pandas.DataFrame`` with columns
            ``[valid_age,] [bee_id,] age, day, f_0..f_{k-1} [, e_0..]``,
            restricted to rows with ``age >= 0``. The index is reset before
            that filter, so surviving rows keep their pre-filter positions.

        Raises:
            ValueError: If ``batch_size`` is not positive (the batching loop
                would otherwise never advance).
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        if embedding_dim is not None:
            abs_embs = np.abs(self.embeddings.weight.data.cpu().numpy())
            if embedding_dim == self.num_embeddings:
                embs_reduced = abs_embs
            else:
                from umap import UMAP

                embs_reduced = UMAP(
                    n_components=embedding_dim).fit_transform(abs_embs)

        # Column names that do not depend on the batch.
        base_columns = ["age", "day"
                        ] + [f"f_{f}" for f in range(self.num_factors)]

        with torch.no_grad():
            idx = 0
            dfs = []

            all_temporal_idxs = torch.LongTensor(list(range(self.num_days)))
            while idx < self.num_entities:
                batch_idxs = torch.arange(
                    idx, min((idx + batch_size, self.num_entities)))
                # NumPy-side arrays are indexed with an ndarray instead of
                # the tensor: a one-element integer tensor supports
                # ``__index__``, so NumPy would treat it as a scalar index
                # and drop the batch axis for a trailing batch of size one.
                batch_np = batch_idxs.numpy()
                (
                    _,
                    _,
                    _,
                    factors_by_emb,
                    _,
                    _,
                ) = self.forward(all_temporal_idxs, batch_idxs)

                idx += batch_size

                bee_ages_flat = self.ages[:, batch_idxs].numpy().flatten()
                factors_flat = factors_by_emb.data.cpu().numpy().reshape(
                    -1, self.num_factors)
                # Day index repeated once per batch entity, day-major.
                day_flat = np.tile(
                    np.arange(self.num_days)[:, None],
                    (1, len(batch_np))).flatten()
                columns = list(base_columns)
                df_data = np.concatenate(
                    (bee_ages_flat[:, None], day_flat[:, None], factors_flat),
                    axis=-1)

                if ids is not None:
                    columns = ["bee_id"] + columns
                    ids_flat = np.tile(ids[batch_np][None, :],
                                       (self.num_days, 1)).flatten()
                    df_data = np.concatenate((ids_flat[:, None], df_data),
                                             axis=-1)

                if valid_ages is not None:
                    columns = ["valid_age"] + columns
                    valid_flat = valid_ages[:, batch_np].flatten()
                    df_data = np.concatenate((valid_flat[:, None], df_data),
                                             axis=-1)

                if embedding_dim is not None:
                    columns += [f"e_{f}" for f in range(embedding_dim)]
                    # Repeat the batch's embedding rows once per day so the
                    # row order matches the day-major layout used above.
                    embs_flat = np.tile(embs_reduced[batch_np],
                                        (self.num_days, 1)).reshape(
                                            -1, embedding_dim)
                    df_data = np.concatenate((df_data, embs_flat), axis=-1)

                factor_df = pd.DataFrame(df_data, columns=columns)
                dfs.append(factor_df)

            factor_df = pd.concat(dfs)

        factor_df.reset_index(inplace=True, drop=True)
        factor_df = factor_df[factor_df.age >= 0]

        return factor_df