Example #1
    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            random_state=random_state,  # forward random_state; it was accepted but never used
        )
def test_umap_negative_n_neighbours(nn_data):
    u = UMAP(n_neighbors=-1)
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_non_integer_n_components(nn_data):
    u = UMAP(n_components=1.5)
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_negative_min_dist(nn_data):
    u = UMAP(min_dist=-1)
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_bad_hellinger_data(nn_data):
    u = UMAP(metric="hellinger")
    assert_raises(ValueError, u.fit, -nn_data)
def test_umap_negative_op(nn_data):
    u = UMAP(set_op_mix_ratio=-1.0)
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_unique_and_precomputed(nn_data):
    u = UMAP(metric="precomputed", unique=True)
    assert_raises(ValueError, u.fit, nn_data)
def test_densmap_var_shift(nn_data):
    u = UMAP(densmap=True, dens_var_shift=-1.0)
    assert_raises(ValueError, u.fit, nn_data)
Example #9
def test_bad_transform_data(nn_data):
    u = UMAP().fit([[1, 1, 1, 1]])
    with pytest.raises(ValueError):
        u.transform([[0, 0, 0, 0]])
# Assumed imports for this excerpt: a Streamlit app that embeds spaCy word
# vectors with UMAP and plots them with Plotly (the spaCy model name is an assumption).
import spacy
import streamlit as st
from numpy import array
from plotly.express import scatter_3d
from sklearn.preprocessing import StandardScaler
from umap import UMAP

nlp = spacy.load("en_core_web_md")  # any spaCy model that provides word vectors

words = [
    'small', 'by', 'this', 'have', 'in', 'obviously', 'ten', 'those', 'vessel',
    'good', 'up', 'will', 'combination', 'rather', 'should', 'if', 'so',
    'plan', 'interesting', 'chat', 'let', 'now', 'imply', 'the', 'image',
    'information', 'get', 'particular', 'test', 'show', 'about', 'strong',
    'seventy', 'would', 'two', 'eighty', 'grey', 'at', 'last', 'always',
    'blood', 'on', 'first', 'light', 'can', 'point', 'family', 'take',
    'between', 'must', 'than', 'dr', 'honest', 'which', 'do', 'seem', 'an',
    'all', 'black', '10', ' ', '  ', '   ', 'johnson', 'gosh', 'when', 'far',
    'mean', 'with', 'absolutely', 'for', 'make', 'as', 'somewhere', 'screen',
    'true', '20', 'correct', 'into', 'specifically', '90', 'dark', 'start',
    'bottom', 'then', 'd', '100', 'out', 'line', 'where', 'pass', 'ct', 'i',
    'round', 'open', 'mrs', 'clog'
]
vectors = []
for word in words:
    vectors.append(nlp(word).vector)
vectors = array(vectors)
st.sidebar.subheader('Embedding Visualization')
n_words = st.sidebar.slider("Number of words", 1, len(words), 30)
words = array(words[:n_words])
vectors = vectors[:n_words, :]
reducer = UMAP(n_components=3)
scaled_data = StandardScaler().fit_transform(vectors)
embedding = reducer.fit_transform(scaled_data)
fig = scatter_3d(x=embedding[:, 0],
                 y=embedding[:, 1],
                 z=embedding[:, 2],
                 text=words,
                 hover_name=words)
st.plotly_chart(fig)
Example #11
distmat = nan_to_num(distmat)

distmat[distmat < 0] = 0.0
distmat = distmat / distmat.max()  # normalise to [0, 1]; .max() avoids the builtin max on a 2-D array
print(distmat.shape)

from umap import UMAP
from py.utils.safe_pickle import pickle_dump
import os

dirname = "../../../exact_embeddings/" + distname + "_" + dataset
if not os.path.exists(dirname):
    os.mkdir(dirname)

n_neighbors = 40
for n_components in [50, 100, 300, 1000]:
    for min_dist in [1.0, 1.5, 2.0]:
        for spread in [1.0, 2.5]:
            if min_dist > spread:
                continue

            print(n_components, n_neighbors, min_dist, spread)
            t = UMAP(n_components=n_components,
                     n_neighbors=n_neighbors,
                     min_dist=min_dist,
                     metric="precomputed",
                     random_state=42,
                     n_epochs=1000,
                     spread=spread)
            embeddings = t.fit_transform(distmat)
            print(embeddings.shape)
            pickle_dump(
                embeddings, dirname + "/" + str(n_components) + "-" +
                str(min_dist) + "-" + str(spread) + ".p")
def latent_scatter(var_unk_pred, y_unk_pred, acquisition, **kwargs):
    chems = kwargs['chems']
    chem2feature = kwargs['chem2feature']
    idx_obs = kwargs['idx_obs']
    idx_unk = kwargs['idx_unk']
    regress_type = kwargs['regress_type']
    prot_target = kwargs['prot_target']

    chem_idx_obs = sorted(set([i for i, _ in idx_obs]))
    chem_idx_unk = sorted(set([i for i, _ in idx_unk]))

    feature_obs = np.array([chem2feature[chems[i]] for i in chem_idx_obs])
    feature_unk = np.array([chem2feature[chems[i]] for i in chem_idx_unk])

    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=1).fit(feature_obs)
    dist = np.ravel(nbrs.kneighbors(feature_unk)[0])
    print('Distance Spearman rho = {}, P = {}'.format(
        *ss.spearmanr(dist, var_unk_pred)))
    print('Distance Pearson r = {}, P = {}'.format(
        *ss.pearsonr(dist, var_unk_pred)))

    X = np.vstack([feature_obs, feature_unk])
    labels = np.concatenate(
        [np.zeros(len(chem_idx_obs)),
         np.ones(len(chem_idx_unk))])
    sidx = np.argsort(-var_unk_pred)

    from fbpca import pca
    U, s, Vt = pca(
        X,
        k=3,
    )
    X_pca = U * s

    from umap import UMAP
    um = UMAP(
        n_neighbors=15,
        min_dist=0.5,
        n_components=2,
        metric='euclidean',
    )
    X_umap = um.fit_transform(X)

    from MulticoreTSNE import MulticoreTSNE as TSNE
    tsne = TSNE(
        n_components=2,
        n_jobs=20,
    )
    X_tsne = tsne.fit_transform(X)

    if prot_target is None:
        suffix = ''
    else:
        suffix = '_' + prot_target

    for name, coords in zip(
        ['pca', 'umap', 'tsne'],
        [X_pca, X_umap, X_tsne],
    ):
        plt.figure()
        sns.scatterplot(
            x=coords[labels == 1, 0],
            y=coords[labels == 1, 1],
            color='blue',
            alpha=0.1,
        )
        plt.scatter(
            x=coords[labels == 0, 0],
            y=coords[labels == 0, 1],
            color='orange',
            alpha=1.0,
            marker='x',
            linewidths=10,
        )
        plt.savefig('figures/latent_scatter_{}_ypred_{}{}.png'.format(
            name, regress_type, suffix),
                    dpi=300)
        plt.close()

        plt.figure()
        plt.scatter(x=coords[labels == 1, 0],
                    y=coords[labels == 1, 1],
                    c=ss.rankdata(var_unk_pred),
                    alpha=0.1,
                    cmap='coolwarm')
        plt.savefig('figures/latent_scatter_{}_var_{}{}.png'.format(
            name, regress_type, suffix),
                    dpi=300)
        plt.close()

        plt.figure()
        plt.scatter(x=coords[labels == 1, 0],
                    y=coords[labels == 1, 1],
                    c=-acquisition,
                    alpha=0.1,
                    cmap='hot')
        plt.savefig('figures/latent_scatter_{}_acq_{}{}.png'.format(
            name, regress_type, suffix),
                    dpi=300)
        plt.close()
Example #13
# Imports assumed for this excerpt (BERTopic's topic visualisation helper;
# _plotly_topic_visualization is a private helper defined in the same module):
from typing import List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP


def visualize_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.

    Usage:

    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    """
    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(
            topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [
        " | ".join([word[0] for word in topic_model.get_topic(topic)[:5]])
        for topic in topic_list
    ]

    # Embed c-TF-IDF into 2D
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topics])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2,
                      metric='hellinger').fit_transform(embeddings)

    # Visualize with plotly
    df = pd.DataFrame({
        "x": embeddings[1:, 0],
        "y": embeddings[1:, 1],
        "Topic": topic_list[1:],
        "Words": words[1:],
        "Size": frequencies[1:]
    })
    return _plotly_topic_visualization(df, topic_list, width, height)
Example #14
def plot_embedding(X,
                   labels,
                   classes=None,
                   method='tSNE',
                   cmap='tab20',
                   figsize=(4, 4),
                   markersize=4,
                   marker=None,
                   return_emb=False,
                   save=False,
                   save_emb=False,
                   show_legend=True,
                   show_axis_label=True,
                   **legend_params):

    if marker is not None:
        X = np.concatenate([X, marker], axis=0)
    N = len(labels)
    if X.shape[1] != 2:
        if method == 'tSNE':
            #from sklearn.manifold import TSNE
            from MulticoreTSNE import MulticoreTSNE as TSNE
            X = TSNE(n_components=2, random_state=124,
                     n_jobs=32).fit_transform(X)
        if method == 'UMAP':
            from umap import UMAP
            X = UMAP(n_neighbors=30, min_dist=0.3,
                     metric='correlation').fit_transform(X)
        if method == 'PCA':
            from sklearn.decomposition import PCA
            X = PCA(n_components=2, random_state=124).fit_transform(X)

    plt.figure(figsize=figsize)
    if classes is None:
        classes = np.unique(labels)

    if cmap is None:
        if len(classes) <= 10:
            cmap = 'tab10'
        elif len(classes) <= 20:
            cmap = 'tab20'
        else:
            cmap = 'husl'
    colors = sns.color_palette(cmap, n_colors=len(classes))

    for i, c in enumerate(classes):
        plt.scatter(X[:N][labels == c, 0],
                    X[:N][labels == c, 1],
                    s=markersize,
                    color=colors[i],
                    label=c)
    if marker is not None:
        plt.scatter(X[N:, 0],
                    X[N:, 1],
                    s=10 * markersize,
                    color='black',
                    marker='*')


#     plt.axis("off")

    legend_params_ = {
        'loc': 'center left',
        'bbox_to_anchor': (1.0, 0.45),
        'fontsize': 10,
        'ncol': 1,
        'frameon': False,
        'markerscale': 1.5
    }
    legend_params_.update(**legend_params)
    if show_legend:
        plt.legend(**legend_params_)
    sns.despine(offset=10, trim=True)
    if show_axis_label:
        plt.xlabel(method + ' dim 1', fontsize=12)
        plt.ylabel(method + ' dim 2', fontsize=12)

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()

    if save_emb:
        np.savetxt(save_emb, X)
    if return_emb:
        return X
Example #15
class TUmap(Transform):
    """
    n_neighbors:
        This determines the number of neighboring points used in local approximations
        of manifold structure. Larger values will result in more global structure being
        preserved at the loss of detailed local structure.
        In general this parameter should often be in the range 5 to 50,
        with a choice of 10 to 15 being a sensible default.
    min_dist:
        This controls how tightly the embedding is allowed to compress points together.
        Larger values ensure embedded points are more evenly distributed, while smaller
        values allow the algorithm to optimise more accurately with regard to local structure.
        Sensible values are in the range 0.001 to 0.5, with 0.1 being a reasonable default.
    metric:
        This determines the choice of metric used to measure distance in the input space.
        A wide variety of metrics are already coded, and a user-defined function can be passed
        as long as it has been JIT-compiled by numba (see the sketch after this class).
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            random_state=random_state,  # forward random_state; it was accepted but never used
        )


    def transform(self, fp):
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUamp: Done")

        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        x = efp.array()
        assert x.shape[1] == 2, "Embedding must be 2-dimensional to plot, got {}".format(x.shape[1])
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)
        if split_by is not None:
            d = split_by.data
            ax.scatter(x[:, 0], x[:, 1], c=d, alpha=0.5)
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
        if split_by is not None:
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()
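As the TUmap docstring notes, a user-defined distance function can be passed as the metric so long as it is numba JIT-compiled. A minimal standalone sketch (assuming numba and umap-learn are installed; the metric name and toy data are illustrative):

import numba
import numpy as np
from umap import UMAP

@numba.njit()
def manhattan(x, y):
    # plain L1 distance between two feature vectors; numba-compiled so UMAP can call it
    total = 0.0
    for i in range(x.shape[0]):
        total += abs(x[i] - y[i])
    return total

data = np.random.RandomState(42).normal(size=(100, 8))
embedding = UMAP(metric=manhattan, n_neighbors=10, min_dist=0.1).fit_transform(data)
print(embedding.shape)  # (100, 2)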
Example #16
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from reval.best_nclust_cv import FindBestClustCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score
import matplotlib.pyplot as plt
from umap import UMAP
from reval.visualization import plot_metrics
from reval.relative_validation import _kuhn_munkres_algorithm

# MNIST dataset with 10 classes
mnist, label = fetch_openml('mnist_784', version=1, return_X_y=True)
transform = UMAP(n_neighbors=30,
                 min_dist=0.0,
                 n_components=10,
                 random_state=42)

# Stratified subsets of 7000 elements for both training and test set
mnist_tr, mnist_ts, label_tr, label_ts = train_test_split(mnist,
                                                          label,
                                                          train_size=0.1,
                                                          test_size=0.1,
                                                          random_state=42,
                                                          stratify=label)

# Dimensionality reduction with UMAP as pre-processing step
mnist_tr = transform.fit_transform(mnist_tr)
mnist_ts = transform.transform(mnist_ts)

plt.scatter(mnist_tr[:, 0],
            mnist_tr[:, 1],
            c=label_tr.astype(int),  # assumed completion; the original excerpt is truncated here
            s=0.1,
            cmap='rainbow_r')
plt.show()
Example #17
def test_blobs_cluster():
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    embedding = UMAP(n_epochs=100).fit_transform(data)
    assert adjusted_rand_score(labels, KMeans(5).fit_predict(embedding)) == 1.0
def test_densmap_frac(nn_data):
    u = UMAP(densmap=True, dens_frac=-1.0)
    assert_raises(ValueError, u.fit, nn_data)
    u = UMAP(densmap=True, dens_frac=2.0)
    assert_raises(ValueError, u.fit, nn_data)
Example #19
                for neighbors in n_neighbors:
                    for min_d in min_dist:
                        for k in ks:

                            print(dataset + ', ' + metric + ', ' + scaler +
                                  ', n_components=' + str(components) +
                                  ', n_neighbors=' + str(neighbors) +
                                  ', min_dist=' + str(min_d) + ', k=' + str(k))

                            # Standardise with the corresponding scaler
                            df = scalers[scaler].fit_transform(
                                datasets[dataset])

                            # Apply UMAP
                            um = UMAP(n_components=components,
                                      n_neighbors=neighbors,
                                      min_dist=min_d,
                                      metric=metric)
                            embedding = um.fit_transform(df)

                            # Compute the internal validation measures
                            sil = get_silhouette_avg(embedding, k)
                            sse = get_sse(embedding, k)

                            # Apply KMeans
                            km = KMeans(n_clusters=k,
                                        random_state=0).fit(embedding)

                            # Build the confusion matrix
                            tmp = pd.DataFrame({
                                'Generos': metadata.genre,
                                'data': km.labels_
def test_densmap_bad_output_metric(nn_data):
    u = UMAP(densmap=True, output_metric="haversine")
    assert_raises(ValueError, u.fit, nn_data)
Example #21
#all_docs_tagged = [TaggedDocument(doc, [i]) for i, doc in all_docs.items()]
tagged_docs = [TaggedDocument(doc, [i]) for i, doc in X.items()]

print("Train Doc2Vec model...")
#model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
#model.build_vocab(all_docs_tagged)
model = Doc2Vec.load(model_path)

print("Infer doc vectors...")
docvecs = X.progress_apply(lambda x: model.infer_vector(x))
docvecs = list(docvecs)

#docvecs = docvecs.to_numpy()
#print("dim reduction ...")
dim_reduced_vecs = UMAP(metric="cosine",
                        set_op_mix_ratio=0,
                        n_components=n_comps,
                        random_state=42).fit_transform(docvecs)

print("dim reduction 2D ...")
vecs_2d = UMAP(metric="cosine",
               set_op_mix_ratio=0,
               n_components=2,
               random_state=42).fit_transform(docvecs)

#print("Local outlier factor ...")
#df["predicted"] = LocalOutlierFactor(
#    novelty=False, metric="euclidean", contamination=d["contamination"]).fit_predict(dim_reduced_vecs)

print("HDBSCAN ...")
#dim_reduced_vecs = normalize(dim_reduced_vecs, norm="l2")
clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
def test_umap_bad_n_jobs(nn_data):
    u = UMAP(n_jobs=-2)
    assert_raises(ValueError, u.fit, nn_data)
    u = UMAP(n_jobs=0)
    assert_raises(ValueError, u.fit, nn_data)
def example2():
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(int)

    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = hdbscan.HDBSCAN(min_samples=10,
                        min_cluster_size=200)

    reval = FindBestClustCV(s=s,
                            c=c,
                            nfold=2,
                            nrand=10,
                            n_jobs=N_JOBS)

    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

    plot_metrics(metrics)

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")

    logging.info(f"Best number of clusters during CV: {nclustbest}")
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=y_tr, cmap='rainbow_r',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=kuhn_munkres_algorithm(y_tr, tr_lab),
                         cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set predicted labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=y_ts, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=perm_lab, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()

    # Internal measures
    # SILHOUETTE
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, c, davies_bouldin_score,
                                                       select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, c, davies_bouldin_score,
                                                       select='min')

    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=sil_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=sil_label_ts, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=db_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set Davies-Bouldin labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=db_label_ts, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set Davies-Bouldin labels (digits dataset)")
    plt.show()
def test_umap_too_large_op(nn_data):
    u = UMAP(set_op_mix_ratio=1.5)
    assert_raises(ValueError, u.fit, nn_data)
def example3(n_jobs, preprocess=None):
    """
    :param preprocess: it can be 'scaled',
        'umap', 'scaled+umap', default None for raw processing.
    :type preprocess: str
    :return:
    """
    # Example 4: best classifier/clustering for UCI dataset

    # Classifiers
    s = [LogisticRegression(solver='liblinear',
                            random_state=42),
         RandomForestClassifier(n_estimators=100,
                                random_state=42),
         KNeighborsClassifier(n_neighbors=1,
                              metric='euclidean'),
         SVC(C=1,
             random_state=42)]

    # Clustering
    c = [AgglomerativeClustering(),
         KMeans(random_state=42),
         hdbscan.HDBSCAN()]

    scparam = {'s': s,
               'c': c}

    transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)
    scale = StandardScaler()

    # Import benchmark datasets
    uci_data = build_ucidatasets()
    # Run ensemble learning algorithm
    best_results = {}
    for data, name in zip(uci_data, uci_data._fields):
        scparam['s'][-1].gamma = (1 / data['data'].shape[0])
        nclass = len(np.unique(data['target']))
        logging.info(f"Processing dataset {name}")
        logging.info(f"True number of classes: {nclass}\n")
        X_tr, X_ts, y_tr, y_ts = train_test_split(data['data'],
                                                  data['target'],
                                                  test_size=0.40,
                                                  random_state=42,
                                                  stratify=data['target'])
        if preprocess == 'umap+scaled':
            X_tr = transform.fit_transform(scale.fit_transform(X_tr))
        elif preprocess == 'umap':
            X_tr = transform.fit_transform(X_tr)
        elif preprocess == 'scaled':
            X_tr = scale.fit_transform(X_tr)

        scparam_select = SCParamSelection(sc_params=scparam,
                                          cv=2,
                                          nrand=10,
                                          clust_range=list(range(2, nclass + 3, 1)),
                                          n_jobs=n_jobs,
                                          iter_cv=10,
                                          strat=y_tr)
        scparam_select.fit(X_tr, nclass=nclass)
        best_results[name] = scparam_select.best_param_
        # Uncomment to save the results
        #     pkl.dump(best_results, open('./best_resultUCI_scaledumap.pkl', 'wb'))
        logging.info('*' * 100)
        logging.info('\n\n')
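A hypothetical driver for example3, exercising the preprocess options described in its docstring (the call values are illustrative, not from the original script):

if __name__ == '__main__':
    for preprocess in (None, 'scaled', 'umap', 'umap+scaled'):
        example3(n_jobs=4, preprocess=preprocess)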
def test_umap_negative_n_components(nn_data):
    u = UMAP(n_components=-1)
    assert_raises(ValueError, u.fit, nn_data)
def test_haversine_on_highd(nn_data):
    u = UMAP(metric="haversine")
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_too_small_n_neighbours(nn_data):
    u = UMAP(n_neighbors=0.5)
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_haversine_embed_to_highd(nn_data):
    u = UMAP(n_components=3, output_metric="haversine")
    assert_raises(ValueError, u.fit, nn_data)
from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# get the data
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))


umapper = UMAP(n_neighbors=5, n_components=10)
t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)
t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))
def test_umap_too_many_neighbors_warns(nn_data):
    u = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    u.fit(nn_data[:100, ])
    assert_equal(u._a, 1.2)
    assert_equal(u._b, 1.75)
def test_densmap_lambda(nn_data):
    u = UMAP(densmap=True, dens_lambda=-1.0)
    assert_raises(ValueError, u.fit, nn_data)
Example #33
    def get_factor_df(self,
                      ids=None,
                      embedding_dim=2,
                      batch_size=128,
                      valid_ages=None):
        if embedding_dim is not None:
            if embedding_dim == self.num_embeddings:
                embs_reduced = np.abs(
                    self.embeddings.weight.data.cpu().numpy())
            else:
                from umap import UMAP

                embs_reduced = UMAP(n_components=embedding_dim).fit_transform(
                    np.abs(self.embeddings.weight.data.cpu().numpy()))

        with torch.no_grad():
            idx = 0
            dfs = []

            all_temporal_idxs = torch.LongTensor(list(range(self.num_days)))
            while idx < self.num_entities:
                batch_idxs = torch.arange(
                    idx, min((idx + batch_size, self.num_entities)))
                (
                    _,
                    _,
                    _,
                    factors_by_emb,
                    _,
                    _,
                ) = self.forward(all_temporal_idxs, batch_idxs)

                idx += batch_size

                bee_ages_flat = self.ages[:, batch_idxs].numpy().flatten()
                factors_flat = factors_by_emb.data.cpu().numpy().reshape(
                    -1, self.num_factors)
                day_flat = np.tile(
                    np.arange(self.num_days)[:, None],
                    (1, len(batch_idxs))).flatten()
                columns = ["age", "day"
                           ] + [f"f_{f}" for f in range(self.num_factors)]
                df_data = np.concatenate(
                    (bee_ages_flat[:, None], day_flat[:, None], factors_flat),
                    axis=-1)

                if ids is not None:
                    columns = ["bee_id"] + columns
                    ids_flat = np.tile(ids[batch_idxs][None, :],
                                       (self.num_days, 1)).flatten()
                    df_data = np.concatenate((ids_flat[:, None], df_data),
                                             axis=-1)

                if valid_ages is not None:
                    columns = ["valid_age"] + columns
                    valid_flat = valid_ages[:, batch_idxs].flatten()
                    df_data = np.concatenate((valid_flat[:, None], df_data),
                                             axis=-1)

                if embedding_dim is not None:
                    columns += [f"e_{f}" for f in range(embedding_dim)]
                    embs_flat = np.tile(embs_reduced[batch_idxs][None, :],
                                        (self.num_days, 1)).reshape(
                                            -1, embedding_dim)
                    df_data = np.concatenate((df_data, embs_flat), axis=-1)

                factor_df = pd.DataFrame(df_data, columns=columns)
                dfs.append(factor_df)

            factor_df = pd.concat(dfs)

        factor_df.reset_index(inplace=True, drop=True)
        factor_df = factor_df[factor_df.age >= 0]

        return factor_df