Example #1
0
def test_basic_dimensions_3d_chart(embset):
    """Check default axis labels and annotations of an index-based 3D plot.

    The embedding set is reduced with ``Pca(3)`` and plotted without naming
    any axes, so the labels are expected to be ``Dimension 0/1/2`` and the
    text annotations are expected to match the module-level ``words`` list.
    A title is passed to ``plot_3d`` but is not asserted on here.
    """
    reduced = embset.transform(Pca(3))
    ax = reduced.plot_3d(annot=True, title="foobar")

    axis_labels = [axis.get_label_text() for axis in (ax.xaxis, ax.yaxis, ax.zaxis)]
    assert axis_labels == ["Dimension 0", "Dimension 1", "Dimension 2"]

    annotations = [text.get_text() for text in ax.texts]
    assert annotations == words
Example #2
0
def test_named_dimensions_3d_chart_rename(embset):
    """Check that explicit axis labels override the names of the plotted axes.

    The x and y axes are given custom labels, while the z axis gets none and
    is expected to keep the name it was plotted with (``"prince"``). The text
    annotations are expected to match the module-level ``words`` list.
    """
    reduced = embset.transform(Pca(3))
    ax = reduced.plot_3d(
        "king",
        "queen",
        "prince",
        annot=True,
        x_label="x",
        y_label="y",
    )

    axis_labels = [axis.get_label_text() for axis in (ax.xaxis, ax.yaxis, ax.zaxis)]
    assert axis_labels == ["x", "y", "prince"]

    annotations = [text.get_text() for text in ax.texts]
    assert annotations == words
Example #3
0
def test_correct_points_plotted_mapped(embset):
    """Check the plotted coordinates when axes are named after embeddings.

    Each plotted axis is compared element-wise against the values obtained by
    applying the embedding ``>`` operator between every word and the anchor
    embedding the axis is named after.
    NOTE(review): ``>`` is presumably whatlies' mapping operator -- confirm
    against the Embedding class.
    """
    reduced = embset.transform(Pca(3))
    ax = reduced.plot_3d("king", "red", "dog", annot=True)

    # The scatter collection keeps its 3D coordinates in `_offsets3d`;
    # unpacking requires exactly three rows, one per plotted axis.
    king_axis, red_axis, dog_axis = np.array(ax.collections[0]._offsets3d)

    plotted_by_anchor = (("king", king_axis), ("red", red_axis), ("dog", dog_axis))
    for anchor, plotted in plotted_by_anchor:
        expected = np.array([reduced[w] > reduced[anchor] for w in words])
        assert np.all(plotted == expected)
import pytest
from spacy.vocab import Vocab
from spacy.language import Language
from whatlies.language import SpacyLanguage
from whatlies.transformers import Umap, Pca, Noise, AddRandom, Tsne, OpenTsne


# Load the vocabulary stored with the tests and embed every string in it.
vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = list(vocab.strings)
lang = SpacyLanguage(nlp=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]

# Each transformer sits next to the value listed for it in `extra_sizes`.
# NOTE(review): presumably the number of extra embeddings each transformer
# adds -- confirm against the parametrized test that consumes these lists.
_tfm_cases = [
    (Umap(2), 2),
    (Umap(3), 3),
    (Pca(2), 2),
    (Pca(3), 3),
    (Noise(0.1), 0),
    (Noise(0.01), 0),
    (AddRandom(n=4), 4),
    (AddRandom(n=1), 1),
    (lambda d: d | (d["man"] - d["woman"]), 0),
    (Tsne(2, n_iter=250), 2),
    (Tsne(3, n_iter=250), 3),
    (OpenTsne(2, n_iter=100), 2),
]
transformers = [tfm for tfm, _ in _tfm_cases]
extra_sizes = [size for _, size in _tfm_cases]
# Ids come from the class name, so the lambda entry is labelled "function".
tfm_ids = [tfm.__class__.__name__ for tfm in transformers]


@pytest.mark.parametrize(
Example #5
0
# Sidebar control choosing which dimensionality reduction feeds the chart.
reduction_method = st.sidebar.selectbox("Reduction Method", ("Umap", "Pca"))
if reduction_method == "Umap":
    # UMAP rejects n_neighbors below 2 ("n_neighbors must be greater than 1"),
    # so the slider starts at 2 to keep the lowest position from crashing.
    n_neighbors = st.sidebar.slider(
        "Number of UMAP neighbors", min_value=2, max_value=100, value=15, step=1
    )
    min_dist = st.sidebar.slider(
        "Minimum Distance for UMAP",
        min_value=0.01,
        max_value=0.99,
        value=0.8,
        step=0.01,
    )
    # Both slider values are forwarded to the Umap transformer as keyword arguments.
    reduction = Umap(2, n_neighbors=n_neighbors, min_dist=min_dist)
else:
    # The only other selectbox option is "Pca", which needs no extra settings.
    reduction = Pca(2)

# Page heading followed by two introductory paragraphs. Adjacent string
# literals are joined at compile time, so each call receives one string.
st.markdown("# Simple Text Clustering")
st.markdown(
    "Let's say you've gotten a lot of feedback from clients on different channels. "
    "You might like to be able to distill main topics and get an overview. "
    "It might even inspire some intents that will be used in a virtual assistant!"
)
st.markdown(
    "This tool will help you discover them. "
    "This app will attempt to cluster whatever text you give it. "
    "The chart will try to clump text together and you can explore underlying patterns."
)

# When the "CountVector SVD" method is selected, build a CountVectorLanguage
# from `n_svd` and the n-gram bounds, then index it with `texts` to get `embset`.
# NOTE(review): `method`, `n_svd`, `min_ngram`, `max_ngram` and `texts` are
# defined outside this excerpt -- presumably sidebar inputs; confirm upstream.
if method == "CountVector SVD":
    lang = CountVectorLanguage(n_svd, ngram_range=(min_ngram, max_ngram))
    embset = lang[texts]
if method == "Lite Sentence Encoding":
    embset = EmbeddingSet(
        *[
Example #6
0
def test_correct_points_plotted(embset):
    """Check that an index-based 3D plot draws the reduced embedding matrix.

    The scatter collection's stored 3D offsets, transposed, are expected to
    equal ``to_X()`` of the ``Pca(3)``-reduced embedding set element-wise.
    """
    reduced = embset.transform(Pca(3))
    ax = reduced.plot_3d(annot=True)

    plotted = np.array(ax.collections[0]._offsets3d)
    expected = reduced.to_X()
    assert np.all(plotted.T == expected)