コード例 #1
0
def test_assign_arrays_raise_error():
    """`assign` is expected to raise ``ValueError`` for the mismatched arrays below."""
    first = Embedding("foo", [0.1, 0.3, 0.10])
    second = Embedding("bar", [0.7, 0.2, 0.11])
    two_items = EmbeddingSet(first, second)
    # Two embeddings in the set, but ``prop_b`` carries three values.
    mismatched = {"prop_a": ["a", "b"], "prop_b": np.array([1, 2, 3])}
    with pytest.raises(ValueError):
        two_items.assign(**mismatched)
コード例 #2
0
ファイル: _transformer.py プロジェクト: cirrushuet/whatlies
 def transform(self, embset: EmbeddingSet) -> EmbeddingSet:
     """
     Apply the wrapped transformer (`self.tfm`) to the vectors of `embset`.

     When `self.is_fitted` is falsy the wrapped transformer is fitted on the
     vectors of this set first; the flag itself is not modified here.

     Arguments:
         embset: the embeddings to transform

     Returns:
         A new EmbeddingSet named `"<embset.name>.<self.name>()"`.
     """
     labels, matrix = embset.to_names_X()
     if not self.is_fitted:
         self.tfm.fit(matrix)
     transformed = self.tfm.transform(matrix)
     rebuilt = new_embedding_dict(labels, transformed, embset)
     result_name = f"{embset.name}.{self.name}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #3
0
def test_assign_literal_values():
    """Literal values passed to `assign` are expected on every embedding of the result."""
    vec_foo = Embedding("foo", [0.1, 0.3, 0.10])
    vec_bar = Embedding("bar", [0.7, 0.2, 0.11])
    assigned = EmbeddingSet(vec_foo, vec_bar).assign(prop_a="prop-one", prop_b=1)
    assert all(item.prop_a == "prop-one" for item in assigned)
    assert all(item.prop_b == 1 for item in assigned)
コード例 #4
0
def test_assign():
    """Callables passed to `assign` are expected to yield the property value of each embedding."""
    vec_foo = Embedding("foo", [0.1, 0.3, 0.10])
    vec_bar = Embedding("bar", [0.7, 0.2, 0.11])
    base = EmbeddingSet(vec_foo, vec_bar)
    assigned = base.assign(
        prop_a=lambda d: "prop-one",
        prop_b=lambda d: "prop-two",
    )
    assert all(item.prop_a == "prop-one" for item in assigned)
    assert all(item.prop_b == "prop-two" for item in assigned)
コード例 #5
0
def test_assign_arrays():
    """Sequence values passed to `assign` are expected to be spread element-wise over the set."""
    vec_foo = Embedding("foo", [0.1, 0.3, 0.10])
    vec_bar = Embedding("bar", [0.7, 0.2, 0.11])
    assigned = EmbeddingSet(vec_foo, vec_bar).assign(
        prop_a=["a", "b"],
        prop_b=np.array([1, 2]),
    )
    # The first element belongs to "foo", the second to "bar".
    assert assigned["foo"].prop_a == "a"
    assert assigned["bar"].prop_a == "b"
    assert assigned["foo"].prop_b == 1
    assert assigned["bar"].prop_b == 2
コード例 #6
0
def test_to_x_y():
    """`to_X_y` is expected to return the vectors plus the requested property as labels."""
    group_one = EmbeddingSet(
        Embedding("foo", [0.1, 0.3]),
        Embedding("bar", [0.7, 0.2]),
    ).add_property("label", lambda d: "group-one")
    group_two = EmbeddingSet(
        Embedding("buz", [0.1, 0.9]),
        Embedding("bla", [0.2, 0.8]),
    ).add_property("label", lambda d: "group-two")
    merged = group_one.merge(group_two)

    X, y = merged.to_X_y(y_label="label")

    # Four embeddings of dimension two, labelled by their originating group.
    assert X.shape == merged.to_X().shape == (4, 2)
    assert list(y) == ["group-one", "group-one", "group-two", "group-two"]
コード例 #7
0
def test_embeddingset_creation():
    """An EmbeddingSet can be built from one embedding, several embeddings, or a dict."""
    foo = Embedding("foo", [0, 1])
    bar = Embedding("bar", [1, 1])

    # A single positional embedding.
    single = EmbeddingSet(foo)
    assert len(single) == 1
    assert "foo" in single

    # Several positional embeddings.
    pair = EmbeddingSet(foo, bar)
    assert len(pair) == 2
    assert "foo" in pair
    assert "bar" in pair

    # A name -> embedding mapping.
    from_mapping = EmbeddingSet({"foo": foo})
    assert len(from_mapping) == 1
    assert "foo" in from_mapping
コード例 #8
0
def test_from_names_X():
    """`from_names_X` builds a set from names and rows; a length mismatch is expected to raise."""
    names = ["foo", "bar", "buz"]
    X = [[1.0, 2], [3, 4.0], [0.5, 0.6]]

    built = EmbeddingSet.from_names_X(names, X)
    assert "foo" in built
    assert len(built) == 3
    assert np.array_equal(built.to_X(), np.array(X))

    # Only two names for three rows.
    too_few_names = names[:2]
    with pytest.raises(ValueError, match="The number of given names"):
        EmbeddingSet.from_names_X(too_few_names, X)
コード例 #9
0
ファイル: opentsne.py プロジェクト: Agrover112/whatlies
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.emb` and append one axis embedding per component.

     The returned set holds the transformed input vectors followed by the
     rows of an identity matrix, named `tsne_0`, `tsne_1`, ...

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.tsne_<n_components>()"`.
     """
     labels, matrix = embset.to_names_X()
     projected = np.array(self.emb.transform(matrix))
     all_labels = labels + [f"tsne_{i}" for i in range(self.n_components)]
     # np.eye contributes one unit vector per component.
     all_vectors = np.concatenate([projected, np.eye(self.n_components)])
     rebuilt = new_embedding_dict(all_labels, all_vectors, embset)
     result_name = f"{embset.name}.tsne_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #10
0
ファイル: _transformer.py プロジェクト: cirrushuet/whatlies
 def fit(self, embset: EmbeddingSet) -> "SklearnTransformer":
     """
     Fit the wrapped estimator on the vectors of `embset` unless it is already marked as fitted.

     Arguments:
         embset: the embeddings whose vectors (`embset.to_X()`) are used for fitting

     Returns:
         This transformer itself.
     """
     needs_fit = not self.is_fitted
     if needs_fit:
         # Storing the result of `.fit()` back on `self.tfm` is a bit of an
         # anti-pattern and should normally be unnecessary. It is done because
         # some packages, like OpenTSNE, return a different kind of estimator
         # once an estimator has been fitted.
         estimator = self.tfm
         self.tfm = estimator.fit(embset.to_X())
     self.is_fitted = True
     return self
コード例 #11
0
def embset():
    """Build an EmbeddingSet of five colour names with random 4-dimensional vectors in [-5, 5)."""
    names = ["red", "blue", "green", "yellow", "white"]
    # rand() samples from [0, 1); scale and shift to [-5, 5).
    vectors = np.random.rand(5, 4) * 10 - 5
    return EmbeddingSet(
        *(Embedding(label, row) for label, row in zip(names, vectors))
    )
コード例 #12
0
ファイル: bpemblang.py プロジェクト: Raghibshams456/whatlies
    def embset_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        lower=False,
        metric="cosine",
    ) -> EmbeddingSet:
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] holding the embeddings most similar to the passed query.

        Arguments:
            emb: query to use
            n: the number of items you'd like to see returned
            lower: only fetch lower case tokens
            metric: metric to use to calculate distance, must be scipy or sklearn compatible

        Important:
            This method is incredibly slow at the moment without a good `top_n` setting due to
            [this bug](https://github.com/facebookresearch/fastText/issues/1040).

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
        """
        scored = self.score_similar(emb=emb, n=n, lower=lower, metric=metric)
        # Only the first element of every scored entry (the embedding) is kept.
        similar = [entry[0] for entry in scored]
        by_name = {match.name: match for match in similar}
        return EmbeddingSet(by_name)
コード例 #13
0
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.tfm` and append one axis embedding per component.

     The returned set holds the transformed input vectors followed by the
     rows of an identity matrix, named `pca_0`, `pca_1`, ...

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.pca_<n_components>()"`.
     """
     labels, matrix = embset_to_X(embset=embset)
     projected = self.tfm.transform(matrix)
     all_labels = labels + [f"pca_{i}" for i in range(self.n_components)]
     # np.eye contributes one unit vector per component.
     all_vectors = np.concatenate([projected, np.eye(self.n_components)])
     rebuilt = new_embedding_dict(all_labels, all_vectors, embset)
     result_name = f"{embset.name}.pca_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #14
0
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.tfm`, ignoring `NumbaPerformanceWarning`.

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.umap_<n_components>()"`.
     """
     labels, matrix = embset.to_names_X()
     with warnings.catch_warnings():
         # The filter only applies inside this `with` block.
         warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
         reduced = self.tfm.transform(matrix)
     rebuilt = new_embedding_dict(labels, reduced, embset)
     result_name = f"{embset.name}.umap_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #15
0
 def transform(self, embset):
     """
     Fit `self.tfm` on the vectors of `embset` and return the resulting embeddings.

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.tsne(<n_components>)"`.
     """
     labels, matrix = embset.to_names_X()
     # `fit_transform` is used on purpose: TSNE cannot do `.fit()` followed by
     # `.transform()`. See the docs:
     # https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE
     projected = self.tfm.fit_transform(matrix)
     rebuilt = new_embedding_dict(labels, projected, embset)
     result_name = f"{embset.name}.tsne({self.n_components})"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #16
0
ファイル: noise.py プロジェクト: Agrover112/whatlies
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.tfm` after reseeding NumPy's global random state.

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet that carries the (string-formatted) name of the input set.
     """
     labels, matrix = embset.to_names_X()
     # Reseeding happens on every call, right before the transform.
     np.random.seed(self.seed)
     noisy = self.tfm.transform(matrix)
     rebuilt = new_embedding_dict(labels, noisy, embset)
     return EmbeddingSet(rebuilt, name=f"{embset.name}")
コード例 #17
0
ファイル: umap.py プロジェクト: sfahad1414/whatlies
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.tfm` and append one axis embedding per component.

     `NumbaPerformanceWarning` is ignored while transforming. The returned set
     holds the transformed input vectors followed by the rows of an identity
     matrix, named `umap_0`, `umap_1`, ...

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.umap_<n_components>()"`.
     """
     labels, matrix = embset_to_X(embset=embset)
     with warnings.catch_warnings():
         # The filter only applies inside this `with` block.
         warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
         reduced = self.tfm.transform(matrix)
     all_labels = labels + [f"umap_{i}" for i in range(self.n_components)]
     # np.eye contributes one unit vector per component.
     all_vectors = np.concatenate([reduced, np.eye(self.n_components)])
     rebuilt = new_embedding_dict(all_labels, all_vectors, embset)
     result_name = f"{embset.name}.umap_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #18
0
 def transform(self, embset):
     """
     Return the embeddings of `embset` together with `self.n` randomly generated ones.

     The generated embeddings are named `rand_0`, `rand_1`, ... Each vector is
     drawn from a normal distribution with mean 0 and standard deviation
     `self.sigma` and has as many entries as the input matrix has columns.
     NumPy's global random state is reseeded with `self.seed` beforehand.

     Arguments:
         embset: the embeddings to extend

     Returns:
         An EmbeddingSet holding the original and the generated embeddings.
     """
     # Only the matrix is needed here (for its column count).
     _, matrix = embset.to_names_X()
     np.random.seed(self.seed)
     originals = embset.embeddings.copy()
     extras = {}
     for k in range(self.n):
         extras[f"rand_{k}"] = Embedding(
             f"rand_{k}", np.random.normal(0, self.sigma, matrix.shape[1])
         )
     # On a name clash the generated embedding replaces the original one.
     return EmbeddingSet({**originals, **extras})
コード例 #19
0
ファイル: helpers.py プロジェクト: RasaHQ/whatlies
def reverse_strings(embset):
    """
    Build a new EmbeddingSet in which the name of every embedding is reversed
    while its vector is kept. This can be useful for making matplotlib plots
    with Arabic texts.

    This helper is meant to be used via `EmbeddingSet.pipe()`.

    Arguments:
        embset: EmbeddingSet to adapt

    Usage:

    ```python
    from whatlies.helpers import reverse_strings
    from whatlies.language import BytePairLanguage

    translation = {
       "man":"رجل",
       "woman":"امرأة",
       "king":"ملك",
       "queen":"ملكة",
       "brother":"أخ",
       "sister":"أخت",
       "cat":"قطة",
       "dog":"كلب",
       "lion":"أسد",
       "puppy":"جرو",
       "male student":"طالب",
       "female student":"طالبة",
       "university":"جامعة",
       "school":"مدرسة",
       "kitten":" قطة صغيرة",
       "apple" : "تفاحة",
       "orange" : "برتقال",
       "cabbage" : "كرنب",
       "carrot" : "جزرة"
    }

    lang_cv  = BytePairLanguage("ar")

    arabic_words = list(translation.values())

    # before
    lang_cv[arabic_words].plot_similarity()

    # after
    lang_cv[arabic_words].pipe(reverse_strings).plot_similarity()
    ```

    ![](https://koaning.github.io/whatlies/images/arabic-before-after.png)
    """
    # Only `name` and `vector` are carried over to the new embeddings.
    flipped = (Embedding(name=item.name[::-1], vector=item.vector) for item in embset)
    return EmbeddingSet(*flipped)
コード例 #20
0
ファイル: bpemblang.py プロジェクト: Raghibshams456/whatlies
    def __getitem__(self, item):
        """
        Retrieve a single embedding or a set of embeddings. If an embedding contains multiple
        sub-tokens then we'll average them before retrieval.

        Arguments:
            item: single string or list of strings

        Raises:
            ValueError: if `item` is neither a string nor a list

        **Usage**
        ```python
        > lang = BytePairLang(lang="en")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        ```
        """
        if isinstance(item, list):
            # Every element is looked up through this same method.
            return EmbeddingSet(*(self[entry] for entry in item))
        if not isinstance(item, str):
            raise ValueError(f"Item must be list of string got {item}.")
        # Average over the first axis of the embedded result.
        return Embedding(item, self.module.embed(item).mean(axis=0))
コード例 #21
0
def test_embset_creation_warning():
    """Building a set from two embeddings that share a name is expected to raise a ``Warning``."""
    original = Embedding("foo", [0, 1])
    # Deliberately reuses the name "foo" with a different vector.
    duplicate = Embedding("foo", [1, 2])
    with pytest.raises(Warning):
        EmbeddingSet(original, duplicate)
コード例 #22
0
def emb():
    """Build a small EmbeddingSet with the 2-d embeddings "x", "y" and "z"."""
    return EmbeddingSet(
        Embedding("x", [0.0, 1.0]),
        Embedding("y", [1.0, 0.0]),
        Embedding("z", [0.5, 0.5]),
    )
コード例 #23
0
ファイル: _ivis.py プロジェクト: ml-ai-nlp-ir/whatlies
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.tfm`.

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.ivis_<n_components>()"`.
     """
     labels, matrix = embset.to_names_X()
     reduced = self.tfm.transform(matrix)
     rebuilt = new_embedding_dict(labels, reduced, embset)
     result_name = f"{embset.name}.ivis_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #24
0
 def __getitem__(self, query):
     """
     Look up one embedding (string query) or an EmbeddingSet (any other iterable of queries).

     Arguments:
         query: a single string, or an iterable whose items are looked up one by one
     """
     if not isinstance(query, str):
         # Every element is resolved through this same method.
         return EmbeddingSet(*(self[tok] for tok in query))
     return Embedding(query, vector=self.model.encode(query))
コード例 #25
0
def test_add_property():
    """`add_property` with a callable is expected to set the property on every embedding."""
    vec_foo = Embedding("foo", [0.1, 0.3, 0.10])
    vec_bar = Embedding("bar", [0.7, 0.2, 0.11])
    base = EmbeddingSet(vec_foo, vec_bar)
    with_property = base.add_property("prop_a", lambda d: "prop-one")
    assert all(item.prop_a == "prop-one" for item in with_property)
コード例 #26
0
def test_embset_creation_error():
    """Mixing embeddings of different dimensions is expected to raise ``ValueError``."""
    two_dim = Embedding("foo", [0, 1])
    # Three entries instead of two.
    three_dim = Embedding("bar", [1, 1, 2])
    with pytest.raises(ValueError):
        EmbeddingSet(two_dim, three_dim)
コード例 #27
0
 def transform(self, embset):
     """
     Transform the vectors of `embset` with `self.emb` and wrap the result in a new set.

     Arguments:
         embset: the embeddings to transform

     Returns:
         An EmbeddingSet named `"<embset.name>.tsne_<n_components>()"`.
     """
     labels, matrix = embset.to_names_X()
     # The transform output is converted to a NumPy array.
     projected = np.array(self.emb.transform(matrix))
     rebuilt = new_embedding_dict(labels, projected, embset)
     result_name = f"{embset.name}.tsne_{self.n_components}()"
     return EmbeddingSet(rebuilt, name=result_name)
コード例 #28
0
ファイル: app.py プロジェクト: vishnupriyavr/rasalit
# Page title followed by two introductory paragraphs, rendered in this order.
for paragraph in (
    "# Simple Text Clustering",
    "Let's say you've gotten a lot of feedback from clients on different channels. You might like to be able to distill main topics and get an overview. It might even inspire some intents that will be used in a virtual assistant!",
    "This tool will help you discover them. This app will attempt to cluster whatever text you give it. The chart will try to clump text together and you can explore underlying patterns.",
):
    st.markdown(paragraph)

# Build the embeddings with whichever method was selected.
if method == "CountVector SVD":
    lang = CountVectorLanguage(n_svd, ngram_range=(min_ngram, max_ngram))
    embset = lang[texts]
elif method == "Lite Sentence Encoding":
    embset = EmbeddingSet(
        *(
            Embedding(t, v)
            for t, v in zip(texts, calculate_embeddings(texts, encodings=encodings))
        )
    )

# Apply the chosen reduction and render the interactive chart without annotations.
p = (
    embset.transform(reduction)
    .plot_interactive(annot=False)
    .properties(width=500, height=500, title="")
)
st.write(p)

# Closing note that links to the bulk-labelling notebook.
st.markdown(
    "While the tool helps you in discovering clusters, it doesn't do labelling (yet). We do offer a [jupyter notebook](https://github.com/RasaHQ/rasalit/tree/master/notebooks/bulk-labelling) that might help out though."
)
コード例 #29
0
ファイル: normalizer.py プロジェクト: Agrover112/whatlies
 def transform(self, embset: EmbeddingSet) -> EmbeddingSet:
     """
     Normalize the vectors of `embset` and return them under the original set name.

     `normalize` is called with `norm=self.norm`; the axis is 0 when
     `self.feature` is truthy and 1 otherwise.

     Arguments:
         embset: the embeddings to normalize

     Returns:
         An EmbeddingSet with the normalized vectors, named like the input set.
     """
     labels, matrix = embset.to_names_X()
     if self.feature:
         axis = 0
     else:
         axis = 1
     normalized = normalize(matrix, norm=self.norm, axis=axis)
     rebuilt = new_embedding_dict(labels, normalized, embset)
     return EmbeddingSet(rebuilt, name=embset.name)
コード例 #30
0
To do    : None

"""

# %% load libraries
from whatlies import EmbeddingSet
from whatlies.language import SpacyLanguage

# %% load a model of the language 'via' whatlies
lang = SpacyLanguage("en_core_web_lg")

# %% create a list of lexical items
# Goal: see how animals (actors) map onto qualities (attributes).
animals = ["cat", "dog", "mouse"]  # sample animals
qualities = ["responsive", "loyal"]  # sample qualities
items = [*animals, *qualities]  # the full set of lexical items

# %% browse the loaded model of the language, retrieve the vectors
#    and create and initialize an embedding set (a class specific
#    to the library whatlies)
emb = EmbeddingSet(*(lang[item] for item in items))
# Plot every item with the word vectors for 'responsive' and 'loyal'
#   as the x and y axes.
emb.plot_interactive(x_axis=emb["responsive"], y_axis=emb["loyal"])