def test_estimator_checks(test_fn):
    """Run one supplied check callable against a fastText-backed language.

    ``test_fn`` is called with a label and an estimator instance built from
    the custom fastText model stored under ``tests/``.
    """
    # NOTE(review): the label reads "spacy_lang" although the estimator is a
    # FasttextLanguage; it is passed through unchanged here.
    estimator = FasttextLanguage("tests/custom_fasttext_model.bin")
    test_fn("spacy_lang", estimator)
Esempio n. 2
0
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from whatlies.language import (
    FasttextLanguage,
    SpacyLanguage,
    GensimLanguage,
    BytePairLanguage,
    TFHubLanguage,
    HFTransformersLanguage,
)

# Language backends used to parametrize the pipeline test below.
# Every entry is instantiated here, at module level, in the order listed.
# NOTE(review): some entries point at local files under ``tests/`` and others
# at remote model identifiers; presumably the latter need network access or a
# warm cache -- confirm against the CI setup.
backends = [
    SpacyLanguage("en_core_web_sm"),
    FasttextLanguage("tests/custom_fasttext_model.bin"),
    BytePairLanguage("en", vs=1000, dim=25, cache_dir="tests/cache"),
    GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
    HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
    TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
]


@pytest.mark.parametrize("lang", backends)
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])

    X = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
def lang():
    """Return a FasttextLanguage loaded from the custom test model file."""
    model_path = "tests/custom_fasttext_model.bin"
    return FasttextLanguage(model_path)
Esempio n. 4
0
def test_raise_warning():
    """Expect a UserWarning when 1000 items similar to "cat" are requested."""
    with pytest.warns(UserWarning):
        # Construction stays inside the context so that a warning raised by
        # either the constructor or the query is captured.
        language = FasttextLanguage(model1)
        language.score_similar("cat", 1000)
Esempio n. 5
0
def test_retreive_similar_len():
    """Check how many results ``score_similar("cat", n)`` returns.

    Small requests (20 and 10) are expected to return exactly ``n`` results.
    A request of 1000 is expected to return 91 results for both models
    (presumably the number of candidates these test models can offer --
    TODO confirm).
    """
    # (model, requested n, expected result count); a fresh language object is
    # built for every case.
    cases = [
        (model1, 20, 20),
        (model2, 10, 10),
        (model1, 1000, 91),
        (model2, 1000, 91),
    ]
    for model, requested, expected in cases:
        similar = FasttextLanguage(model).score_similar("cat", requested)
        assert len(similar) == expected
Esempio n. 6
0
def test_load_in_model2():
    """The "dog" vector loaded from ``model2`` has 10 entries on axis 0."""
    dog_vector = FasttextLanguage(model2)["dog"].vector
    assert dog_vector.shape[0] == 10
Esempio n. 7
0
def test_load_in_model1():
    """The "dog" vector loaded from ``model1`` has 20 entries on axis 0."""
    embedding = FasttextLanguage(model1)['dog']
    leading_dim = embedding.vector.shape[0]
    assert leading_dim == 20