Esempio n. 1
0
def test_pyss3_ss3():
    """Test SS3.

    Checks the errors raised by a not-yet-trained classifier, then the
    results obtained after training on single words and on word n-grams
    (``n_grams=3``), including n-gram recognition in ``classify`` output.
    """
    clf = SS3(s=.45,
              l=.5,
              p=1,
              a=0,
              name="test",
              cv_m=STR_NORM_GV_XAI,
              sn_m=STR_XAI)
    assert clf.get_name() == "test"

    # before training: category look-ups and predictions are expected to fail
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.get_category_index("a_category")
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.get_category_name(0)
    # these calls must raise, so their return values are deliberately unbound
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict(x_test)
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict_proba(x_test)

    # model whose terms are single words
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741])

    # model whose terms are word n-grams
    clf = SS3(s=.32,
              l=1.24,
              p=1.1,
              a=0,
              name="test-3grams",
              cv_m=STR_NORM_GV_XAI,
              sn_m=STR_XAI)
    clf.fit(x_train, y_train, n_grams=3)

    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105])

    # n-gram recognition: multi-word terms must show up as single lexemes
    pred = clf.classify("android mobile and video games", json=True)
    assert pred["pars"][0]["sents"][0]["words"][0][
        "lexeme"] == "android mobile"
    assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    assert pred["pars"][0]["sents"][0]["words"][-1][
        "lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]
Esempio n. 2
0
def test_multilabel():
    """Check multilabel training, classification and membership matrices."""
    docs_file = path.join(dataset_multilabel_path, "train/docs.txt")
    labels_file = path.join(dataset_multilabel_path, "train/labels.txt")
    docs, labels = Dataset.load_from_files_multilabel(docs_file,
                                                      labels_file,
                                                      sep_label=",",
                                                      sep_doc="\n>>>>>\n")

    classifier = SS3()

    # requesting a membership matrix at this point must raise ValueError
    with pytest.raises(ValueError):
        membership_matrix(classifier, [])

    classifier.fit(docs, labels)

    expected_categories = ['insult', 'obscene', 'severe_toxic', 'toxic']
    assert sorted(classifier.get_categories()) == expected_categories
    assert classifier.classify_multilabel("this is a unknown document!") == []

    label_sets = [
        [],
        ['toxic'],
        ['severe_toxic'],
        ['obscene'],
        ['insult'],
        ['toxic', 'insult'],
    ]
    expected_rows = [
        [0, 0, 0, 0],  # []
        [1, 0, 0, 0],  # ['toxic']
        [0, 1, 0, 0],  # ['severe_toxic']
        [0, 0, 1, 0],  # ['obscene']
        [0, 0, 0, 1],  # ['insult']
        [1, 0, 0, 1],  # ['toxic', 'insult']
    ]
    matrix = membership_matrix(classifier, label_sets)
    assert matrix.todense().tolist() == expected_rows

    # a label that is not one of the categories yields an all-zero row
    extended_sets = label_sets + [["xxx"]]
    matrix = membership_matrix(classifier, extended_sets)
    rows = matrix.todense().tolist()
    assert rows[-1] == [0, 0, 0, 0]
Esempio n. 3
0
def test_evaluation(mocker):
    """Test Evaluation class.

    Walks through the ``Evaluation`` API in order: behavior with no
    classifier assigned, behavior with a not-yet-trained model, and then
    ``test``, ``kfold_cross_validation``, ``grid_search``,
    ``get_best_hyperparameters``, ``plot``, ``remove`` and ``show_best`` on
    a trained model, using both valid and invalid argument values.

    NOTE(review): the counts asserted in the ``remove`` section appear to
    depend on the exact sequence of evaluations performed earlier in this
    function, so statements should not be reordered -- confirm before
    refactoring.
    """
    # replace these callables with mocks so the test does not open a
    # browser tab or a plot window
    mocker.patch("webbrowser.open")
    mocker.patch("matplotlib.pyplot.show")

    # short alias used throughout the k-fold section
    kfold_validation = Evaluation.kfold_cross_validation

    # reset the class-level attributes of Evaluation before starting
    Evaluation.__cache__ = None
    Evaluation.__cache_file__ = None
    Evaluation.__clf__ = None
    Evaluation.__last_eval_tag__ = None
    Evaluation.__last_eval_method__ = None
    Evaluation.__last_eval_def_cat__ = None

    # candidate hyperparameter values passed to grid_search below
    ss = [0, 0.5]
    ll = [0, 1.5]
    pp = [0, 2]
    x_data, y_data = Dataset.load_from_files(DATASET_PATH)

    clf = SS3()
    clf.set_model_path("tests")

    # no classifier assigned case
    Evaluation.clear_cache()
    with pytest.raises(ValueError):
        Evaluation.get_best_hyperparameters()
    with pytest.raises(ValueError):
        Evaluation.remove()
    with pytest.raises(ValueError):
        Evaluation.show_best()
    with pytest.raises(ValueError):
        Evaluation.plot(TMP_FOLDER)

    # Not-yet-trained model case
    Evaluation.set_classifier(clf)
    Evaluation.clear_cache()
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False

    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.test(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        kfold_validation(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.grid_search(clf, x_data, y_data)
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters()

    # default argument values
    clf.train(x_data, y_data)

    # NOTE(review): PY3 is defined outside this function; presumably it is
    # truthy only under Python 3, enabling plotting there -- confirm
    assert Evaluation.test(clf, x_data, y_data, plot=PY3) == 1
    assert Evaluation.test(clf, ['bla bla bla'], ['pos'], plot=PY3) == 0
    assert Evaluation.test(clf, ['bla bla bla', "I love this love movie!"],
                           ['pos', 'pos'],
                           plot=PY3) == 0.5
    assert kfold_validation(clf, x_data, y_data, plot=PY3) > 0
    # with default arguments, grid_search and get_best_hyperparameters are
    # expected to return the classifier's current hyperparameters
    s, l, p, a = clf.get_hyperparameters()
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data)
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    assert s0 == s and l0 == l and p0 == p and a0 == a
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    assert Evaluation.plot(TMP_FOLDER) is True
    # after removing the evaluations, plot() is expected to return False
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False

    # test
    #   OK
    assert Evaluation.test(clf, x_data, y_data, def_cat='unknown',
                           plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, def_cat='neg', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, metric="f1-score",
                           plot=PY3) == 1
    assert Evaluation.test(clf,
                           x_data,
                           y_data,
                           plot=PY3,
                           metric="recall",
                           metric_target="weighted avg") == 1
    assert Evaluation.test(clf,
                           x_data,
                           y_data,
                           plot=PY3,
                           metric="recall",
                           metric_target="neg") == 1
    #   Not OK
    with pytest.raises(InvalidCategoryError):
        Evaluation.test(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf,
                        x_data,
                        y_data,
                        metric="recall",
                        metric_target="xxx",
                        plot=PY3)

    # k-fold
    #   OK
    assert kfold_validation(clf, x_data, y_data, n_grams=3, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, plot=PY3) > 0
    assert kfold_validation(
        clf, x_data, y_data, k=10, def_cat='unknown', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='neg',
                            plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, metric="f1-score",
                            plot=PY3) > 0
    assert kfold_validation(clf,
                            x_data,
                            y_data,
                            plot=PY3,
                            metric="recall",
                            metric_target="weighted avg") > 0
    assert kfold_validation(
        clf, x_data, y_data, plot=PY3, metric="recall",
        metric_target="neg") > 0
    #   Not OK
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=None, plot=PY3)
    with pytest.raises(InvalidCategoryError):
        kfold_validation(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf,
                         x_data,
                         y_data,
                         metric="recall",
                         metric_target="xxx",
                         plot=PY3)

    # grid_search
    #   OK
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, s=ss)
    s1, l1, p1, a1 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            s=ss,
                                            l=ll,
                                            p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    # this result is overwritten by the next call before being compared,
    # so the k_fold=4 search only has to complete without raising
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, k_fold=4)
    s0, l0, p0, a0 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            def_cat='unknown',
                                            p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            def_cat='neg',
                                            p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            metric="f1-score",
                                            p=pp)
    # the "weighted avg" result is overwritten by the "neg" one below; only
    # the latter takes part in the comparison that follows
    s1, l1, p1, a1 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            p=pp,
                                            metric="recall",
                                            metric_target="weighted avg")
    s1, l1, p1, a1 = Evaluation.grid_search(clf,
                                            x_data,
                                            y_data,
                                            p=pp,
                                            metric="recall",
                                            metric_target="neg")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    #   Not OK
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s='asd')
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold="xxx")
    with pytest.raises(InvalidCategoryError):
        Evaluation.grid_search(clf, x_data, y_data, def_cat='xxx')
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data, metric="xxx")
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf,
                               x_data,
                               y_data,
                               metric="recall",
                               metric_target="xxx")

    # get_best_hyperparameters
    # s1..a1 are rebound by each call; only the values from the last call
    # (and s2..a2 from the "recall" call) are compared against s0..a0
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(
        "recall", "weighted avg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "pos")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold",
                                                         def_cat="neg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold",
                                                         def_cat="unknown")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2

    # Not OK
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("xxx")
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("recall", "xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(def_cat="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="4-fold", def_cat="unknown")

    # plot OK
    assert Evaluation.plot(TMP_FOLDER) is True

    # remove
    #   OK
    # only the first element of remove()'s return value is checked;
    # NOTE(review): presumably the number of removed evaluations -- confirm
    assert Evaluation.remove(s, l, p, a)[0] == 10
    assert Evaluation.remove(def_cat="neg")[0] == 2
    assert Evaluation.remove(method="test")[0] == 12
    assert Evaluation.remove(s=-10)[0] == 0
    assert Evaluation.remove(def_cat="xxx")[0] == 0
    assert Evaluation.remove(method="xxx")[0] == 0
    assert Evaluation.remove()[0] == 1
    assert Evaluation.plot(TMP_FOLDER) is False  # plot not OK (no evaluations)
    #   not OK
    with pytest.raises(TypeError):
        Evaluation.remove("xxx")
    with pytest.raises(TypeError):
        Evaluation.remove(clf)

    # show_best with different filters; these calls only have to complete
    # without raising
    Evaluation.show_best()
    Evaluation.show_best(method="test")
    Evaluation.show_best(def_cat="unknown")
    Evaluation.show_best(metric="f1-score")
    Evaluation.show_best(metric="f1-score", avg="weighted avg")

    # different tag
    # NOTE(review): no "different tag" checks follow this heading

    # clean up the folder created under ./tests during this test
    rmtree("./tests/ss3_models", ignore_errors=True)
Esempio n. 4
0
def test_pyss3_ss3(mockers):
    """Exercise the SS3 classifier end to end.

    Covers the "cold start" behavior of an untrained model, training with
    several ``cv_m``/``sn_m`` combinations, n-gram models, insight
    extraction, the informational print helpers and model/vocabulary
    persistence.

    ``mockers`` is not referenced in the body; presumably it is a fixture
    defined elsewhere and requested for its side effects.
    """
    hyperparams = dict(s=.45, l=.5, p=1, a=0)
    norm_gv_values = (.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741)
    gv_values = (.00062, .00109, 0, 0, 0, .00014, .01894, 6.31228)

    clf = SS3(name="test",
              cv_m=STR_NORM_GV_XAI,
              sn_m=STR_XAI,
              **hyperparams)

    # "cold start" tests
    assert clf.get_name() == "test"
    assert clf.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    for category_index in (0, -1):
        assert clf.get_category_name(category_index) == STR_UNKNOWN_CATEGORY

    for untrained_call in (clf.predict, clf.predict_proba):
        with pytest.raises(pyss3.EmptyModelError):
            untrained_call(x_test)

    # single-word model with cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI
    clf.fit(x_train, y_train)
    perform_tests_with(clf, list(norm_gv_values))

    # remaining single-word models, one per (cv_m, sn_m) combination
    variants = (
        ("test-norm-gv-sn-xai", STR_NORM_GV, STR_XAI, norm_gv_values, {}),
        ("test-gv-sn-xai", STR_GV, STR_XAI, gv_values, {}),
        ("test-norm-gv-xai-sn-vanilla", STR_NORM_GV_XAI, STR_VANILLA,
         norm_gv_values, {"stopwords": False}),
    )
    for model_name, cv_mode, sn_mode, expected, extra_kwargs in variants:
        clf = SS3(name=model_name,
                  cv_m=cv_mode,
                  sn_m=sn_mode,
                  **hyperparams)
        clf.fit(x_train, y_train)
        perform_tests_with(clf, list(expected), **extra_kwargs)

    # model whose terms are word n-grams
    clf = SS3(name="test-3grams", cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)
    clf.fit(x_train, y_train, n_grams=3)

    # update_values after each hyperparameter change (and once more)
    clf.set_l(.3)
    clf.update_values()
    clf.set_p(.2)
    clf.update_values()
    clf.set_hyperparameters(s=.32, l=1.24, p=1.1, a=0)
    clf.update_values()
    clf.update_values()

    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105])

    # n-gram recognition tests
    pred = clf.classify("android mobile and video games", json=True)
    words = pred["pars"][0]["sents"][0]["words"]
    assert words[0]["lexeme"] == "android mobile "
    assert words[-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    rounded_cv = [round(value, 5) for value in pred["cv"]]
    assert rounded_cv == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    words = pred["pars"][0]["sents"][0]["words"]
    assert words[-1]["lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    rounded_cv = [round(value, 5) for value in pred["cv"]]
    assert rounded_cv == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # extract_insight
    insight = clf.extract_insight(doc_insight)
    assert len(insight) == 1
    assert insight[0][
        0] == 'text is about sports. Football soccer, you know!'

    insight = clf.extract_insight(doc_insight, window_size=1)
    assert len(insight) == 1
    assert insight[0] == ('about sports. Football soccer, you ',
                          2.8670788645841605)

    insight = clf.extract_insight(doc_insight, window_size=0)
    assert insight == [('Football soccer, ', 1.8670788645841605),
                       ('sports', 1.0)]

    assert clf.extract_insight(doc_insight, cat="music") == []
    assert len(clf.extract_insight(doc_insight, min_cv=1)) == 1

    insight = clf.extract_insight(doc_insight, level="sentence")
    assert len(insight) == 2
    assert insight[0][0] == ' Football soccer, you know!'

    insight = clf.extract_insight(doc_insight, level="sentence", sort=False)
    assert len(insight) == 2
    assert insight[0][0] == 'Dude, this text is about sports'

    insight = clf.extract_insight(doc_insight, level="paragraph", min_cv=-1)
    assert len(insight) == 3
    assert insight[2][0] == "2nd paragraph."

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.extract_insight(doc_insight, cat=STR_UNKNOWN_CATEGORY)
    with pytest.raises(ValueError):
        clf.extract_insight(doc_insight, level="invalid")

    # prints
    clf.print_model_info()
    clf.print_hyperparameters_info()
    clf.print_categories_info()
    clf.print_ngram_info("video games")

    # plot_value_distribution (Python 3 only)
    running_python3 = sys.version_info[0] >= 3
    if running_python3:
        clf.plot_value_distribution(y_test[0])

    # load and save model tests
    clf.set_model_path("tests/")
    clf.save_model()
    clf.load_model()

    # get_next_words
    next_words = clf.get_next_words("android", "science&technology")
    assert next_words[0][0] == "mobile"

    clf = SS3(name="test-3grams")

    with pytest.raises((OSError, IOError)):
        clf.set_model_path("dummy")
        clf.load_model()

    clf.set_model_path("./tests")
    clf.load_model()

    clf.set_model_path("tests/tmp")
    clf.save_model()
    clf.save_model()
    clf.load_model()

    clf.save_model("tests/")
    clf.load_model()

    clf = SS3(name="test-3grams")
    clf.load_model("./tests/")

    clf.save_model("./tests/tmp/")
    clf.save_model()
    clf.load_model()

    # save_vocab
    clf.save_vocab("tests/tmp")

    # save_cat_vocab
    clf.save_cat_vocab("sports", "tests/tmp")
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_cat_vocab(STR_UNKNOWN_CATEGORY, "tests/tmp")

    # remove the folders written during the test
    for leftover in ("./tests/tmp", "./tests/ss3_models"):
        rmtree(leftover, ignore_errors=True)
Esempio n. 5
0
---
"""

# Before we begin, let's import needed modules...
from pyss3 import SS3
from pyss3.util import Dataset
from pyss3.server import Server

from sklearn.metrics import accuracy_score
from os import system

# ... and unzip the "topic.zip" dataset inside the `datasets` folder.
system('unzip -u datasets/topic.zip -d datasets/')

# Ok, now we are ready to begin. Let's create a new SS3 instance
clf = SS3()

# What are the default hyperparameter values? Let's see
# (the fourth value returned by `get_hyperparameters` is not used here)
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files` function
# from `pyss3.util`. Since, in this dataset, there's a single file for each category,
# we will use the argument ``folder_label=False`` to tell PySS3 to use each file as a different
# category and each line inside of it as a different document:
x_train, y_train = Dataset.load_from_files("datasets/topic/train",
                                           folder_label=False)
x_test, y_test = Dataset.load_from_files("datasets/topic/test",
Esempio n. 6
0
# Module-level setup: dataset locations, a trained classifier and the
# Live_Test object it is registered with.
DATASET_FOLDER_MR = "dataset_mr"
ADDRESS, PORT = "localhost", None
# short alias for the Live_Test object of the module imported as `s`
LT = s.Live_Test

# dataset folders are resolved relative to this file's directory
dataset_path = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER)
dataset_path_mr = path.join(path.abspath(path.dirname(__file__)),
                            DATASET_FOLDER_MR)

# placeholders; all three names are rebound below
x_train, y_train = None, None
clf = None

pyss3.set_verbosity(0)

# The result of the first load is overwritten by the second one.
# NOTE(review): presumably the first call is kept only to exercise loading
# with the default `folder_label` -- confirm
x_train, y_train = Dataset.load_from_files(dataset_path_mr)
x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=False)
clf = SS3()

clf.fit(x_train, y_train)

# serve() is deliberately called before any model has been set
LT.serve()  # no model error
LT.set_model(clf)
LT.get_port()


class MockCmdLineArgs:
    """Mocked command-line arguments.

    Plain attribute holder with fixed values.
    NOTE(review): presumably passed to the code under test in place of
    real parsed command-line arguments -- confirm against its users.
    """

    quiet = True
    MODEL = "name"
    # reuses the module-level dataset location
    path = dataset_path
    label = 'folder'
Esempio n. 7
0
---
"""

# Before we begin, let's import needed modules...
from pyss3 import SS3
from pyss3.util import Dataset
from pyss3.server import Server

from sklearn.metrics import accuracy_score
from os import system

# ... and unzip the "movie_review.zip" dataset inside the `datasets` folder.
system('unzip -u datasets/movie_review.zip -d datasets/')

# Ok, now we are ready to begin. Let's create a new SS3 instance.
clf = SS3()

# What are the default hyper-parameter values? Let's see
# (the fourth value returned by `get_hyperparameters` is not used here)
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files`
# function from `pyss3.util` as follows:
x_train, y_train = Dataset.load_from_files("datasets/movie_review/train")
x_test, y_test = Dataset.load_from_files("datasets/movie_review/test")

# Let's train our model...
clf.fit(x_train, y_train)
Esempio n. 8
0
def test_pyss3_ss3(mockers):
    """Test SS3.

    Covers, in order: constructor validation, integer labels, training with
    a single category, "cold start" behavior of an untrained model,
    training with several ``cv_m``/``sg_m`` combinations, word cloud
    argument validation, n-gram models, insight extraction, the
    informational print helpers and model/vocabulary persistence.

    ``mockers`` is not referenced in the body; presumably it is a fixture
    defined elsewhere and requested for its side effects.
    """
    # the constructor must raise, so its result is deliberately not bound
    with pytest.raises(ValueError):
        SS3("hyperparameter")

    # Using integer labels
    test_x = ["this is the first document"
              ] * 5 + ["this is the second document"] * 5
    test_y = [0] * 5 + [1] * 5
    clf = SS3()
    clf.train(test_x, test_y)
    assert clf.classify_label("this is the first document") == 0
    assert clf.classify_label("this is the second document") == 1

    # training only with one category (must complete without raising)
    clf = SS3()
    clf.train(["this is the first document"], ["first"])

    # training different cases
    clf = SS3(s=.45, l=.5, p=1, a=0, cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI)
    clf.set_name("test")

    # "cold start" tests
    assert clf.get_name() == "test"
    assert clf.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    assert clf.get_category_name(0) == STR_UNKNOWN_CATEGORY
    assert clf.get_category_name(-1) == STR_UNKNOWN_CATEGORY
    assert clf.get_ngrams_length() == 0

    with pytest.raises(pyss3.EmptyModelError):
        clf.classify(x_test[0])
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict(x_test)
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict_proba(x_test)
    with pytest.raises(ValueError):
        clf.train(x_train, [])
    with pytest.raises(ValueError):
        clf.train([], [])

    # train and predict/classify tests (model: terms are single words)
    # cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI
    clf.fit(x_train, y_train)

    assert clf.get_ngrams_length() == 1

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969])
    perform_tests_on(clf.cv, 0.4307)
    perform_tests_on(clf.gv, 0.2148)
    perform_tests_on(clf.lv, 0.2148)
    perform_tests_on(clf.sg, 1)
    perform_tests_on(clf.sn, 1)
    # a 2-gram is expected to get the value 0 from a single-word model
    perform_tests_on(clf.cv, 0, "video games", "science&technology")
    perform_tests_on(clf.gv, 0, "video games", "science&technology")

    # word cloud tests
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_wordcloud("xxx")
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", top_n=0)
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", n_grams=0)
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", size=0)

    if PYTHON3:
        clf.save_wordcloud("science&technology", n_grams=3)  # empty cloud
        clf.save_wordcloud("science&technology", plot=True)

    # cv_m=STR_NORM_GV, sg_m=STR_XAI
    clf = SS3(s=.45,
              l=.5,
              p=1,
              a=0,
              name="test-norm-gv-sn-xai",
              cv_m=STR_NORM_GV,
              sg_m=STR_XAI)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969])
    perform_tests_on(clf.cv, 0.4307)

    # cv_m=STR_GV, sg_m=STR_XAI
    clf = SS3(s=.45,
              l=.5,
              p=1,
              a=0,
              name="test-gv-sn-xai",
              cv_m=STR_GV,
              sg_m=STR_XAI)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00062, .00109, 0, 0, 0, .00014, .01878, 6.31605])
    # with cv_m=STR_GV the confidence value must equal the global value
    assert clf.cv("chicken", "food") == clf.gv("chicken", "food")

    # cv_m=STR_NORM_GV_XAI, sg_m=STR_VANILLA
    clf = SS3(s=.45,
              l=.5,
              p=1,
              a=0,
              name="test-norm-gv-xai-sn-vanilla",
              cv_m=STR_NORM_GV_XAI,
              sg_m=STR_VANILLA)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969],
                       stopwords=False)

    # train and predict/classify tests (model: terms are word n-grams)
    clf = SS3(name="test-3grams", cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI)

    clf.fit(x_train, y_train, n_grams=3)

    assert clf.get_ngrams_length() == 3

    # update_values after each hyperparameter change; the final repeated
    # call runs it again with no change in between
    clf.set_l(.3)
    clf.update_values()
    clf.set_p(.2)
    clf.update_values()
    clf.set_hyperparameters(s=.32, l=1.24, p=1.1, a=0)
    clf.update_values()
    clf.update_values()

    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.17139])
    perform_tests_on(clf.cv, 1.5681, "video games", "science&technology")
    perform_tests_on(clf.gv, 0.6697, "video games", "science&technology")
    perform_tests_on(clf.lv, 0.6697, "video games", "science&technology")
    perform_tests_on(clf.sg, 1, "video games", "science&technology")
    perform_tests_on(clf.sn, 1, "video games", "science&technology")

    # n-gram recognition tests
    pred = clf.classify("android mobile and video games", json=True)
    assert pred["pars"][0]["sents"][0]["words"][0][
        "lexeme"] == "android mobile"
    assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 4.38069, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    assert pred["pars"][0]["sents"][0]["words"][-1][
        "lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # extract_insight
    t = clf.extract_insight(doc_insight)
    assert len(
        t) == 1 and t[0][0] == ' about sports. Football soccer, you know!'
    t = clf.extract_insight(doc_insight, window_size=1)
    assert len(t) == 1 and t[0] == (' sports. Football soccer, you know',
                                    2.8670788645841605)
    t = clf.extract_insight(doc_insight, window_size=0)
    assert t == [('Football soccer', 1.8670788645841605), ('sports', 1.0)]
    assert clf.extract_insight(doc_insight, cat="music") == []
    assert len(clf.extract_insight(doc_insight, min_cv=1)) == 1
    t = clf.extract_insight(doc_insight, level="sentence")
    assert len(t) == 2 and t[0][0] == ' Football soccer, you know!'
    t = clf.extract_insight(doc_insight, level="sentence", sort=False)
    assert len(t) == 2 and t[0][0] == 'Dude, this text is about sports'
    t = clf.extract_insight(doc_insight, level="paragraph", min_cv=-1)
    assert len(t) == 2 + 1 and t[2][0] == "2nd paragraph."

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.extract_insight(doc_insight, cat=STR_UNKNOWN_CATEGORY)
    with pytest.raises(ValueError):
        clf.extract_insight(doc_insight, level="invalid")

    # prints (these calls only have to complete without raising)
    clf.print_model_info()
    clf.print_hyperparameters_info()
    clf.print_categories_info()
    clf.print_ngram_info("video games")

    # plot_value_distribution
    if sys.version_info[0] >= 3:
        clf.plot_value_distribution(y_test[0])

    # load and save model tests
    clf.set_model_path("tests/")
    clf.save_model()
    clf.load_model()

    # get_next_words
    assert clf.get_next_words("android",
                              "science&technology")[0][0] == "mobile"

    clf = SS3(name="test-3grams")

    # loading from a path other than the one the model was saved to must fail
    with pytest.raises((OSError, IOError)):
        clf.set_model_path("dummy")
        clf.load_model()

    clf.set_model_path("./tests")
    clf.load_model()

    clf.set_model_path("tests/tmp")
    clf.save_model()
    clf.save_model()
    clf.load_model()

    clf.save_model("tests/")
    clf.load_model()

    clf = SS3(name="test-3grams")
    clf.load_model("./tests/")

    clf.save_model("./tests/tmp/")
    clf.save_model()
    clf.load_model()

    # save_vocab
    clf.save_vocab("tests/tmp")

    # save_cat_vocab
    clf.save_cat_vocab("sports", "tests/tmp")
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_cat_vocab(STR_UNKNOWN_CATEGORY, "tests/tmp")

    # remove the folders written during the test
    rmtree("./tests/tmp", ignore_errors=True)
    rmtree("./tests/ss3_models", ignore_errors=True)
Esempio n. 9
0
def test_pyss3_ss3():
    """Run the basic SS3 workflow.

    Covers "cold start" behavior, training on single words and on word
    n-grams, n-gram recognition, and saving/loading the model.
    """
    model_modes = dict(cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)

    clf = SS3(s=.45, l=.5, p=1, a=0, name="test", **model_modes)

    # "cold start" tests
    assert clf.get_name() == "test"
    assert clf.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    for position in (0, -1):
        assert clf.get_category_name(position) == STR_UNKNOWN_CATEGORY

    for needs_trained_model in (clf.predict, clf.predict_proba):
        with pytest.raises(pyss3.EmptyModelError):
            needs_trained_model(x_test)

    # train and predict/classify tests (model: terms are single words)
    clf.fit(x_train, y_train)
    single_word_expected = [.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741]
    perform_tests_with(clf, single_word_expected)

    # train and predict/classify tests (model: terms are word n-grams)
    clf = SS3(s=.32, l=1.24, p=1.1, a=0, name="test-3grams", **model_modes)
    clf.fit(x_train, y_train, n_grams=3)
    ngram_expected = [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105]
    perform_tests_with(clf, ngram_expected)

    # n-gram recognition tests
    result = clf.classify("android mobile and video games", json=True)
    lexemes = result["pars"][0]["sents"][0]["words"]
    assert lexemes[0]["lexeme"] == "android mobile"
    assert lexemes[-1]["lexeme"] == "video games"
    assert argmax(result["cv"]) == clf.get_category_index("science&technology")
    confidence = [round(entry, 5) for entry in result["cv"]]
    assert confidence == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    result = clf.classify("playing football soccer", json=True)
    lexemes = result["pars"][0]["sents"][0]["words"]
    assert lexemes[-1]["lexeme"] == "football soccer"
    assert argmax(result["cv"]) == clf.get_category_index("sports")
    confidence = [round(entry, 5) for entry in result["cv"]]
    assert confidence == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # load and save model tests
    clf.set_model_path("tests/")
    clf.save_model()
    clf.load_model()

    clf = SS3(name="test-3grams")

    with pytest.raises((OSError, IOError)):
        clf.set_model_path("dummy")
        clf.load_model()

    clf.set_model_path("./tests")
    clf.load_model()

    clf.set_model_path("tests/tmp")
    clf.save_model()
    clf.save_model()
    clf.load_model()

    clf.save_model("tests/")
    clf.load_model()

    clf = SS3(name="test-3grams")
    clf.load_model("./tests/")

    clf.save_model("./tests/tmp/")
    clf.save_model()
    clf.load_model()

    # remove the folders written during the test
    for written_folder in ("./tests/tmp", "./tests/ss3_models"):
        rmtree(written_folder, ignore_errors=True)