def test_pyss3_ss3():
    """Test SS3.

    Checks the errors raised while the classifier has not been trained,
    then trains a single-word model and a 3-gram model and verifies their
    predictions and the n-grams recognised during classification.
    """
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test",
              cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)

    # "cold start": nothing has been learned yet
    assert clf.get_name() == "test"

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.get_category_index("a_category")

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.get_category_name(0)

    # Only the raised error matters here, so the results are not bound to
    # a name (the call raises before any assignment could happen).
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict(x_test)

    with pytest.raises(pyss3.EmptyModelError):
        clf.predict_proba(x_test)

    # model whose terms are single words
    clf.fit(x_train, y_train)
    perform_tests_with(clf, [.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741])

    # model whose terms are word n-grams (up to 3 words)
    clf = SS3(s=.32, l=1.24, p=1.1, a=0, name="test-3grams",
              cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)
    clf.fit(x_train, y_train, n_grams=3)
    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105])

    # n-gram recognition: multi-word terms must show up as a single lexeme
    pred = clf.classify("android mobile and video games", json=True)
    assert pred["pars"][0]["sents"][0]["words"][0][
        "lexeme"] == "android mobile"
    assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    assert pred["pars"][0]["sents"][0]["words"][-1][
        "lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]
def test_multilabel():
    """Check multi-label training, classification and membership matrices."""
    docs_file = path.join(dataset_multilabel_path, "train/docs.txt")
    labels_file = path.join(dataset_multilabel_path, "train/labels.txt")
    docs, labels = Dataset.load_from_files_multilabel(
        docs_file, labels_file, sep_label=",", sep_doc="\n>>>>>\n"
    )

    model = SS3()

    # called before any training has happened: a ValueError is expected
    with pytest.raises(ValueError):
        membership_matrix(model, [])

    model.fit(docs, labels)

    expected_categories = ['insult', 'obscene', 'severe_toxic', 'toxic']
    assert sorted(model.get_categories()) == expected_categories
    assert model.classify_multilabel("this is a unknown document!") == []

    label_sets = [
        [],
        ['toxic'],
        ['severe_toxic'],
        ['obscene'],
        ['insult'],
        ['toxic', 'insult'],
    ]
    # one expected row per entry of ``label_sets``, in the same order
    expected_rows = [
        [0, 0, 0, 0],  # []
        [1, 0, 0, 0],  # ['toxic']
        [0, 1, 0, 0],  # ['severe_toxic']
        [0, 0, 1, 0],  # ['obscene']
        [0, 0, 0, 1],  # ['insult']
        [1, 0, 0, 1],  # ['toxic', 'insult']
    ]
    rows = membership_matrix(model, label_sets).todense().tolist()
    assert rows == expected_rows

    # a label that is not a known category yields an all-zero row
    rows = membership_matrix(model, label_sets + [["xxx"]]).todense().tolist()
    assert rows[-1] == [0, 0, 0, 0]
def test_evaluation(mocker):
    """Test Evaluation class.

    Walks through the ``Evaluation`` API in a fixed order: calls made with
    no classifier assigned, calls made with a not-yet-trained classifier,
    and then ``test``, ``kfold_cross_validation``, ``grid_search``,
    ``get_best_hyperparameters``, ``plot``, ``remove`` and ``show_best``
    with a trained one.

    NOTE(review): the counts asserted in the "remove" section presumably
    reflect the evaluations accumulated by the earlier sections, so the
    statement order should not be changed without re-deriving them.

    :param mocker: fixture providing ``patch`` (pytest-mock style); used to
                   replace ``webbrowser.open`` and ``matplotlib.pyplot.show``.
    """
    # presumably prevents a browser/plot window from opening during the run
    mocker.patch("webbrowser.open")
    mocker.patch("matplotlib.pyplot.show")

    # short alias used throughout the k-fold section
    kfold_validation = Evaluation.kfold_cross_validation

    # reset the class-level state of Evaluation before starting
    Evaluation.__cache__ = None
    Evaluation.__cache_file__ = None
    Evaluation.__clf__ = None
    Evaluation.__last_eval_tag__ = None
    Evaluation.__last_eval_method__ = None
    Evaluation.__last_eval_def_cat__ = None

    # candidate hyperparameter values passed to grid_search below
    ss = [0, 0.5]
    ll = [0, 1.5]
    pp = [0, 2]

    x_data, y_data = Dataset.load_from_files(DATASET_PATH)

    clf = SS3()
    clf.set_model_path("tests")

    # no classifier assigned case
    Evaluation.clear_cache()
    with pytest.raises(ValueError):
        Evaluation.get_best_hyperparameters()
    with pytest.raises(ValueError):
        Evaluation.remove()
    with pytest.raises(ValueError):
        Evaluation.show_best()
    with pytest.raises(ValueError):
        Evaluation.plot(TMP_FOLDER)

    # Not-yet-trained model case
    Evaluation.set_classifier(clf)
    Evaluation.clear_cache()
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.test(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        kfold_validation(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.grid_search(clf, x_data, y_data)
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters()

    # default argument values
    clf.train(x_data, y_data)

    assert Evaluation.test(clf, x_data, y_data, plot=PY3) == 1
    assert Evaluation.test(clf, ['bla bla bla'], ['pos'], plot=PY3) == 0
    assert Evaluation.test(clf,
                           ['bla bla bla', "I love this love movie!"],
                           ['pos', 'pos'],
                           plot=PY3) == 0.5
    assert kfold_validation(clf, x_data, y_data, plot=PY3) > 0
    # (s, l, p, a) are kept until the "remove" section at the end
    s, l, p, a = clf.get_hyperparameters()
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data)
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    assert s0 == s and l0 == l and p0 == p and a0 == a
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    assert Evaluation.plot(TMP_FOLDER) is True
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False

    # test
    # OK
    assert Evaluation.test(clf, x_data, y_data, def_cat='unknown', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, def_cat='neg', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, metric="f1-score", plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="weighted avg") == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="neg") == 1
    # Not OK
    with pytest.raises(InvalidCategoryError):
        Evaluation.test(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data, metric="recall", metric_target="xxx", plot=PY3)

    # k-fold
    # OK
    assert kfold_validation(clf, x_data, y_data, n_grams=3, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, plot=PY3) > 0
    assert kfold_validation(
        clf, x_data, y_data, k=10, def_cat='unknown', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='neg', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, metric="f1-score", plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, plot=PY3,
                            metric="recall", metric_target="weighted avg") > 0
    assert kfold_validation(
        clf, x_data, y_data, plot=PY3, metric="recall", metric_target="neg") > 0
    # Not OK
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=None, plot=PY3)
    with pytest.raises(InvalidCategoryError):
        kfold_validation(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data, metric="recall", metric_target="xxx", plot=PY3)

    # grid_search
    # OK
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, s=ss)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, s=ss, l=ll, p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, k_fold=4)
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, def_cat='unknown', p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, def_cat='neg', p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, metric="f1-score", p=pp)
    # the "weighted avg" result is overwritten by the next call; only the
    # "neg" result is compared against (s0, l0, p0, a0)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="weighted avg")
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="neg")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    # Not OK
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s='asd')
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold="xxx")
    with pytest.raises(InvalidCategoryError):
        Evaluation.grid_search(clf, x_data, y_data, def_cat='xxx')
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data, metric="xxx")
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data, metric="recall", metric_target="xxx")

    # get_best_hyperparameters
    # (s1, l1, p1, a1) is rebound by each call; only the value returned by
    # the last one takes part in the comparison below
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(
        "recall", "weighted avg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "pos")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="neg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="unknown")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    # Not OK
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("xxx")
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("recall", "xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(def_cat="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="4-fold", def_cat="unknown")

    # plot OK
    assert Evaluation.plot(TMP_FOLDER) is True

    # remove
    # OK
    # element [0] of the returned value is compared with a fixed count
    # (presumably the number of evaluations removed -- TODO confirm)
    assert Evaluation.remove(s, l, p, a)[0] == 10
    assert Evaluation.remove(def_cat="neg")[0] == 2
    assert Evaluation.remove(method="test")[0] == 12
    assert Evaluation.remove(s=-10)[0] == 0
    assert Evaluation.remove(def_cat="xxx")[0] == 0
    assert Evaluation.remove(method="xxx")[0] == 0
    assert Evaluation.remove()[0] == 1
    assert Evaluation.plot(TMP_FOLDER) is False  # plot not OK (no evaluations)
    # not OK
    with pytest.raises(TypeError):
        Evaluation.remove("xxx")
    with pytest.raises(TypeError):
        Evaluation.remove(clf)

    Evaluation.show_best()
    Evaluation.show_best(method="test")
    Evaluation.show_best(def_cat="unknown")
    Evaluation.show_best(metric="f1-score")
    Evaluation.show_best(metric="f1-score", avg="weighted avg")

    # different tag
    # NOTE(review): no "different tag" checks follow this heading here.

    # clean up the model folder created under the "tests" model path
    rmtree("./tests/ss3_models", ignore_errors=True)
def test_pyss3_ss3(mockers):
    """Test SS3.

    Covers, in order: the untrained ("cold start") behaviour, training and
    prediction with each ``cv_m``/``sn_m`` combination, a 3-gram model whose
    hyperparameters are changed after training, n-gram recognition,
    ``extract_insight``, the print helpers, saving/loading the model from
    several paths and exporting the vocabularies.

    :param mockers: fixture defined outside this block; it is not referenced
                    in the body (presumably requested only for the patches
                    it installs -- TODO confirm).
    """
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test",
              cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)

    # "cold start" tests
    assert clf.get_name() == "test"
    assert clf.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    assert clf.get_category_name(0) == STR_UNKNOWN_CATEGORY
    assert clf.get_category_name(-1) == STR_UNKNOWN_CATEGORY

    with pytest.raises(pyss3.EmptyModelError):
        clf.predict(x_test)
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict_proba(x_test)

    # train and predict/classify tests (model: terms are single words)
    # cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI
    clf.fit(x_train, y_train)
    perform_tests_with(clf, [.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741])

    # cv_m=STR_NORM_GV, sn_m=STR_XAI
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-norm-gv-sn-xai",
              cv_m=STR_NORM_GV, sn_m=STR_XAI)
    clf.fit(x_train, y_train)
    perform_tests_with(clf, [0.00114, 0.00295, 0, 0, 0, 0.00016, 0.01894, 8.47741])

    # cv_m=STR_GV, sn_m=STR_XAI
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-gv-sn-xai",
              cv_m=STR_GV, sn_m=STR_XAI)
    clf.fit(x_train, y_train)
    perform_tests_with(clf, [0.00062, 0.00109, 0, 0, 0, 0.00014, 0.01894, 6.31228])

    # cv_m=STR_NORM_GV_XAI, sn_m=STR_VANILLA
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-norm-gv-xai-sn-vanilla",
              cv_m=STR_NORM_GV_XAI, sn_m=STR_VANILLA)
    clf.fit(x_train, y_train)
    perform_tests_with(clf, [0.00114, 0.00295, 0, 0, 0, 0.00016, 0.01894, 8.47741],
                       stopwords=False)

    # train and predict/classify tests (model: terms are word n-grams)
    clf = SS3(name="test-3grams", cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI)
    clf.fit(x_train, y_train, n_grams=3)

    # update_values
    # hyperparameters are changed after training; update_values() is also
    # called twice in a row with no change in between
    clf.set_l(.3)
    clf.update_values()
    clf.set_p(.2)
    clf.update_values()
    clf.set_hyperparameters(s=.32, l=1.24, p=1.1, a=0)
    clf.update_values()
    clf.update_values()
    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105])

    # n-gram recognition tests
    pred = clf.classify("android mobile and video games", json=True)
    # NOTE(review): this expected lexeme ends with a space, unlike the
    # last-word lexeme checked on the next line -- confirm it is intended.
    assert pred["pars"][0]["sents"][0]["words"][0][
        "lexeme"] == "android mobile "
    assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    assert pred["pars"][0]["sents"][0]["words"][-1][
        "lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # extract_insight
    # each returned item is indexed as (text fragment, value) below
    t = clf.extract_insight(doc_insight)
    assert len(t) == 1 and t[0][
        0] == 'text is about sports. Football soccer, you know!'
    t = clf.extract_insight(doc_insight, window_size=1)
    assert len(t) == 1 and t[0] == ('about sports. Football soccer, you ',
                                    2.8670788645841605)
    t = clf.extract_insight(doc_insight, window_size=0)
    assert t == [('Football soccer, ', 1.8670788645841605), ('sports', 1.0)]
    assert clf.extract_insight(doc_insight, cat="music") == []
    assert len(clf.extract_insight(doc_insight, min_cv=1)) == 1
    t = clf.extract_insight(doc_insight, level="sentence")
    assert len(t) == 2 and t[0][0] == ' Football soccer, you know!'
    t = clf.extract_insight(doc_insight, level="sentence", sort=False)
    assert len(t) == 2 and t[0][0] == 'Dude, this text is about sports'
    t = clf.extract_insight(doc_insight, level="paragraph", min_cv=-1)
    assert len(t) == 2 + 1 and t[2][0] == "2nd paragraph."

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.extract_insight(doc_insight, cat=STR_UNKNOWN_CATEGORY)
    with pytest.raises(ValueError):
        clf.extract_insight(doc_insight, level="invalid")

    # prints
    # only checked for not raising; the output is not inspected
    clf.print_model_info()
    clf.print_hyperparameters_info()
    clf.print_categories_info()
    clf.print_ngram_info("video games")

    # plot_value_distribution
    if sys.version_info[0] >= 3:
        clf.plot_value_distribution(y_test[0])

    # load and save model tests
    clf.set_model_path("tests/")
    clf.save_model()
    clf.load_model()

    # get_next_words
    assert clf.get_next_words("android", "science&technology")[0][0] == "mobile"

    clf = SS3(name="test-3grams")

    # loading from a path that holds no saved model must fail
    with pytest.raises((OSError, IOError)):
        clf.set_model_path("dummy")
        clf.load_model()

    clf.set_model_path("./tests")
    clf.load_model()

    clf.set_model_path("tests/tmp")
    clf.save_model()
    clf.save_model()
    clf.load_model()

    clf.save_model("tests/")
    clf.load_model()

    clf = SS3(name="test-3grams")
    clf.load_model("./tests/")

    clf.save_model("./tests/tmp/")
    clf.save_model()
    clf.load_model()

    # save_vocab
    clf.save_vocab("tests/tmp")

    # save_cat_vocab
    clf.save_cat_vocab("sports", "tests/tmp")

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_cat_vocab(STR_UNKNOWN_CATEGORY, "tests/tmp")

    # remove everything written to disk by this test
    rmtree("./tests/tmp", ignore_errors=True)
    rmtree("./tests/ss3_models", ignore_errors=True)
--- """ # Before we begin, let's import needed modules... from pyss3 import SS3 from pyss3.util import Dataset from pyss3.server import Server from sklearn.metrics import accuracy_score from os import system # ... and unzip the "topic.zip" dataset inside the `datasets` folder. system('unzip -u datasets/topic.zip -d datasets/') # Ok, now we are ready to begin. Let's create a new SS3 instance clf = SS3() # What are the default hyperparameter values? let's see s, l, p, _ = clf.get_hyperparameters() print("Smoothness(s):", s) print("Significance(l):", l) print("Sanction(p):", p) # Ok, now let's load the training and the test set using the `load_from_files` function # from `pyss3.util`. Since, in this dataset, there's a single file for each category, # we will use the argument ``folder_label=False`` to tell PySS3 to use each file as a different # category and each line inside of it as a different document: x_train, y_train = Dataset.load_from_files("datasets/topic/train", folder_label=False) x_test, y_test = Dataset.load_from_files("datasets/topic/test",
# Name of the second dataset folder, resolved relative to this file below.
DATASET_FOLDER_MR = "dataset_mr"
ADDRESS, PORT = "localhost", None
# Short alias for the Live_Test object of the module imported as ``s``.
LT = s.Live_Test

# Both dataset folders live next to this test module.
dataset_path = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER)
dataset_path_mr = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER_MR)

x_train, y_train = None, None
clf = None

pyss3.set_verbosity(0)

# The result of the first load is overwritten by the second one right away
# (presumably it only exercises loading with the default ``folder_label``
# -- TODO confirm).
x_train, y_train = Dataset.load_from_files(dataset_path_mr)
x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=False)
clf = SS3()

clf.fit(x_train, y_train)

# serve() is deliberately called before a model has been set
LT.serve()  # no model error
LT.set_model(clf)
LT.get_port()


class MockCmdLineArgs:
    """Mocked command-line arguments."""

    quiet = True
    MODEL = "name"
    # class attribute named ``path``; it does not rebind the module-level
    # ``path`` used above
    path = dataset_path
    label = 'folder'
--- """ # Before we begin, let's import needed modules... from pyss3 import SS3 from pyss3.util import Dataset from pyss3.server import Server from sklearn.metrics import accuracy_score from os import system # ... and unzip the "movie_review.zip" dataset inside the `datasets` folder. system('unzip -u datasets/movie_review.zip -d datasets/') # Ok, now we are ready to begin. Let's create a new SS3 instance. clf = SS3() # What are the default hyper-parameter values? let's see s, l, p, _ = clf.get_hyperparameters() print("Smoothness(s):", s) print("Significance(l):", l) print("Sanction(p):", p) # Ok, now let's load the training and the test set using the `load_from_files` # function from `pyss3.util` as follows: x_train, y_train = Dataset.load_from_files("datasets/movie_review/train") x_test, y_test = Dataset.load_from_files("datasets/movie_review/test") # Let's train our model... clf.fit(x_train, y_train)
def test_pyss3_ss3(mockers):
    """Test SS3.

    Covers, in order: invalid constructor arguments, integer labels,
    single-category training, the untrained ("cold start") behaviour,
    training and prediction with each ``cv_m``/``sg_m`` combination, word
    clouds, a 3-gram model whose hyperparameters are changed after training,
    n-gram recognition, ``extract_insight``, the print helpers,
    saving/loading the model from several paths and exporting the
    vocabularies.

    :param mockers: fixture defined outside this block; it is not referenced
                    in the body (presumably requested only for the patches
                    it installs -- TODO confirm).
    """
    # the constructor raises, so ``clf`` is never actually bound here
    with pytest.raises(ValueError):
        clf = SS3("hyperparameter")

    # Using integer labels
    test_x = ["this is the first document"
              ] * 5 + ["this is the second document"] * 5
    test_y = [0] * 5 + [1] * 5
    clf = SS3()
    clf.train(test_x, test_y)
    assert clf.classify_label("this is the first document") == 0
    assert clf.classify_label("this is the second document") == 1

    # training with only one category
    clf = SS3()
    clf.train(["this is the first document"], ["first"])

    # training different cases
    clf = SS3(s=.45, l=.5, p=1, a=0,
              cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI)
    clf.set_name("test")

    # "cold start" tests
    assert clf.get_name() == "test"
    assert clf.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    assert clf.get_category_name(0) == STR_UNKNOWN_CATEGORY
    assert clf.get_category_name(-1) == STR_UNKNOWN_CATEGORY
    assert clf.get_ngrams_length() == 0

    with pytest.raises(pyss3.EmptyModelError):
        clf.classify(x_test[0])
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict(x_test)
    with pytest.raises(pyss3.EmptyModelError):
        clf.predict_proba(x_test)

    # documents and labels of different lengths, or both empty, are rejected
    with pytest.raises(ValueError):
        clf.train(x_train, [])
    with pytest.raises(ValueError):
        clf.train([], [])

    # train and predict/classify tests (model: terms are single words)
    # cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI
    clf.fit(x_train, y_train)

    assert clf.get_ngrams_length() == 1

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969])
    perform_tests_on(clf.cv, 0.4307)
    perform_tests_on(clf.gv, 0.2148)
    perform_tests_on(clf.lv, 0.2148)
    perform_tests_on(clf.sg, 1)
    perform_tests_on(clf.sn, 1)
    # a two-word term is expected to score 0 on this single-word model
    perform_tests_on(clf.cv, 0, "video games", "science&technology")
    perform_tests_on(clf.gv, 0, "video games", "science&technology")

    # word cloud tests
    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_wordcloud("xxx")
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", top_n=0)
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", n_grams=0)
    with pytest.raises(ValueError):
        clf.save_wordcloud("sports", size=0)

    if PYTHON3:
        clf.save_wordcloud("science&technology", n_grams=3)  # empty cloud
        clf.save_wordcloud("science&technology", plot=True)

    # cv_m=STR_NORM_GV, sg_m=STR_XAI
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-norm-gv-sn-xai",
              cv_m=STR_NORM_GV, sg_m=STR_XAI)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969])
    perform_tests_on(clf.cv, 0.4307)

    # cv_m=STR_GV, sg_m=STR_XAI
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-gv-sn-xai",
              cv_m=STR_GV, sg_m=STR_XAI)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00062, .00109, 0, 0, 0, .00014, .01878, 6.31605])
    # with cv_m=STR_GV the cv and gv values are expected to coincide
    assert clf.cv("chicken", "food") == clf.gv("chicken", "food")

    # cv_m=STR_NORM_GV_XAI, sg_m=STR_VANILLA
    clf = SS3(s=.45, l=.5, p=1, a=0, name="test-norm-gv-xai-sn-vanilla",
              cv_m=STR_NORM_GV_XAI, sg_m=STR_VANILLA)
    clf.fit(x_train, y_train)

    perform_tests_with(clf, [.00114, .00294, 0, 0, 0, .00016, .01878, 8.43969],
                       stopwords=False)

    # train and predict/classify tests (model: terms are word n-grams)
    clf = SS3(name="test-3grams", cv_m=STR_NORM_GV_XAI, sg_m=STR_XAI)
    clf.fit(x_train, y_train, n_grams=3)

    assert clf.get_ngrams_length() == 3

    # update_values
    # hyperparameters are changed after training; update_values() is also
    # called twice in a row with no change in between
    clf.set_l(.3)
    clf.update_values()
    clf.set_p(.2)
    clf.update_values()
    clf.set_hyperparameters(s=.32, l=1.24, p=1.1, a=0)
    clf.update_values()
    clf.update_values()

    perform_tests_with(clf, [.00074, .00124, 0, 0, 0, .00028, .00202, 9.17139])
    perform_tests_on(clf.cv, 1.5681, "video games", "science&technology")
    perform_tests_on(clf.gv, 0.6697, "video games", "science&technology")
    perform_tests_on(clf.lv, 0.6697, "video games", "science&technology")
    perform_tests_on(clf.sg, 1, "video games", "science&technology")
    perform_tests_on(clf.sn, 1, "video games", "science&technology")

    # n-gram recognition tests
    pred = clf.classify("android mobile and video games", json=True)
    assert pred["pars"][0]["sents"][0]["words"][0][
        "lexeme"] == "android mobile"
    assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
    assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 4.38069, 0, 0]

    pred = clf.classify("playing football soccer", json=True)
    assert pred["pars"][0]["sents"][0]["words"][-1][
        "lexeme"] == "football soccer"
    assert argmax(pred["cv"]) == clf.get_category_index("sports")
    assert [round(p, 5)
            for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # extract_insight
    # each returned item is indexed as (text fragment, value) below
    t = clf.extract_insight(doc_insight)
    assert len(
        t) == 1 and t[0][0] == ' about sports. Football soccer, you know!'
    t = clf.extract_insight(doc_insight, window_size=1)
    assert len(t) == 1 and t[0] == (' sports. Football soccer, you know',
                                    2.8670788645841605)
    t = clf.extract_insight(doc_insight, window_size=0)
    assert t == [('Football soccer', 1.8670788645841605), ('sports', 1.0)]
    assert clf.extract_insight(doc_insight, cat="music") == []
    assert len(clf.extract_insight(doc_insight, min_cv=1)) == 1
    t = clf.extract_insight(doc_insight, level="sentence")
    assert len(t) == 2 and t[0][0] == ' Football soccer, you know!'
    t = clf.extract_insight(doc_insight, level="sentence", sort=False)
    assert len(t) == 2 and t[0][0] == 'Dude, this text is about sports'
    t = clf.extract_insight(doc_insight, level="paragraph", min_cv=-1)
    assert len(t) == 2 + 1 and t[2][0] == "2nd paragraph."

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.extract_insight(doc_insight, cat=STR_UNKNOWN_CATEGORY)
    with pytest.raises(ValueError):
        clf.extract_insight(doc_insight, level="invalid")

    # prints
    # only checked for not raising; the output is not inspected
    clf.print_model_info()
    clf.print_hyperparameters_info()
    clf.print_categories_info()
    clf.print_ngram_info("video games")

    # plot_value_distribution
    if sys.version_info[0] >= 3:
        clf.plot_value_distribution(y_test[0])

    # load and save model tests
    clf.set_model_path("tests/")
    clf.save_model()
    clf.load_model()

    # get_next_words
    assert clf.get_next_words("android", "science&technology")[0][0] == "mobile"

    clf = SS3(name="test-3grams")

    # loading from a path that holds no saved model must fail
    with pytest.raises((OSError, IOError)):
        clf.set_model_path("dummy")
        clf.load_model()

    clf.set_model_path("./tests")
    clf.load_model()

    clf.set_model_path("tests/tmp")
    clf.save_model()
    clf.save_model()
    clf.load_model()

    clf.save_model("tests/")
    clf.load_model()

    clf = SS3(name="test-3grams")
    clf.load_model("./tests/")

    clf.save_model("./tests/tmp/")
    clf.save_model()
    clf.load_model()

    # save_vocab
    clf.save_vocab("tests/tmp")

    # save_cat_vocab
    clf.save_cat_vocab("sports", "tests/tmp")

    with pytest.raises(pyss3.InvalidCategoryError):
        clf.save_cat_vocab(STR_UNKNOWN_CATEGORY, "tests/tmp")

    # remove everything written to disk by this test
    rmtree("./tests/tmp", ignore_errors=True)
    rmtree("./tests/ss3_models", ignore_errors=True)
def test_pyss3_ss3():
    """Exercise the SS3 classifier end to end.

    Covers the untrained ("cold start") behaviour, training with single
    words and with word 3-grams, n-gram recognition during classification,
    and saving/loading the model through several paths.
    """
    model = SS3(
        s=.45, l=.5, p=1, a=0,
        name="test",
        cv_m=STR_NORM_GV_XAI,
        sn_m=STR_XAI,
    )

    # --- untrained model ---------------------------------------------------
    assert model.get_name() == "test"
    assert model.get_category_index("a_category") == IDX_UNKNOWN_CATEGORY
    for idx in (0, -1):
        assert model.get_category_name(idx) == STR_UNKNOWN_CATEGORY

    with pytest.raises(pyss3.EmptyModelError):
        model.predict(x_test)

    with pytest.raises(pyss3.EmptyModelError):
        model.predict_proba(x_test)

    # --- terms are single words ----------------------------------------------
    model.fit(x_train, y_train)
    expected = [.00114, .00295, 0, 0, 0, .00016, .01894, 8.47741]
    perform_tests_with(model, expected)

    # --- terms are word n-grams ----------------------------------------------
    model = SS3(
        s=.32, l=1.24, p=1.1, a=0,
        name="test-3grams",
        cv_m=STR_NORM_GV_XAI,
        sn_m=STR_XAI,
    )
    model.fit(x_train, y_train, n_grams=3)
    expected = [.00074, .00124, 0, 0, 0, .00028, .00202, 9.19105]
    perform_tests_with(model, expected)

    # --- n-gram recognition --------------------------------------------------
    result = model.classify("android mobile and video games", json=True)
    words = result["pars"][0]["sents"][0]["words"]
    assert words[0]["lexeme"] == "android mobile"
    assert words[-1]["lexeme"] == "video games"
    cv_values = result["cv"]
    assert argmax(cv_values) == model.get_category_index("science&technology")
    rounded = [round(value, 5) for value in cv_values]
    assert rounded == [0, 0, 0, 0, 0, 0, 4.3789, 0, 0]

    result = model.classify("playing football soccer", json=True)
    words = result["pars"][0]["sents"][0]["words"]
    assert words[-1]["lexeme"] == "football soccer"
    cv_values = result["cv"]
    assert argmax(cv_values) == model.get_category_index("sports")
    rounded = [round(value, 5) for value in cv_values]
    assert rounded == [0, 0, 0, 0, 0, .53463, 0, 1.86708, 0]

    # --- saving and loading --------------------------------------------------
    model.set_model_path("tests/")
    model.save_model()
    model.load_model()

    model = SS3(name="test-3grams")

    # a path holding no saved model cannot be loaded from
    with pytest.raises((OSError, IOError)):
        model.set_model_path("dummy")
        model.load_model()

    model.set_model_path("./tests")
    model.load_model()

    model.set_model_path("tests/tmp")
    model.save_model()
    model.save_model()
    model.load_model()

    model.save_model("tests/")
    model.load_model()

    model = SS3(name="test-3grams")
    model.load_model("./tests/")

    model.save_model("./tests/tmp/")
    model.save_model()
    model.load_model()

    # --- clean-up ------------------------------------------------------------
    for folder in ("./tests/tmp", "./tests/ss3_models"):
        rmtree(folder, ignore_errors=True)