Example 1
 def test_extract_tokenizer(text):
     we_1 = WordsExtractor(text)
     we_2 = WordsExtractor(text, tokenizer=re.compile(r'[^\w]+'))
     we_3 = WordsExtractor(text, tokenizer=wordpunct_tokenize)
     assert len(we_1.extract()) == 61
     assert len(we_2.extract()) == 62
     assert len(we_3.extract()) == 63
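All of these snippets come from one pytest suite and share imports plus a `text` fixture that this page omits. A minimal sketch of that setup, assuming the class under test is ruTS's `WordsExtractor` (the fixture string below is a placeholder; the real suite uses a longer Russian passage, and the exact word counts asserted throughout depend on it):

 import re

 import pytest
 from nltk.corpus import stopwords
 from nltk.tokenize import wordpunct_tokenize

 from ruts import WordsExtractor  # assumed import path

 @pytest.fixture
 def text():
     # Placeholder only: the real suite uses a longer Russian passage,
     # and the exact counts asserted in these tests depend on it.
     return "Пример текста об онтологиях, значениях и связях"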
Example 2
 def test_extract_type_error(text):
     tokenizers = [666, ['a', 'b'], {'a': 'b'}]
     for tokenizer in tokenizers:
         # An invalid tokenizer (neither a compiled regex nor a callable)
         # must raise TypeError.
         with pytest.raises(TypeError):
             WordsExtractor(text, tokenizer=tokenizer)
Example 3
 def test_extract_filter_nums(text):
     we = WordsExtractor(text + " 33.5 + 99", filter_nums=True)
     assert len(we.extract()) == 62
Example 4
 def test_extract_filter_punct(text):
     we = WordsExtractor(text, filter_punct=False)
     assert len(we.extract()) == 72
Example 5
 def test_extract(text):
     we = WordsExtractor(text)
     assert len(we.extract()) == 61
Example 6
 def test_init_value_error_1(text):
     with pytest.raises(ValueError):
         WordsExtractor(text, ngram_range=(2, 1))
Example 7
 def test_init_value_error_2(text):
     with pytest.raises(ValueError):
         WordsExtractor(text, min_len=10, max_len=5)
Example 8
 def test_get_most_common_value_error(text):
     we = WordsExtractor(text)
     # Keep only the call under test inside the raises block, so a failure
     # in the constructor cannot be mistaken for the expected ValueError.
     with pytest.raises(ValueError):
         we.get_most_common(0)
Example 9
 def test_get_most_common(text):
     we = WordsExtractor(text)
     we.extract()
     assert we.get_most_common(1) == [('значений', 3)]
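Outside a test, the same two calls chain on any string; a hypothetical sketch (sample string and output are illustrative, not from the suite):

 we = WordsExtractor("мама мыла раму, мама пела")
 we.extract()
 # get_most_common(n) returns the n most frequent extracted words with
 # their counts, here [('мама', 2)].
 print(we.get_most_common(1))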
Example 10
 def test_extract_ngram_range(text):
     we = WordsExtractor(text, ngram_range=(1, 3))
     assert len(we.extract()) == 180
     assert 'формальными_онтологиями_является' in we.words
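The second assertion shows the n-gram convention: with `ngram_range=(1, 3)` the result holds unigrams through trigrams, each n-gram joined with underscores and exposed via `we.words`. A small illustrative sketch under the same assumption:

 we = WordsExtractor("синяя птица улетела", ngram_range=(1, 2))
 we.extract()
 # Expected to contain the unigrams plus underscore-joined bigrams such
 # as 'синяя_птица' and 'птица_улетела' (illustrative).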
Example 11
 def test_extract_max_len(text):
     we = WordsExtractor(text, max_len=6)
     assert len(we.extract()) == 26
Example 12
 def test_extract_stopwords(text):
     we_1 = WordsExtractor(text, stopwords=stopwords.words('russian'))
     we_2 = WordsExtractor(text, stopwords=['и', 'а', 'с', 'в'])
     assert len(we_1.extract()) == 47
     assert len(we_2.extract()) == 57
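Note that `stopwords.words('russian')` reads from the NLTK data corpus, which has to be fetched once:

 import nltk
 nltk.download('stopwords')  # one-time download of the NLTK stopword lists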
Example 13
 def test_extract_use_lexemes(text):
     we = WordsExtractor(text, use_lexemes=True)
     assert len(set(['онтология', 'значение', 'связь']).intersection(set(we.extract()))) == 3
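`use_lexemes=True` normalizes each extracted word to its dictionary form (lemma), which is why the inflected forms in the fixture text surface as 'онтология', 'значение', and 'связь'. A tiny illustrative sketch under that assumption:

 we = WordsExtractor("онтологии онтологиями онтология", use_lexemes=True)
 # extract() should return 'онтология' for all three inflected forms
 # (illustrative; the result depends on the underlying morphological
 # analyzer).
 print(we.extract())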
Example 14
 def test_min_len(text):
     we = WordsExtractor(text, min_len=6)
     assert len(we.extract()) == 41
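Together with `max_len` in Example 11, this filters words by length in characters; a hypothetical sketch (assuming the bounds are inclusive):

 we = WordsExtractor("я иду домой быстро", min_len=4)
 # Only words of at least 4 characters should survive:
 # 'домой' and 'быстро' (illustrative).
 print(we.extract())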