import re

import pytest
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

# Import path assumed: WordsExtractor is the class under test and is expected
# to be importable from the package these tests belong to.
from ruts import WordsExtractor


def test_extract_tokenizer(text):
    we_1 = WordsExtractor(text)
    we_2 = WordsExtractor(text, tokenizer=re.compile(r'[^\w]+'))
    we_3 = WordsExtractor(text, tokenizer=wordpunct_tokenize)
    assert len(we_1.extract()) == 61
    assert len(we_2.extract()) == 62
    assert len(we_3.extract()) == 63

def test_extract_type_error(text):
    # Invalid tokenizer types (neither a compiled regex nor a callable)
    # should raise a TypeError.
    tokenizers = [666, ['a', 'b'], {'a': 'b'}]
    for tokenizer in tokenizers:
        with pytest.raises(TypeError):
            WordsExtractor(text, tokenizer=tokenizer)

def test_extract_filter_nums(text):
    we = WordsExtractor(text + " 33.5 + 99", filter_nums=True)
    assert len(we.extract()) == 62


def test_extract_filter_punct(text):
    we = WordsExtractor(text, filter_punct=False)
    assert len(we.extract()) == 72


def test_extract(text):
    we = WordsExtractor(text)
    assert len(we.extract()) == 61


def test_init_value_error_1(text):
    with pytest.raises(ValueError):
        WordsExtractor(text, ngram_range=(2, 1))


def test_init_value_error_2(text):
    with pytest.raises(ValueError):
        WordsExtractor(text, min_len=10, max_len=5)


def test_get_most_common_value_error(text):
    with pytest.raises(ValueError):
        we = WordsExtractor(text)
        we.get_most_common(0)


def test_get_most_common(text):
    we = WordsExtractor(text)
    we.extract()
    assert we.get_most_common(1) == [('значений', 3)]


def test_extract_ngram_range(text):
    we = WordsExtractor(text, ngram_range=(1, 3))
    assert len(we.extract()) == 180
    assert 'формальными_онтологиями_является' in we.words


def test_extract_max_len(text):
    we = WordsExtractor(text, max_len=6)
    assert len(we.extract()) == 26


def test_extract_stopwords(text):
    we_1 = WordsExtractor(text, stopwords=stopwords.words('russian'))
    we_2 = WordsExtractor(text, stopwords=['и', 'а', 'с', 'в'])
    assert len(we_1.extract()) == 47
    assert len(we_2.extract()) == 57


def test_extract_use_lexemes(text):
    we = WordsExtractor(text, use_lexemes=True)
    assert len(set(['онтология', 'значение', 'связь']).intersection(set(we.extract()))) == 3


def test_min_len(text):
    we = WordsExtractor(text, min_len=6)
    assert len(we.extract()) == 41
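
# All tests above receive a shared `text` fixture. A minimal sketch of what that
# fixture is assumed to look like is below: it would live in conftest.py and
# return the Russian sample passage (about formal ontologies) on which the
# expected token counts in these tests are based. The exact passage is not
# reproduced here, so the ellipsis is a placeholder, not the real value.
#
#     @pytest.fixture
#     def text():
#         return "..."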