Example #1
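The English Snowball analyzer, with default settings, drops numeric tokens and words shorter than three characters, and stems whatever remains: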
import annif.analyzer

def test_english_filter_words():
    analyzer = annif.analyzer.get_analyzer("snowball(english)")
    text = """Since 2000, 3D printing can be used to print
    3 kinds of objects."""
    words = analyzer.tokenize_words(text)
    assert len(words) == 7
    assert '2000' not in words  # numeric token filtered out
    assert 'be' not in words    # below the default minimum length
    assert 'sinc' in words      # stem of 'Since'
    assert 'object' in words    # stem of 'objects'
Example #2
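Passing token_min_length=2 in the analyzer spec relaxes the length filter so that two-letter words are kept, while numeric and single-character tokens are still removed: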
import annif.analyzer

def test_english_filter_words_min_token():
    analyzer = annif.analyzer.get_analyzer(
        "snowball(english,token_min_length=2)")
    text = """Since 2000, a 3D printer can be used to print
    3 kinds of objects."""
    words = analyzer.tokenize_words(text)
    assert len(words) == 11
    assert '2000' not in words  # numbers are still filtered
    assert 'sinc' in words
    assert 'object' in words
    assert 'a' not in words     # still below the two-character minimum
Example #3
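The simple analyzer applies no stemming; here the token_min_length=2 filter drops the single-character token 'I':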
import annif.analyzer

def test_simple_analyzer_token_size():
    analyzer = annif.analyzer.get_analyzer("simple(token_min_length=2)")
    text = 'I do stuff'
    tokens = analyzer.tokenize_words(text)
    assert len(tokens) == 2  # only 'do' and 'stuff' remain
Example #4
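With the default length filter, only 14 of the 20 input words survive tokenization: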
import annif.analyzer

def test_english_tokenize_words():
    analyzer = annif.analyzer.get_analyzer("snowball(english)")
    text = """To take a trivial example, which of us ever undertakes
    laborious physical exercise, except to obtain some advantage from it?"""
    words = analyzer.tokenize_words(text)
    assert len(words) == 14
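
The same get_analyzer/tokenize_words API works outside a test. A minimal sketch of direct usage, assuming the annif package is installed; the expected output is inferred from the filtering behaviour shown in the tests above, not taken from the library's documentation:

import annif.analyzer

# Build an analyzer from a spec string and tokenize free text.
analyzer = annif.analyzer.get_analyzer("snowball(english)")
words = analyzer.tokenize_words("Since 2000, printing objects.")
print(words)  # expected: ['sinc', 'print', 'object']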