Esempio n. 1
0
def test_multiple_text_filters():
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'
Esempio n. 2
0
def test_multiple_text_filters():
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'

    stim = TextStim(text='ARTICLE ONE: Rights')
    g = Graph()
    g.add_node(LowerCasingFilter())
    filt1 = LowerCasingFilter()
    filt2 = PunctuationRemovalFilter()
    filt3 = TokenizingFilter()
    final_texts = filt3.transform(filt2.transform(filt1.transform(stim)))
    assert len(final_texts) == 3
    assert final_texts[0].text == 'article'
    assert final_texts[0].order == 0
    assert final_texts[1].text == 'one'
    assert final_texts[2].text == 'rights'
    assert final_texts[2].order == 2
Esempio n. 3
0
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to',
                           default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
Esempio n. 4
0
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to',
                           default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'