Code Example #1
File: test_text_filters.py Project: undarmaa/pliers
from pliers.stimuli import TextStim
from pliers.filters import TokenizingFilter, WordStemmingFilter

def test_multiple_text_filters():
    # Tokenize the text, then stem each resulting token.
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'
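For context, the same tokenize-then-stem chain can be run outside a test. This is a minimal standalone sketch that only reuses the calls shown above, with pliers' public import paths assumed:

from pliers.stimuli import TextStim
from pliers.filters import TokenizingFilter, WordStemmingFilter

stim = TextStim(text='testing the filtering features')
tokens = TokenizingFilter().transform(stim)      # list of single-word TextStims
stems = WordStemmingFilter().transform(tokens)   # stem each token in the list
print(' '.join(s.text for s in stems))           # -> test the filter featur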
Code Example #2
File: test_text_filters.py Project: undarmaa/pliers
from os.path import join
from nltk.tokenize import PunktSentenceTokenizer
from pliers.stimuli import TextStim
from pliers.filters import TokenizingFilter

# TEXT_DIR is defined at module level in the original test file and points to
# the directory holding the text fixtures (including scandal.txt).

def test_tokenizing_filter():
    # Default behavior: word-level tokenization of the file's contents.
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    filt = TokenizingFilter()
    words = filt.transform(stim)
    assert len(words) == 231
    assert words[0].text == 'To'

    # A pre-instantiated NLTK tokenizer can be passed in directly.
    custom_tokenizer = PunktSentenceTokenizer()
    filt = TokenizingFilter(tokenizer=custom_tokenizer)
    sentences = filt.transform(stim)
    assert len(sentences) == 11
    assert sentences[0].text == 'To Sherlock Holmes she is always the woman.'

    # The tokenizer can also be named by its NLTK class name, with constructor
    # arguments passed through.
    filt = TokenizingFilter('RegexpTokenizer', r'\w+|\$[\d\.]+|\S+')
    tokens = filt.transform(stim)
    assert len(tokens) == 231
    assert tokens[0].text == 'To'
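Since TokenizingFilter also accepts an already-constructed tokenizer (as the PunktSentenceTokenizer case shows), the RegexpTokenizer variant above could equivalently be written as follows. This is a sketch, assuming nltk's RegexpTokenizer import path:

from nltk.tokenize import RegexpTokenizer

filt = TokenizingFilter(tokenizer=RegexpTokenizer(r'\w+|\$[\d\.]+|\S+'))
tokens = filt.transform(stim)   # same 231 tokens as the string-based form above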
Code Example #3
from pliers.stimuli import TextStim
from pliers.filters import (TokenizingFilter, WordStemmingFilter,
                            LowerCasingFilter, PunctuationRemovalFilter)
from pliers.graph import Graph

def test_multiple_text_filters():
    # Tokenize the text, then stem each resulting token.
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'

    # Chain three filters: lower-case, strip punctuation, then tokenize.
    stim = TextStim(text='ARTICLE ONE: Rights')
    g = Graph()
    g.add_node(LowerCasingFilter())  # this graph is built but never run in this test
    filt1 = LowerCasingFilter()
    filt2 = PunctuationRemovalFilter()
    filt3 = TokenizingFilter()
    final_texts = filt3.transform(filt2.transform(filt1.transform(stim)))
    assert len(final_texts) == 3
    assert final_texts[0].text == 'article'
    assert final_texts[0].order == 0
    assert final_texts[1].text == 'one'
    assert final_texts[2].text == 'rights'
    assert final_texts[2].order == 2
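Because each filter's output feeds straight into the next filter's transform(), the nested calls above can also be flattened into a loop. This is only an illustrative sketch built from the API already shown in these examples:

filters = [LowerCasingFilter(), PunctuationRemovalFilter(), TokenizingFilter()]
result = TextStim(text='ARTICLE ONE: Rights')
for f in filters:
    result = f.transform(result)   # each step's output is the next step's input
print([s.text for s in result])    # -> ['article', 'one', 'rights']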