from os.path import join

from nltk.tokenize import PunktSentenceTokenizer

from pliers.filters import (LowerCasingFilter, PunctuationRemovalFilter,
                            TokenizingFilter, WordStemmingFilter)
from pliers.stimuli import TextStim
# Assumed location of the shared helper that points at the bundled test data
from pliers.tests.utils import get_test_data_path

TEXT_DIR = join(get_test_data_path(), 'text')


def test_tokenizing_filter():
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))

    # Default behavior: split the text into word tokens
    filt = TokenizingFilter()
    words = filt.transform(stim)
    assert len(words) == 231
    assert words[0].text == 'To'

    # A pre-constructed nltk tokenizer instance can be passed in directly
    custom_tokenizer = PunktSentenceTokenizer()
    filt = TokenizingFilter(tokenizer=custom_tokenizer)
    sentences = filt.transform(stim)
    assert len(sentences) == 11
    assert sentences[0].text == 'To Sherlock Holmes she is always the woman.'

    # Tokenizers can also be named by string, with constructor arguments
    # forwarded; a raw string avoids invalid escape sequences in the pattern
    filt = TokenizingFilter('RegexpTokenizer', r'\w+|\$[\d\.]+|\S+')
    tokens = filt.transform(stim)
    assert len(tokens) == 231
    assert tokens[0].text == 'To'
def test_multiple_text_filters():
    # Filters compose by nesting transform() calls: tokenize, then stem
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'

    # A longer chain: lowercase, strip punctuation, then tokenize.
    # Each resulting token records its position in the `order` attribute.
    stim = TextStim(text='ARTICLE ONE: Rights')
    filt1 = LowerCasingFilter()
    filt2 = PunctuationRemovalFilter()
    filt3 = TokenizingFilter()
    final_texts = filt3.transform(filt2.transform(filt1.transform(stim)))
    assert len(final_texts) == 3
    assert final_texts[0].text == 'article'
    assert final_texts[0].order == 0
    assert final_texts[1].text == 'one'
    assert final_texts[2].text == 'rights'
    assert final_texts[2].order == 2
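
# The unused Graph lines dropped from the test above hinted at a
# declarative alternative to nested transform() calls. A minimal sketch,
# assuming Graph.add_nodes accepts mode='vertical' to chain nodes
# sequentially and that transform(merge=False) returns the filtered stims
# for a filter-only graph (both assumptions, not verified here):
def test_text_filter_graph_sketch():
    from pliers.graph import Graph  # assumed import path

    stim = TextStim(text='ARTICLE ONE: Rights')
    g = Graph()
    # Chain the three filters into a single vertical pipeline
    g.add_nodes([LowerCasingFilter(), PunctuationRemovalFilter(),
                 TokenizingFilter()], mode='vertical')
    final_texts = g.transform(stim, merge=False)
    # Mirrors the explicitly chained version above
    assert [s.text for s in final_texts] == ['article', 'one', 'rights']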