def test_stems_and_filters_correctly(self):
    word_processor = WordTokenizer.from_params(Params({'word_stemmer': {'type': 'porter'},
                                                       'word_filter': {'type': 'stopwords'}}))
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    expected_tokens = ["sentenc", "ha", "crazi", "punctuat"]
    tokens = word_processor.tokenize(sentence)
    assert tokens == expected_tokens
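# For comparison, a sketch of building the equivalent tokenizer directly,
# without going through Params. This assumes the 'porter' and 'stopwords'
# registry keys map to AllenNLP-style PorterStemmer and StopwordFilter
# classes; adjust the imports if this repository names them differently.
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter
from allennlp.data.tokenizers.word_stemmer import PorterStemmer

word_processor = WordTokenizer(word_stemmer=PorterStemmer(),
                               word_filter=StopwordFilter())
tokens = word_processor.tokenize("this (sentence) has 'crazy' \"punctuation\".")
# Expected output, as in the test above: ["sentenc", "ha", "crazi", "punctuat"]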
@classmethod
def from_params(cls, params: Params) -> 'NgramWordsIndexer':
    """
    Parameters
    ----------
    namespace : ``str``, optional (default=``shared_words_vocab``)
        We will use this namespace in the :class:`Vocabulary` to map the
        words in each token to indices.
    word_tokenizer : ``WordTokenizer``, optional (default=``WordTokenizer(word_splitter=JustSpacesWordSplitter())``)
        Defines how we split an ngram into words. The default is to split
        on whitespace.
    """
    namespace = params.pop('namespace', 'shared_words_vocab')
    word_tokenizer_params = params.pop('word_tokenizer', {})
    word_tokenizer = WordTokenizer.from_params(word_tokenizer_params)
    params.assert_empty(cls.__name__)
    return cls(namespace=namespace, word_tokenizer=word_tokenizer)
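# A minimal usage sketch for the classmethod above; the 'ngram_words'
# namespace value is illustrative, not part of the original code.
from allennlp.common import Params

indexer = NgramWordsIndexer.from_params(Params({'namespace': 'ngram_words'}))
# Omitting 'word_tokenizer' falls back to the default space-splitting
# WordTokenizer. params.assert_empty would raise a ConfigurationError here
# if the config contained unused keys, e.g. a typo like 'namspace'.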