import re

from spacy.tokenizer import Tokenizer


def test_tokenizer_flush_cache(en_vocab):
    # Tokenize once with a suffix pattern so the result is cached.
    suffix_re = re.compile(r"[\.]$")
    tokenizer = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
    )
    assert [t.text for t in tokenizer("a.")] == ["a", "."]
    # Changing a tokenizer attribute must flush the cache:
    # the same input is now tokenized without the suffix split.
    tokenizer.suffix_search = None
    assert [t.text for t in tokenizer("a.")] == ["a."]
import re

from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex


def custom_tokenizer(nlp):
    # Split on quote-like characters and selected punctuation inside tokens.
    infix_re = re.compile(r'''[?;‘’`“”"'~]''')
    # Reuse the pipeline's default prefix and suffix rules.
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer.prefix_search = prefix_re.search
    tokenizer.suffix_search = suffix_re.search
    tokenizer.infix_finditer = infix_re.finditer
    # Disable token_match so no rule bypasses the prefix/suffix/infix handling.
    tokenizer.token_match = None
    return tokenizer
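# A minimal usage sketch, not from the original: wiring custom_tokenizer
# into a blank English pipeline. The input string is an illustrative
# assumption chosen to exercise the quote and "?" infix rules.
import spacy

nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("Hello, 'world'?")])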