Example 1
import re

from spacy.tokenizer import Tokenizer
from spacy.util import get_lang_class


def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # test that empty/unset values are set correctly on deserialization
    tokenizer = get_lang_class("en")().tokenizer
    tokenizer.token_match = re.compile("test").match
    assert tokenizer.rules != {}
    assert tokenizer.token_match is not None
    assert tokenizer.url_match is not None
    tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer.rules == {}
    assert tokenizer.token_match is None
    assert tokenizer.url_match is None

    tokenizer = Tokenizer(
        en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}
    )
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}
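
The en_vocab and en_tokenizer arguments above are pytest fixtures supplied by spaCy's test suite, not objects defined in the snippet itself. A minimal sketch of equivalent fixtures, assuming they are built from the English language class the way spaCy's own conftest builds them:

import pytest

from spacy.util import get_lang_class


@pytest.fixture
def en_vocab():
    # Vocab of a freshly constructed English pipeline.
    return get_lang_class("en")().vocab


@pytest.fixture
def en_tokenizer():
    # Default English tokenizer; Example 1 borrows its suffix_search callable.
    return get_lang_class("en")().tokenizer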
Example 2
import re

from spacy.tokenizer import Tokenizer


def test_tokenizer_flush_specials(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    rules = {"a a": [{"ORTH": "a a"}]}
    tokenizer1 = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
        rules=rules,
    )
    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
    # Resetting the rules flushes the cached special cases, so "a a" is no
    # longer merged into a single token.
    tokenizer1.rules = {}
    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
Example 3
from spacy.tokenizer import Tokenizer


def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # The special-case entry must be a list of two token dicts; a single dict
    # with a duplicate "ORTH" key would silently drop the first value.
    tokenizer = Tokenizer(
        en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}
    )
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}