Example #1
def init_tokenizer():
    # import spacy
    # import os
    from spacy.lang.en import English
    from spacy.attrs import ORTH
    # nlp = spacy.load(os.environ.get('SPACY_MODEL', 'en'), disable=['tagger', 'ner'])
    # TODO: this may have compatibility issues (Defaults.create_tokenizer() is the spaCy v2 API)
    tokenizer = English().Defaults.create_tokenizer()
    # add special segmentation cases to the spaCy tokenizer:
    # split "I." into "I" + "." and keep each reserved token whole
    tokenizer.add_special_case('I.', [{ORTH: "I"}, {ORTH: "."}])
    for token in RESERVED_TOKENS:  # RESERVED_TOKENS is defined at module level
        tokenizer.add_special_case(token, [{ORTH: token}])
    return tokenizer
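
The TODO above concerns the tokenizer construction: English().Defaults.create_tokenizer() is the spaCy v2 API and no longer exists in spaCy v3, where the tokenizer is accessed as English().tokenizer (as in the examples below). A minimal v3 sketch of the same setup, assuming RESERVED_TOKENS is a module-level list of placeholder strings (the values shown here are hypothetical):

    from spacy.lang.en import English
    from spacy.attrs import ORTH

    RESERVED_TOKENS = ["_MATH_", "_URL_"]  # hypothetical placeholder values

    def init_tokenizer_v3():
        tokenizer = English().tokenizer
        # split "I." into "I" + "." and keep each reserved token as a single token
        tokenizer.add_special_case("I.", [{ORTH: "I"}, {ORTH: "."}])
        for token in RESERVED_TOKENS:
            tokenizer.add_special_case(token, [{ORTH: token}])
        return tokenizer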
Example #2
from spacy.lang.en import English


def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    tokenizer = English().tokenizer
    # reset all special cases
    tokenizer.rules = {}

    # in-place modification (only merges)
    text = "''a'' "
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    assert tokenizer(text).text == text

    # not in-place (splits and merges)
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(text).text == text
Example #3
from spacy.attrs import ORTH
from spacy.lang.en import English


def test_issue1061():
    """Test that a special case added after tokenizing takes effect (was a caching problem)."""
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English().tokenizer
    doc = tokenizer(text)
    assert "MATH" in [w.text for w in doc]
    assert "_MATH_" not in [w.text for w in doc]

    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]

    # For sanity, check it works when pipeline is clean.
    tokenizer = English().tokenizer
    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]