Example #1
import pytest
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# Illustrative fixture: the "orth" values must concatenate back to the text.
@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])])
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]
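The same mechanism outside pytest, as a minimal sketch (assuming spaCy v3, where a tokenizer special case may set "orth" and "norm"):

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

tokenizer = Tokenizer(Vocab(), {}, None, None, None)
# The special case overrides the norm of the first sub-token.
tokenizer.add_special_case("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])
doc = tokenizer("lorem")
print([(t.text, t.norm_) for t in doc])  # [('lo', 'LO'), ('rem', 'rem')]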
Example #2
import pytest
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# Illustrative fixture: the special case tags "lo" as "NN", which the
# tag map below projects to the coarse POS "NOUN".
@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])])
def test_tokenizer_add_special_case_tag(text, tokens):
    # Vocab(tag_map=...) is spaCy v2 API; tag maps were removed in v3.
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].tag_ == tokens[0]["tag"]
    assert doc[0].pos_ == "NOUN"
    assert doc[1].text == tokens[1]["orth"]
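The equivalent outside pytest, as a sketch that assumes spaCy v2 (the tag_map argument to Vocab no longer exists in v3):

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})  # spaCy v2 only
tokenizer = Tokenizer(vocab, {}, None, None, None)
tokenizer.add_special_case("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])
doc = tokenizer("lorem")
print([(t.text, t.tag_, t.pos_) for t in doc])  # first token: ('lo', 'NN', 'NOUN')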
Example #3
"""

References:
1. https://spacy.io/api/tokenizer
1. https://github.com/explosion/spaCy/issues/396
"""
import spacy
nlp = spacy.load('en_core_web_lg')
from spacy.attrs import ORTH, LEMMA
from spacy.tokenizer import Tokenizer
exceptions = [{"us": [{ORTH: "us"}, {ORTH: "-east", LEMMA: "east"}]}]
tokenizer = Tokenizer(nlp.vocab)
tokenizer.add_special_case("""us-east-1""", exceptions)
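A quick check of the special case above (assuming en_core_web_lg is installed; a bare Tokenizer splits on whitespace only, so the special case fires whenever "us-east-1" appears as its own whitespace-delimited chunk):

doc = tokenizer("deploy to us-east-1 now")
print([t.text for t in doc])  # ['deploy', 'to', 'us', '-east', '-1', 'now']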