Beispiel #1
0
def test_mitie():
    """Check MITIETokenizer output for plain and offset-aware tokenization.

    Every comparison is wrapped in ``assert`` so that a mismatch actually
    fails the test; a bare ``==`` expression statement is evaluated and its
    result silently discarded.
    """
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tk = MITIETokenizer()

    # The expected token list has no entry for the period after "Hi".
    assert tk.tokenize(u"Hi. My name is rasa") == [
        u'Hi', u'My', u'name', u'is', u'rasa'
    ]
    # Non-ASCII input; the expected list likewise omits the trailing period.
    # NOTE(review): these expectations were never enforced before the
    # asserts were added -- confirm the trailing-period case against MITIE.
    assert tk.tokenize(u"ὦ ἄνδρες ᾿Αθηναῖοι.") == [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']
    # Expected result is a (tokens, offsets) pair; each offset is the index
    # in the input string at which the corresponding token starts.
    assert tk.tokenize_with_offsets(u"Forecast for lunch") == ([
        u'Forecast', u'for', u'lunch'
    ], [0, 9, 13])
Beispiel #2
0
def test_mitie():
    """Exercise MITIETokenizer on ASCII and non-ASCII text.

    Covers both ``tokenize`` (tokens only) and ``tokenize_with_offsets``
    (a ``(tokens, offsets)`` pair).
    """
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer

    tokenizer = MITIETokenizer()

    # Tokens only: the expected list has no entry for the period after "Hi".
    english_tokens = tokenizer.tokenize(u"Hi. My name is rasa")
    assert english_tokens == [u'Hi', u'My', u'name', u'is', u'rasa']

    # Tokens only, non-ASCII input.
    greek_tokens = tokenizer.tokenize(u"ὦ ἄνδρες ᾿Αθηναῖοι")
    assert greek_tokens == [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']

    # Tokens plus the index in the input at which each token starts.
    plain_result = tokenizer.tokenize_with_offsets(u"Forecast for lunch")
    plain_expected = ([u'Forecast', u'for', u'lunch'], [0, 9, 13])
    assert plain_result == plain_expected

    # Accented word, contraction and trailing "?": the expected offsets count
    # characters (the accented word spans indices 4-11), and "'re" and "?"
    # are expected as separate tokens.
    accented_result = tokenizer.tokenize_with_offsets(
        u"hey ńöñàśçií how're you?"
    )
    accented_expected = (
        [u'hey', u'ńöñàśçií', u'how', u'\'re', 'you', '?'],
        [0, 4, 13, 16, 20, 23],
    )
    assert accented_result == accented_expected
Beispiel #3
0
 def find_entity(cls, ent, text):
     """Translate an entity's character span into token indices of ``text``.

     Args:
         cls: Not used by this method.
         ent: Mapping whose ``"start"`` and ``"end"`` values are used as
             slice indices into ``text``.
         text: The example string the entity was annotated on.

     Returns:
         A ``(start, end)`` tuple: ``start`` is the position, in the offsets
         list from ``tokenize_with_offsets``, of the token beginning at
         ``ent["start"]``; ``end`` is ``start`` plus the number of tokens
         that ``tokenize`` produces for the entity's substring.

     Raises:
         ValueError: If ``ent["start"]`` is not one of the token offsets.
     """
     tokenizer = MITIETokenizer()
     # Only the offsets are needed; the token strings are discarded.
     _, token_starts = tokenizer.tokenize_with_offsets(text)

     entity_start = ent["start"]
     if entity_start not in token_starts:
         raise ValueError(
             u"invalid entity {0} in example {1}:".format(ent, text)
             + u" entities must span whole tokens"
         )

     first_token = token_starts.index(entity_start)
     # Only the start is checked against token boundaries; the end index is
     # derived by counting the tokens in the entity's own substring.
     entity_text = text[entity_start:ent["end"]]
     token_count = len(tokenize(entity_text))
     return first_token, first_token + token_count