def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    tk = MitieTokenizer()

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13]

    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?']
    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == [0, 4, 13, 16, 20, 23]
Beispiel #2
0
    def find_entity(ent, text):
        from mitie import tokenize

        tk = MitieTokenizer()
        tokens, offsets = tk.tokenize_with_offsets(text)
        if ent["start"] not in offsets:
            message = u"Invalid entity {0} in example '{1}':".format(ent, text) + \
                      u" entities must span whole tokens"
            raise ValueError(message)
        start = offsets.index(ent["start"])
        _slice = text[ent["start"]:ent["end"]]
        val_tokens = tokenize(_slice)
        end = start + len(val_tokens)
        return start, end
Beispiel #3
0
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    ftr = MitieFeaturizer.create(config.load("sample_configs/config_mitie.yml"))
    sentence = "Hey how are you today"
    tokens = MitieTokenizer().tokenize(sentence)
    vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)
    expected = np.array([0., -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(vecs[:5], expected, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    mitie_component_config = {'name': "MitieFeaturizer"}
    ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig())
    sentence = "Hey how are you today"
    tokens = MitieTokenizer().tokenize(sentence)
    vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)
    expected = np.array([0., -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(vecs[:5], expected, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    default_config["mitie_file"] = os.environ.get('MITIE_FILE')
    if not default_config["mitie_file"] or not os.path.isfile(default_config["mitie_file"]):
        default_config["mitie_file"] = os.path.join("data", "total_word_feature_extractor.dat")

    ftr = MitieFeaturizer.load()
    sentence = "Hey how are you today"
    tokens = MitieTokenizer().tokenize(sentence)
    vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)
    assert np.allclose(vecs[:5], np.array([0., -4.4551446, 0.26073121, -1.46632245, -1.84205751]), atol=1e-5)
Beispiel #6
0
def tokenizer_from_name(name, language):
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    if name == MitieTokenizer.name:
        return MitieTokenizer()
    elif name == SpacyTokenizer.name:
        import spacy
        nlp = spacy.load(language, parser=False, entity=False)
        return SpacyTokenizer(nlp)
    elif name == WhitespaceTokenizer.name:
        return WhitespaceTokenizer()
Beispiel #7
0
def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    tk = MitieTokenizer()

    assert tk.tokenize("Hi. My name is rasa") == ['Hi', 'My', 'name', 'is', 'rasa']
    assert tk.tokenize("ὦ ἄνδρες ᾿Αθηναῖοι") == ['ὦ', 'ἄνδρες', '᾿Αθηναῖοι']
    assert tk.tokenize_with_offsets("Forecast for lunch") == (['Forecast', 'for', 'lunch'], [0, 9, 13])
    assert tk.tokenize_with_offsets("hey ńöñàśçií how're you?") == (
        ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?'],
        [0, 4, 13, 16, 20, 23])
Beispiel #8
0
def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    tk = MitieTokenizer()

    assert [t.text for t in tk.tokenize("Forecast for lunch")
            ] == ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13]

    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")
            ] == ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?']
    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")
            ] == [0, 4, 13, 16, 20, 23]