Beispiel #1
0
    def parse(self, utterance, context=None, N=1):
        """
        Tag the utterance and lazily yield candidate parses for it.

        A "tagged_entities" event carrying the utterance, its tags and the
        tagging time is emitted before the first parse is produced.

        :param utterance: the text to parse; it is lower-cased for tagging
        :param context: an optional list of entity dicts. Each one supplies
            a 'data' list whose first item is an (entity_value, entity_type)
            pair, plus a 'confidence' used as the insertion weight. The
            list itself is left untouched.
        :param N: the generator stops once this many parses were yielded
            (the check runs after each yield)
        :return: a generator of dicts with the keys 'utterance', 'tags',
            'time' and 'confidence'
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # Insert in ascending confidence order, so the highest
            # confidence for an entity is inserted last (see comment on
            # TrieNode ctor). sorted() orders a copy, which keeps the
            # caller's list in its original order.
            ordered_context = sorted(context, key=lambda x: x.get('confidence'))

            context_trie = Trie()
            for entity in ordered_context:
                entity_value, entity_type = entity.get('data')[0]
                context_trie.insert(entity_value.lower(),
                                    data=(entity_value, entity_type),
                                    weight=entity.get('confidence'))

        # Materialize the tags once: they feed both the event below and
        # the expander, and a one-shot iterable would be drained by the
        # first of the two.
        tagged = list(self._tagger.tag(utterance.lower(), context_trie=context_trie))
        self.emit("tagged_entities",
                  {
                      'utterance': utterance,
                      'tags': list(tagged),
                      'time': time.time() - start
                  })
        start = time.time()
        bke = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Sum, over the clique, of the first entity's confidence scaled
            # by its match length relative to len(utterance) + 1.
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(tagged_entity.get('entities', [{'match': ''}])[0].get('match')) / (
                    len(utterance) + 1)
            return score

        parse_results = bke.expand(tagged, clique_scoring_func=score_clique)
        count = 0
        for result in parse_results:
            count += 1
            # Confidence of a parse: sum of each tag's first-entity
            # confidence scaled by match length over utterance length.
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                entity_confidence = sample_entity.get('confidence', 0.0) * float(
                    len(sample_entity.get('match'))) / len(utterance)
                parse_confidence += entity_confidence
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            if count >= N:
                break
Beispiel #2
0
 def test_data_is_correct_on_insert(self):
     # One insert must surface exactly one match carrying exactly that payload.
     concept_trie = Trie()
     concept_trie.insert("restaurant", "Concept")
     matches = list(concept_trie.lookup("restaurant"))
     assert len(matches) == 1
     payload = matches[0].get('data')
     assert len(payload) == 1
     assert list(payload)[0] == 'Concept'
Beispiel #3
0
 def test_data_is_correct_on_insert(self):
     # The payload stored with a key is the only payload a lookup reports.
     concept_trie = Trie()
     concept_trie.insert("restaurant", "Concept")
     matches = list(concept_trie.lookup("restaurant"))
     assert len(matches) == 1
     payload = matches[0].get('data')
     assert len(payload) == 1
     assert list(payload)[0] == 'Concept'
Beispiel #4
0
    def test_is_prefix(self):
        # Leading parts of inserted keys are prefixes; an unrelated word is not.
        trie = Trie()
        vocabulary = (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
            ("barenaked ladies", "Radio Station"),
        )
        for phrase, kind in vocabulary:
            trie.insert(phrase, kind)

        root = trie.root
        assert root.is_prefix("the")
        assert root.is_prefix("play")
        assert not root.is_prefix("Kermit")
Beispiel #5
0
 def setUp(self):
     """Build a tagger and parser over a trie of (value, type) entities."""
     self.trie = Trie()
     self.tokenizer = EnglishTokenizer()
     self.regex_entities = []
     self.tagger = EntityTagger(
         self.trie, self.tokenizer, regex_entities=self.regex_entities)
     vocabulary = (
         ("play", "PlayVerb"),
         ("the big bang theory", "Television Show"),
         ("the big", "Not a Thing"),
         ("barenaked ladies", "Radio Station"),
     )
     # Each payload repeats the phrase next to its entity type.
     for phrase, entity_type in vocabulary:
         self.trie.insert(phrase, (phrase, entity_type))
     self.parser = Parser(self.tokenizer, self.tagger)
Beispiel #6
0
    def tag(self, utterance):
        """
        Tag known entities within the utterance.

        :param utterance: a string of natural language text

        :return: a list of dictionaries, each with the following keys

        match: str - the proper entity matched

        key: str - the string that was matched to the entity

        start_token: int - 0-based index of the first token matched

        end_token: int - 0-based index of the last token matched

        entities: list - the trie results behind this tag; tags built
        directly from the trie hold exactly one result
        """
        tokens = self.tokenizer.tokenize(utterance)
        entities = []
        if self.regex_entities:
            for part, idx in self._iterate_subsequences(tokens):
                # Collect the named groups captured on this subsequence in
                # a throwaway trie and tag the subsequence against it.
                local_trie = Trie()
                for regex_entity in self.regex_entities:
                    match = regex_entity.match(part)
                    groups = match.groupdict() if match else {}
                    for key, match_str in groups.items():
                        # A named group that took no part in the match is
                        # reported as None; there is no text to insert.
                        if match_str is None:
                            continue
                        local_trie.insert(match_str, key)
                sub_tagger = EntityTagger(local_trie, self.tokenizer, max_tokens=self.max_tokens)
                for sub_entity in sub_tagger.tag(part):
                    # Rebase the token indices by the subsequence's idx.
                    sub_entity['start_token'] += idx
                    sub_entity['end_token'] += idx
                    # Regex-derived entities get a fixed confidence.
                    for e in sub_entity['entities']:
                        e['confidence'] = 0.5
                    entities.append(sub_entity)
        additional_sort = len(entities) > 0

        # range() is available on both Python 2 and 3, unlike xrange().
        for i in range(len(tokens)):
            # Match the trie against the utterance suffix starting at token i.
            part = ' '.join(tokens[i:])

            for new_entity in self.trie.gather(part):
                new_entity['data'] = list(new_entity['data'])
                entities.append({
                    'match': new_entity.get('match'),
                    'key': new_entity.get('key'),
                    'start_token': i,
                    'entities': [new_entity],
                    'end_token': i + len(self.tokenizer.tokenize(new_entity.get('match'))) - 1
                })

        # Sorting/merging is only requested when regex entities were found.
        if additional_sort:
            entities = self._sort_and_merge_tags(entities)

        return entities
Beispiel #7
0
    def test_remove_multi_first(self):
        # Removing one of two payloads under a key leaves the other in place.
        trie = Trie(max_edit_distance=2)
        for category in ("Muppets", "Frogs"):
            trie.insert("Kermit", category)
        before = list(trie.lookup("Kermit"))[0]
        assert 'Frogs' in before['data']
        assert 'Muppets' in before['data']

        trie.remove("Kermit", "Muppets")

        after = list(trie.lookup("Kermit"))[0]
        assert after['data'] == {"Frogs"}  # Right data remains
Beispiel #8
0
 def setUp(self):
     """Fill a fuzzy trie (edit distance 2) and wrap it in a tagger."""
     self.tokenizer = EnglishTokenizer()
     self.trie = Trie(max_edit_distance=2)
     # Insertion order is kept exactly as listed.
     vocabulary = (
         ("x-play", "Television Show"),
         ("play", "Play Verb"),
         ("play season", "Time Period"),
         ("play", "Player Control"),
         ("season", "Season Prefix"),
         ("1", "Number"),
         ("the big bang theory", "Television Show"),
         ("the big", "Television Show"),
         ("big bang", "event"),
         ("bang theory", "Scientific Theory"),
     )
     for phrase, entity_type in vocabulary:
         self.trie.insert(phrase, entity_type)
     self.tagger = EntityTagger(self.trie, self.tokenizer)
Beispiel #9
0
 def test_retrieval_based_on_insertion_order(self):
     # A key and a longer key sharing its prefix each resolve to one result.
     trie = Trie()
     words = ("rest", "restaurant")
     for word in words:
         trie.insert(word)
     for word in words:
         found = list(trie.lookup(word))
         assert len(found) == 1
Beispiel #10
0
    def test_named_remove(self):
        # Removing one named payload keeps the key and its other payload.
        trie = Trie()
        for payload in ("Number", "The Loneliest"):
            trie.insert("1", payload)
        found = list(trie.lookup("1"))
        assert len(found) == 1
        assert len(found[0].get('data')) == 2

        assert trie.remove("1", "Number")
        found = list(trie.lookup("1"))
        assert len(found) == 1
        assert len(found[0].get('data')) == 1
Beispiel #11
0
    def test_intent_with_regex_entity(self):
        # A regex-captured entity and a trie entity together satisfy one intent.
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, self.tokenizer, self.regex_entities)
        self.parser = Parser(self.tokenizer, self.tagger)
        self.trie.insert("theory", ("theory", "Concept"))
        self.regex_entities.append(re.compile(r"the (?P<Event>.*)"))
        intent = (IntentBuilder("mock intent")
                  .require("Event")
                  .require("Concept")
                  .build())

        for parsed in self.parser.parse("the big bang theory"):
            tags = parsed.get('tags')
            confidence = parsed.get('confidence')
            result_intent = intent.validate(tags, confidence)
            assert result_intent.get('confidence') > 0.0
            assert result_intent.get('Event') == 'big bang'
            assert result_intent.get('Concept') == "theory"
Beispiel #12
0
 def __init__(self, tokenizer=None, trie=None):
     """
     Initialize the event emitter base, then the tagging state.

     A falsy tokenizer is replaced by EnglishTokenizer() and a falsy
     trie by Trie(). The tagger shares the regular-expression entity
     list stored on this instance.
     """
     pyee.EventEmitter.__init__(self)
     self.tokenizer = tokenizer if tokenizer else EnglishTokenizer()
     self.trie = trie if trie else Trie()
     self.regular_expressions_entities = []
     self._regex_strings = set()
     self.tagger = EntityTagger(
         self.trie,
         self.tokenizer,
         self.regular_expressions_entities,
     )
     self.intent_parsers = []
Beispiel #13
0
 def test_gather(self):
     # Gathering the longer key reports that key alone.
     trie = Trie()
     for word in ("rest", "restaurant"):
         trie.insert(word)
     gathered = list(trie.gather("restaurant"))
     assert len(gathered) == 1
     first = gathered[0]
     assert first.get('key') == "restaurant"
Beispiel #14
0
 def test_retrieval_based_on_insertion_order(self):
     # Both the short key and the key extending it are found exactly once.
     trie = Trie()
     words = ("rest", "restaurant")
     for word in words:
         trie.insert(word)
     for word in words:
         found = list(trie.lookup(word))
         assert len(found) == 1
Beispiel #15
0
 def setUp(self):
     """Build a tagger and parser over a trie of plain entity-type payloads."""
     self.trie = Trie()
     self.tokenizer = EnglishTokenizer()
     self.regex_entities = []
     self.tagger = EntityTagger(
         self.trie, self.tokenizer, regex_entities=self.regex_entities)
     vocabulary = (
         ("play", "PlayVerb"),
         ("the big bang theory", "Television Show"),
         ("the big", "Not a Thing"),
         ("barenaked ladies", "Radio Station"),
     )
     for phrase, entity_type in vocabulary:
         self.trie.insert(phrase, entity_type)
     self.parser = Parser(self.tokenizer, self.tagger)
Beispiel #16
0
    def test_named_remove(self):
        # After removing one of two payloads, the key still holds the other.
        trie = Trie()
        for payload in ("Number", "The Loneliest"):
            trie.insert("1", payload)
        found = list(trie.lookup("1"))
        assert len(found) == 1
        assert len(found[0].get('data')) == 2

        assert trie.remove("1", "Number")
        found = list(trie.lookup("1"))
        assert len(found) == 1
        assert len(found[0].get('data')) == 1
Beispiel #17
0
    def test_scan(self):
        # scan() reports every entry whose payload satisfies the predicate.
        trie = Trie(max_edit_distance=2)
        muppets = ("Kermit", "Gonzo", "Rowlf")
        for name in muppets:
            trie.insert(name, "Muppets")
        trie.insert("Gobo", "Fraggles")

        def is_muppet(data):
            return data == "Muppets"

        results = trie.scan(is_muppet)
        assert len(results) == 3
        muppet_names = [entry[0] for entry in results]
        for name in muppets:
            assert name in muppet_names
Beispiel #18
0
 def test_edit_distance(self):
     # With max_edit_distance=1, dropping one character still matches;
     # dropping one at each end does not.
     trie = Trie(max_edit_distance=1)
     trie.insert("restaurant")
     expectations = (
         ("restauran", 1),
         ("estaurant", 1),
         ("estauran", 0),
     )
     for query, expected_count in expectations:
         results = list(trie.lookup(query))
         assert len(results) == expected_count
Beispiel #19
0
    def test_simple_remove(self):
        # Removing a key without naming a payload makes lookups come back empty.
        trie = Trie()
        trie.insert("1", "Number")
        before = list(trie.lookup("1"))
        assert len(before) == 1
        assert len(before[0].get('data')) == 1

        assert trie.remove("1")
        after = list(trie.lookup("1"))
        assert len(after) == 0
Beispiel #20
0
 def __init__(self):
     """Assemble a trie, tagger, parser and a sample intent over a fixed vocabulary."""
     self.trie = Trie()
     self.tokenizer = EnglishTokenizer()
     self.regex_entities = []
     self.tagger = EntityTagger(
         self.trie, self.tokenizer, regex_entities=self.regex_entities)
     # Some phrases appear twice, once per entity type; order is kept as listed.
     vocabulary = (
         ("play", "PlayVerb"),
         ("play", "Command"),
         ("the big bang theory", "Television Show"),
         ("all that", "Television Show"),
         ("all that", "Radio Station"),
         ("the big", "Not a Thing"),
         ("barenaked ladies", "Radio Station"),
         ("show", "Command"),
         ("what", "Question"),
     )
     for phrase, entity_type in vocabulary:
         self.trie.insert(phrase, (phrase, entity_type))
     self.parser = Parser(self.tokenizer, self.tagger)
     builder = IntentBuilder("Test Intent")
     self.intent = (builder
                    .require("PlayVerb")
                    .one_of("Television Show", "Radio Station")
                    .build())
Beispiel #21
0
    def test_remove(self):
        # Removing one key leaves the other key gatherable.
        trie = Trie(max_edit_distance=2)
        for digit in ("1", "2"):
            trie.insert(digit, "Number")
        trie.remove("2")

        kept = list(trie.gather("1"))
        dropped = list(trie.gather("2"))
        assert len(kept) == 1  # One match found
        assert len(dropped) == 0  # Zero matches since removed
Beispiel #22
0
 def test_gather(self):
     # Only the full key is reported when gathering "restaurant".
     trie = Trie()
     for word in ("rest", "restaurant"):
         trie.insert(word)
     gathered = list(trie.gather("restaurant"))
     assert len(gathered) == 1
     first = gathered[0]
     assert first.get('key') == "restaurant"
Beispiel #23
0
class EntityTaggerTest(unittest.TestCase):
    """Checks for EntityTagger over a three-entry trie."""

    def setUp(self):
        """Create the trie-backed tagger used by the tests."""
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        vocabulary = (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
        )
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, entity_type)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_tag(self):
        # Three tags are expected for this utterance.
        utterance = "play season 1 of the big bang theory"
        tags = list(self.tagger.tag(utterance))
        assert len(tags) == 3

    def test_regex_tag(self):
        # The regex contributes one 'big bang' tag carrying a (value, 'Event') pair.
        pattern = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[pattern])
        tags = tagger.tag("the big bang theory")
        assert len(tags) == 3
        event_tags = [candidate for candidate in tags if candidate.get('match') == 'big bang']
        assert len(event_tags) == 1
        event_entities = event_tags[0].get('entities')
        assert len(event_entities) == 1
        assert len(event_entities[0].get('data')) == 1
        assert ('big bang', 'Event') in event_entities[0].get('data')

    def test_start_end_token_match_when_sorting_tagged_entities(self):
        # One tag per line; several tags share the same start and end token.
        repro_payload = [
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "SnoozeTime"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 4, "key": "20 minutes", "entities": [{"key": "20 minutes", "data": [["20 minutes", "SnoozeTime"]], "confidence": 0.5, "match": "20 minutes"}], "start_token": 3, "match": "20 minutes"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 0, "key": "snooze", "entities": [{"key": "snooze", "data": [["snooze", "SnoozeKeyword"]], "confidence": 1.0, "match": "snooze"}], "start_token": 0, "match": "snooze"},
            {"end_token": 2, "key": "for", "entities": [{"key": "for", "data": [["for", "SnoozeFiller"]], "confidence": 1.0, "match": "for"}], "start_token": 2, "match": "for"},
        ]
        # just asserting that the sort does not crash in py3
        self.tagger._sort_and_merge_tags(repro_payload)
Beispiel #24
0
class EntityTaggerTest(unittest.TestCase):
    """Checks for EntityTagger over a three-entry trie."""

    def setUp(self):
        """Create the trie-backed tagger used by the tests."""
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        vocabulary = (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
        )
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, entity_type)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_tag(self):
        # Three tags are expected for this utterance.
        utterance = "play season 1 of the big bang theory"
        tags = list(self.tagger.tag(utterance))
        assert len(tags) == 3

    def test_regex_tag(self):
        # The regex contributes one 'big bang' tag whose data holds 'Event'.
        pattern = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[pattern])
        tags = tagger.tag("the big bang theory")
        assert len(tags) == 3
        event_tags = [candidate for candidate in tags if candidate.get('match') == 'big bang']
        assert len(event_tags) == 1
        event_entities = event_tags[0].get('entities')
        assert len(event_entities) == 1
        assert len(event_entities[0].get('data')) == 1
        assert 'Event' in event_entities[0].get('data')
Beispiel #25
0
    def __init__(self, tokenizer=None, trie=None):
        """
        Set up an IntentDeterminationEngine.

        Args:
            tokenizer(tokenizer): used to break up spoken text; an
                EnglishTokenizer() is created when a falsy value is given
            trie(Trie): trie of matches to entities; a Trie() is created
                when a falsy value is given
        """
        self.tokenizer = tokenizer if tokenizer else EnglishTokenizer()
        self.trie = trie if trie else Trie()
        # Registration state starts out empty.
        self.intent_parsers = []
        self._regex_strings = set()
        self.regular_expressions_entities = []
Beispiel #26
0
 def setUp(self):
     """Fill a fuzzy trie (edit distance 2) and wrap it in a tagger."""
     self.tokenizer = EnglishTokenizer()
     self.trie = Trie(max_edit_distance=2)
     # Insertion order is kept exactly as listed.
     vocabulary = (
         ("x-play", "Television Show"),
         ("play", "Play Verb"),
         ("play season", "Time Period"),
         ("play", "Player Control"),
         ("season", "Season Prefix"),
         ("1", "Number"),
         ("the big bang theory", "Television Show"),
         ("the big", "Television Show"),
         ("big bang", "event"),
         ("bang theory", "Scientific Theory"),
     )
     for phrase, entity_type in vocabulary:
         self.trie.insert(phrase, entity_type)
     self.tagger = EntityTagger(self.trie, self.tokenizer)
Beispiel #27
0
 def test_edit_distance(self):
     # With max_edit_distance=1, dropping one character still matches;
     # dropping one at each end does not.
     trie = Trie(max_edit_distance=1)
     trie.insert("restaurant")
     expectations = (
         ("restauran", 1),
         ("estaurant", 1),
         ("estauran", 0),
     )
     for query, expected_count in expectations:
         results = list(trie.lookup(query))
         assert len(results) == expected_count
Beispiel #28
0
    def test_simple_remove(self):
        # Removing a key without naming a payload makes lookups come back empty.
        trie = Trie()
        trie.insert("1", "Number")
        before = list(trie.lookup("1"))
        assert len(before) == 1
        assert len(before[0].get('data')) == 1

        assert trie.remove("1")
        after = list(trie.lookup("1"))
        assert len(after) == 0
Beispiel #29
0
    def test_intent_with_regex_entity(self):
        # A regex-captured entity and a trie entity together satisfy one intent.
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, self.tokenizer, self.regex_entities)
        self.parser = Parser(self.tokenizer, self.tagger)
        self.trie.insert("theory", ("theory", "Concept"))
        self.regex_entities.append(re.compile(r"the (?P<Event>.*)"))
        intent = (IntentBuilder("mock intent")
                  .require("Event")
                  .require("Concept")
                  .build())

        for parsed in self.parser.parse("the big bang theory"):
            tags = parsed.get('tags')
            confidence = parsed.get('confidence')
            result_intent = intent.validate(tags, confidence)
            assert result_intent.get('confidence') > 0.0
            assert result_intent.get('Event') == 'big bang'
            assert result_intent.get('Concept') == "theory"
Beispiel #30
0
 def test_missing_entities(self):
     # Known entity names report nothing missing; unknown names are listed.
     trie = Trie()
     for phrase, entity_type in (
             ("restaurant", "Concept"),
             ("rest", "Concept"),
             ("restaurant2", "Fast"),
     ):
         trie.insert(phrase, entity_type)
     gathered = list(trie.gather("restaurant"))
     assert len(gathered) == 1
     assert trie.root.entities() == ['root', 'Concept', 'Fast']
     # Single names and a list made only of known names.
     for known in ("Concept", "root", "Fast", ['root', 'Concept', 'Fast']):
         assert trie.checkForMissingEntites(known) is None
     # The unknown name comes back for list and tuple input alike.
     with_unknown_list = ['root2', 'Concept', 'Fast']
     assert trie.checkForMissingEntites(with_unknown_list) == ["root2"]
     with_unknown_tuple = ('root2', 'Concept', 'Fast')
     assert trie.checkForMissingEntites(with_unknown_tuple) == ["root2"]
Beispiel #31
0
    def tag(self, utterance):
        """
        Tag known entities within the utterance.

        :param utterance: a string of natural language text

        :return: a list of dictionaries, each with the following keys

        match: str - the proper entity matched

        key: str - the string that was matched to the entity

        start_token: int - 0-based index of the first token matched

        end_token: int - 0-based index of the last token matched

        entities: list - the trie results behind this tag; tags built
        directly from the trie hold exactly one result
        """
        tokens = self.tokenizer.tokenize(utterance)
        entities = []
        if self.regex_entities:
            for part, idx in self._iterate_subsequences(tokens):
                # Collect the named groups captured on this subsequence in
                # a throwaway trie and tag the subsequence against it.
                local_trie = Trie()
                for regex_entity in self.regex_entities:
                    match = regex_entity.match(part)
                    groups = match.groupdict() if match else {}
                    for key, match_str in groups.items():
                        # A named group that took no part in the match is
                        # reported as None; there is no text to insert.
                        if match_str is None:
                            continue
                        local_trie.insert(match_str, key)
                sub_tagger = EntityTagger(local_trie,
                                          self.tokenizer,
                                          max_tokens=self.max_tokens)
                for sub_entity in sub_tagger.tag(part):
                    # Rebase the token indices by the subsequence's idx.
                    sub_entity['start_token'] += idx
                    sub_entity['end_token'] += idx
                    # Regex-derived entities get a fixed confidence.
                    for e in sub_entity['entities']:
                        e['confidence'] = 0.5
                    entities.append(sub_entity)
        additional_sort = len(entities) > 0

        # range() is available on both Python 2 and 3, unlike xrange().
        for i in range(len(tokens)):
            # Match the trie against the utterance suffix starting at token i.
            part = ' '.join(tokens[i:])

            for new_entity in self.trie.gather(part):
                new_entity['data'] = list(new_entity['data'])
                matched_tokens = self.tokenizer.tokenize(new_entity.get('match'))
                entities.append({
                    'match': new_entity.get('match'),
                    'key': new_entity.get('key'),
                    'start_token': i,
                    'entities': [new_entity],
                    'end_token': i + len(matched_tokens) - 1
                })

        # Sorting/merging is only requested when regex entities were found.
        if additional_sort:
            entities = self._sort_and_merge_tags(entities)

        return entities
Beispiel #32
0
 def test_basic_retrieval(self):
     # An inserted key is found exactly once by an exact lookup.
     word = "restaurant"
     trie = Trie()
     trie.insert(word)
     found = list(trie.lookup(word))
     assert len(found) == 1
Beispiel #33
0
 def test_edit_distance_no_confidence(self):
     # A one-character key yields no result for this unrelated phrase.
     fuzzy_trie = Trie(max_edit_distance=2)
     fuzzy_trie.insert("1", "Number")
     gathered = list(fuzzy_trie.gather("of the big bang theory"))
     assert len(gathered) == 0
Beispiel #34
0
 def test_insert_single_character_entity(self):
     # A one-character key at the start of the phrase is gathered with its payload.
     trie = Trie()
     trie.insert("1", "Number")
     gathered = list(trie.gather("1 of the big bang theory"))
     assert len(gathered) == 1
     payload = gathered[0].get('data')
     assert len(payload) == 1
Beispiel #35
0
 def test_basic_retrieval(self):
     # Looking up the exact inserted key returns a single result.
     word = "restaurant"
     trie = Trie()
     trie.insert(word)
     found = list(trie.lookup(word))
     assert len(found) == 1
Beispiel #36
0
 def test_retrieval_of_multi_word_entity(self):
     # Neither key starts the gathered phrase, so nothing is reported.
     trie = Trie()
     for phrase, entity_type in (
             ("play", "PlayVerb"),
             ("the big bang theory", "Television Series"),
     ):
         trie.insert(phrase, entity_type)
     gathered = list(trie.gather("1 of the big bang theory"))
     assert len(gathered) == 0
Beispiel #37
0
    def parse(self, utterance, context=None, N=1):
        """
        Tag the utterance and lazily yield candidate parses for it.

        A "tagged_entities" event carrying the utterance, its tags and the
        tagging time is emitted before the first parse is produced.

        :param utterance: the text to parse; it is lower-cased for tagging
        :param context: an optional list of entity dicts. Each one supplies
            a 'data' list whose first item is an
            (entity_value, entity_type, metadata) triple, plus a
            'confidence' used as the insertion weight. The list itself is
            left untouched.
        :param N: the generator stops once this many parses were yielded
            (the check runs after each yield)
        :return: a generator of dicts with the keys 'utterance', 'tags',
            'time' and 'confidence'
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # Insert in ascending confidence order, so the highest
            # confidence for an entity is inserted last (see comment on
            # TrieNode ctor). sorted() orders a copy, which keeps the
            # caller's list in its original order.
            ordered_context = sorted(context,
                                     key=lambda x: x.get('confidence'))

            context_trie = Trie()
            for entity in ordered_context:
                entity_value, entity_type, metadata = entity.get('data')[0]
                context_trie.insert(entity_value.lower(),
                                    data=(entity_value, entity_type, metadata),
                                    weight=entity.get('confidence'))

        # Materialize the tags once: they feed both the event below and
        # the expander, and a one-shot iterable would be drained by the
        # first of the two.
        tagged = list(
            self._tagger.tag(utterance.lower(), context_trie=context_trie))
        self.emit(
            "tagged_entities", {
                'utterance': utterance,
                'tags': list(tagged),
                'time': time.time() - start
            })
        start = time.time()
        bke = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Sum, over the clique, of the first entity's confidence scaled
            # by its match length relative to len(utterance) + 1.
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                score += ec * len(
                    tagged_entity.get('entities', [{
                        'match': ''
                    }])[0].get('match')) / (len(utterance) + 1)
            return score

        parse_results = bke.expand(tagged, clique_scoring_func=score_clique)
        count = 0
        for result in parse_results:
            count += 1
            # Confidence of a parse: sum of each tag's first-entity
            # confidence scaled by match length over utterance length.
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                entity_confidence = sample_entity.get(
                    'confidence', 0.0) * float(len(
                        sample_entity.get('match'))) / len(utterance)
                parse_confidence += entity_confidence
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            if count >= N:
                break
Beispiel #38
0
class IntentTest(unittest.TestCase):
    """Intent validation checks run against a small entity vocabulary."""

    def setUp(self):
        """Create the trie, tagger and parser shared by the tests."""
        self.trie = Trie()
        self.tokenizer = EnglishTokenizer()
        self.regex_entities = []
        self.tagger = EntityTagger(self.trie,
                                   self.tokenizer,
                                   regex_entities=self.regex_entities)
        vocabulary = (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
            ("barenaked ladies", "Radio Station"),
            ("show", "Command"),
            ("what", "Question"),
        )
        # Each payload repeats the phrase next to its entity type.
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, (phrase, entity_type))
        self.parser = Parser(self.tokenizer, self.tagger)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def _validated_intents(self, intent, utterance):
        """Yield intent.validate(tags, confidence) for each parse of utterance."""
        for parsed in self.parser.parse(utterance):
            yield intent.validate(parsed.get('tags'),
                                  parsed.get('confidence'))

    def test_basic_intent(self):
        # Two required entities are both resolved from one utterance.
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb")
                  .require("Television Show")
                  .build())
        for found in self._validated_intents(intent,
                                             "play the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('PlayVerb') == 'play'
            assert found.get('Television Show') == "the big bang theory"

    def test_at_least_one(self):
        # A required entity plus one of two alternatives.
        intent = (IntentBuilder("play intent")
                  .require("PlayVerb")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for found in self._validated_intents(intent,
                                             "play the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('PlayVerb') == 'play'
            assert found.get('Television Show') == "the big bang theory"

        for found in self._validated_intents(intent,
                                             "play the barenaked ladies"):
            assert found.get('confidence') > 0.0
            assert found.get('PlayVerb') == 'play'
            assert found.get('Radio Station') == "barenaked ladies"

    def test_at_least_on_no_required(self):
        # One of two alternatives, with no required entity at all.
        intent = (IntentBuilder("play intent")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for found in self._validated_intents(intent,
                                             "play the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('Television Show') == "the big bang theory"

        for found in self._validated_intents(intent,
                                             "play the barenaked ladies"):
            assert found.get('confidence') > 0.0
            assert found.get('Radio Station') == "barenaked ladies"

    def test_at_least_one_alone(self):
        # A single-word utterance matching one of the alternatives.
        intent = (IntentBuilder("OptionsForLunch")
                  .one_of("Question", "Command")
                  .build())

        for found in self._validated_intents(intent, "show"):
            assert found.get('confidence') > 0.0
            assert found.get('Command') == "show"

    def test_basic_intent_with_alternate_names(self):
        # Results are reported under the alternate attribute names.
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for found in self._validated_intents(intent,
                                             "play the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('Play Verb') == 'play'
            assert found.get('series') == "the big bang theory"

    def test_intent_with_regex_entity(self):
        # A regex-captured entity and a trie entity together satisfy one intent.
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, self.tokenizer,
                                   self.regex_entities)
        self.parser = Parser(self.tokenizer, self.tagger)
        self.trie.insert("theory", ("theory", "Concept"))
        self.regex_entities.append(re.compile(r"the (?P<Event>.*)"))
        intent = (IntentBuilder("mock intent")
                  .require("Event")
                  .require("Concept")
                  .build())

        for found in self._validated_intents(intent, "the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('Event') == 'big bang'
            assert found.get('Concept') == "theory"

    def test_intent_using_alias(self):
        # "big bang" is registered as another key for the full show title.
        self.trie.insert("big bang",
                         ("the big bang theory", "Television Show"))
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for found in self._validated_intents(intent,
                                             "play the big bang theory"):
            assert found.get('confidence') > 0.0
            assert found.get('Play Verb') == 'play'
            assert found.get('series') == "the big bang theory"
Beispiel #39
0
try with the following:
PYTHONPATH=. python examples/multi_intent_parser.py "what's the weather like in tokyo"
PYTHONPATH=. python examples/multi_intent_parser.py "play some music by the clash"
"""

import json
import sys
from adapt.entity_tagger import EntityTagger
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie
from adapt.intent import IntentBuilder
from adapt.parser import Parser
from adapt.engine import DomainIntentDeterminationEngine

# Stand-alone tokenizer/trie/tagger/parser instances. They are not referenced
# again in the portion of the script visible here.
tokenizer = EnglishTokenizer()
trie = Trie()
tagger = EntityTagger(trie, tokenizer)
parser = Parser(tokenizer, tagger)

# The engine is what the rest of this setup registers domains and entities on.
engine = DomainIntentDeterminationEngine()

# Two named domains; entities below are attached to one of them explicitly.
engine.register_domain('Domain1')
engine.register_domain('Domain2')

# define vocabulary
weather_keyword = ["weather"]

# Register every weather keyword as a "WeatherKeyword" entity in Domain1.
for wk in weather_keyword:
    engine.register_entity(wk, "WeatherKeyword", domain='Domain1')

# Weather type words; presumably registered further down the script -- the
# registration itself is not visible here.
weather_types = ["snow", "rain", "wind", "sleet", "sun"]
Beispiel #40
0
 def test_edit_distance_confidence(self):
     """Gather from a trie built with max_edit_distance=2 and check the matches."""
     trie = Trie(max_edit_distance=2)
     for word in ("a", "bb", "ccc", "dddd", "100"):
         trie.insert(word)
     single_char_matches = list(trie.gather("b"))
     assert len(single_char_matches) == 1
     assert single_char_matches[0].get('confidence') == 0.5
     phrase_matches = list(trie.gather("1 of"))
     assert len(phrase_matches) == 3
Beispiel #41
0
 def test_edit_distance_confidence(self):
     """Check match count and confidence when gathering with edit distance 2."""
     words = ["a", "bb", "ccc", "dddd", "100"]
     trie = Trie(max_edit_distance=2)
     for entry in words:
         trie.insert(entry)

     # Gathering "b" is expected to give one match with confidence 0.5.
     found = list(trie.gather("b"))
     assert len(found) == 1
     assert found[0].get('confidence') == 0.5

     # Gathering "1 of" is expected to give three matches.
     found = list(trie.gather("1 of"))
     assert len(found) == 3
Beispiel #42
0
 def test_edit_distance_no_confidence(self):
     """Text that does not contain the entity "1" yields no matches."""
     trie = Trie(max_edit_distance=2)
     trie.insert("1", "Number")
     gathered = list(trie.gather("of the big bang theory"))
     assert not gathered
Beispiel #43
0
class BronKerboschExpanderTest(unittest.TestCase):
    """Exercise BronKerboschExpander.expand() and _sub_expand() on tagged input."""

    def setUp(self):
        """Build a tagger over a trie (max edit distance 2) of overlapping phrases."""
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        # Note that "play" is registered under two different entity types.
        vocabulary = (
            ("x-play", "Television Show"),
            ("play", "Play Verb"),
            ("play season", "Time Period"),
            ("play", "Player Control"),
            ("season", "Season Prefix"),
            ("1", "Number"),
            ("the big bang theory", "Television Show"),
            ("the big", "Television Show"),
            ("big bang", "event"),
            ("bang theory", "Scientific Theory"),
        )
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, entity_type)
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        """With max_edit_distance set to 0 the utterance expands to six parses."""
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        """"season 1" expands to a single parse made of two tags."""
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        Expand "play season 1 of the big bang theory" with a custom clique
        scoring function, then check the number of parses and the entity keys
        of the first parse.
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Sum, over the clique, the confidence of each tag's first entity
            # weighted by match length relative to the utterance length.
            score = 0.0
            for tagged_entity in clique:
                first = tagged_entity.get(
                    'entities', [{'confidence': 0.0, 'match': ''}])[0]
                score += first.get('confidence') * len(first.get('match')) / (
                    len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join(
            tag.get('entities')[0].get('key') for tag in parse_results[0])

        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        """Two tags on the same token with different confidence give two cliques."""
        # two tags for the same token, different confidence, should expand to two cliques
        tags = [{'end_token': 0, 'start_token': 0, 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 0.5, 'data': [u'SearchTerms'], 'match': u'spell', 'key': u'spell'}]},
                {'end_token': 0, 'start_token': 0, 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 1.0, 'data': [u'SpellingKeyword'], 'match': u'spell', 'key': u'spell'}]}]

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
Beispiel #44
0
 def setUp(self):
     """Create a tagger over a trie holding three phrases."""
     self.trie = Trie()
     self.tagger = EntityTagger(self.trie, EnglishTokenizer())
     fixture_entities = (
         ("play", "PlayVerb"),
         ("the big bang theory", "Television Show"),
         ("the big", "Not a Thing"),
     )
     for phrase, entity_type in fixture_entities:
         self.trie.insert(phrase, entity_type)
Beispiel #45
0
    def tag(self, utterance, context_trie=None):
        """
        Tag known entities within the utterance.
        Args:
            utterance(str): a string of natural language text
            context_trie(trie): optional, a trie containing only entities from context
                for this request

        Returns: a list of tags. When at least one regex or trie match was
            found, the list (plus any context matches) is passed through
            self._sort_and_merge_tags first; otherwise an empty list is
            returned. Each tag built here is a dictionary with the keys
            match(str): the proper entity matched
            key(str): the string that was matched to the entity
            start_token(int): 0-based index of the first token matched
            end_token(int): 0-based index of the last token matched
            entities(list): the entity record(s) gathered for this match
            from_context(bool): True when the match came from context_trie
        """
        tokens = self.tokenizer.tokenize(utterance)

        def build_tag(entity, start, from_context):
            # The trie hands back 'data' as some iterable; tags carry a list.
            entity['data'] = list(entity['data'])
            # The last token index is found by re-tokenizing the matched text.
            token_count = len(self.tokenizer.tokenize(entity.get('match')))
            return {
                'match': entity.get('match'),
                'key': entity.get('key'),
                'start_token': start,
                'entities': [entity],
                'end_token': start + token_count - 1,
                'from_context': from_context
            }

        entities = []
        if self.regex_entities:
            for part, idx in self._iterate_subsequences(tokens):
                # Each subsequence gets a throw-away trie holding whatever the
                # regexes captured from it, tagged by a nested EntityTagger.
                local_trie = Trie()
                for regex_entity in self.regex_entities:
                    match = regex_entity.match(part)
                    if not match:
                        continue
                    for key, match_str in match.groupdict().items():
                        # groupdict() reports None for a named group that did
                        # not take part in the match; there is nothing to tag.
                        if match_str is None:
                            continue
                        local_trie.insert(match_str, (match_str, key))
                sub_tagger = EntityTagger(local_trie, self.tokenizer, max_tokens=self.max_tokens)
                for sub_entity in sub_tagger.tag(part):
                    # Shift token offsets from subsequence-relative to
                    # utterance-relative.
                    sub_entity['start_token'] += idx
                    sub_entity['end_token'] += idx
                    for e in sub_entity['entities']:
                        e['confidence'] = 0.5
                    entities.append(sub_entity)

        context_entities = []
        # range() instead of xrange(): works on Python 2 and 3 alike.
        for i in range(len(tokens)):
            part = ' '.join(tokens[i:])

            for new_entity in self.trie.gather(part):
                entities.append(build_tag(new_entity, i, False))

            if context_trie:
                for new_entity in context_trie.gather(part):
                    new_entity['confidence'] *= 2.0  # context entities get double the weight!
                    context_entities.append(build_tag(new_entity, i, True))

        # Context matches are only merged in when a regex or trie match exists;
        # with none, the result stays empty.
        if entities:
            entities = self._sort_and_merge_tags(entities + context_entities)

        return entities
Beispiel #46
0
 def test_retrieval_of_multi_word_entity(self):
     """Gathering text that starts with neither inserted phrase finds nothing."""
     trie = Trie()
     for phrase, entity_type in (("play", "PlayVerb"),
                                 ("the big bang theory", "Television Series")):
         trie.insert(phrase, entity_type)
     found = list(trie.gather("1 of the big bang theory"))
     assert not found
Beispiel #47
0
 def test_insert_single_character_entity(self):
     """A one-character entity is found at the start of a longer phrase."""
     trie = Trie()
     trie.insert("1", "Number")
     found = list(trie.gather("1 of the big bang theory"))
     assert len(found) == 1
     only_match = found[0]
     assert len(only_match.get('data')) == 1
Beispiel #48
0
class IntentTest(unittest.TestCase):
    """Intent validation tests run against a small trie-backed parser."""

    def setUp(self):
        """Create the trie, tokenizer, tagger and parser used by the tests."""
        self.trie = Trie()
        self.tokenizer = EnglishTokenizer()
        self.regex_entities = []
        self.tagger = EntityTagger(
            self.trie, self.tokenizer, regex_entities=self.regex_entities)
        vocabulary = (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
            ("barenaked ladies", "Radio Station"),
            ("show", "Command"),
            ("what", "Question"),
        )
        # Every entry stores (phrase, entity type) as its data.
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, (phrase, entity_type))
        self.parser = Parser(self.tokenizer, self.tagger)

    def tearDown(self):
        """Nothing to clean up."""

    def _validated(self, intent, utterance):
        """Yield intent.validate(tags, confidence) for each parse of utterance."""
        for parsed in self.parser.parse(utterance):
            yield intent.validate(parsed.get('tags'), parsed.get('confidence'))

    def test_basic_intent(self):
        """Both required entities are resolved from the utterance."""
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb")
                  .require("Television Show")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Television Show') == "the big bang theory"

    def test_at_least_one(self):
        """A required entity combined with a one_of group."""
        intent = (IntentBuilder("play intent")
                  .require("PlayVerb")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Television Show') == "the big bang theory"

        for validated in self._validated(intent, "play the barenaked ladies"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Radio Station') == "barenaked ladies"

    def test_at_least_on_no_required(self):
        """A one_of group without any required entity."""
        intent = (IntentBuilder("play intent")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Television Show') == "the big bang theory"

        for validated in self._validated(intent, "play the barenaked ladies"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Radio Station') == "barenaked ladies"

    def test_at_least_one_alone(self):
        """A one_of group matched by a single-word utterance."""
        intent = (IntentBuilder("OptionsForLunch")
                  .one_of("Question", "Command")
                  .build())

        for validated in self._validated(intent, "show"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Command') == "show"

    def test_basic_intent_with_alternate_names(self):
        """Required entities are reported under their attribute names."""
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Play Verb') == 'play'
            assert validated.get('series') == "the big bang theory"

    def test_intent_with_regex_entity(self):
        """An entity captured by a regex named group satisfies a requirement."""
        # This test swaps in a fresh trie, tagger and parser of its own.
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, self.tokenizer, self.regex_entities)
        self.parser = Parser(self.tokenizer, self.tagger)
        self.trie.insert("theory", ("theory", "Concept"))
        event_pattern = re.compile(r"the (?P<Event>.*)")
        self.regex_entities.append(event_pattern)
        intent = (IntentBuilder("mock intent")
                  .require("Event")
                  .require("Concept")
                  .build())

        for validated in self._validated(intent, "the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Event') == 'big bang'
            assert validated.get('Concept') == "theory"

    def test_intent_using_alias(self):
        """The key "big bang" carries the data of "the big bang theory"."""
        self.trie.insert("big bang", ("the big bang theory", "Television Show"))
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Play Verb') == 'play'
            assert validated.get('series') == "the big bang theory"
Beispiel #49
0
class BronKerboschExpanderTest(unittest.TestCase):
    """Exercise BronKerboschExpander.expand() and _sub_expand() on tagged input."""

    def setUp(self):
        """Build a tagger over a trie (max edit distance 2) of overlapping phrases."""
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        # Note that "play" is registered under two different entity types.
        vocabulary = (
            ("x-play", "Television Show"),
            ("play", "Play Verb"),
            ("play season", "Time Period"),
            ("play", "Player Control"),
            ("season", "Season Prefix"),
            ("1", "Number"),
            ("the big bang theory", "Television Show"),
            ("the big", "Television Show"),
            ("big bang", "event"),
            ("bang theory", "Scientific Theory"),
        )
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, entity_type)
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        """With max_edit_distance set to 0 the utterance expands to six parses."""
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        """"season 1" expands to a single parse made of two tags."""
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        Expand "play season 1 of the big bang theory" with a custom clique
        scoring function, then check the number of parses and the entity keys
        of the first parse.
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Sum, over the clique, the confidence of each tag's first entity
            # weighted by match length relative to the utterance length.
            score = 0.0
            for tagged_entity in clique:
                first = tagged_entity.get(
                    'entities', [{'confidence': 0.0, 'match': ''}])[0]
                score += first.get('confidence') * len(first.get('match')) / (
                    len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join(
            tag.get('entities')[0].get('key') for tag in parse_results[0])

        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        """Two tags on the same token with different confidence give two cliques."""
        # two tags for the same token, different confidence, should expand to two cliques
        tags = [
            {
                'end_token': 0,
                'start_token': 0,
                'key': u'spell',
                'match': u'spell',
                'entities': [{
                    'confidence': 0.5,
                    'data': [u'SearchTerms'],
                    'match': u'spell',
                    'key': u'spell',
                }],
            },
            {
                'end_token': 0,
                'start_token': 0,
                'key': u'spell',
                'match': u'spell',
                'entities': [{
                    'confidence': 1.0,
                    'data': [u'SpellingKeyword'],
                    'match': u'spell',
                    'key': u'spell',
                }],
            },
        ]

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
Beispiel #50
0
class IntentTest(unittest.TestCase):
    """Intent validation and one_of resolution tests on a small parser fixture."""

    def setUp(self):
        """Create the trie, tokenizer, tagger and parser used by the tests."""
        self.trie = Trie()
        self.tokenizer = EnglishTokenizer()
        self.regex_entities = []
        self.tagger = EntityTagger(
            self.trie, self.tokenizer, regex_entities=self.regex_entities)
        vocabulary = (
            ("play", "PlayVerb"),
            ("stop", "StopVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
            ("barenaked ladies", "Radio Station"),
            ("show", "Command"),
            ("what", "Question"),
        )
        # Every entry stores (phrase, entity type) as its data.
        for phrase, entity_type in vocabulary:
            self.trie.insert(phrase, (phrase, entity_type))
        self.parser = Parser(self.tokenizer, self.tagger)

    def tearDown(self):
        """Nothing to clean up."""

    def _validated(self, intent, utterance):
        """Yield intent.validate(tags, confidence) for each parse of utterance."""
        for parsed in self.parser.parse(utterance):
            yield intent.validate(parsed.get('tags'), parsed.get('confidence'))

    def test_basic_intent(self):
        """Both required entities are resolved from the utterance."""
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb")
                  .require("Television Show")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Television Show') == "the big bang theory"

    def test_at_least_one(self):
        """A required entity combined with a one_of group."""
        intent = (IntentBuilder("play intent")
                  .require("PlayVerb")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Television Show') == "the big bang theory"

        for validated in self._validated(intent, "play the barenaked ladies"):
            assert validated.get('confidence') > 0.0
            assert validated.get('PlayVerb') == 'play'
            assert validated.get('Radio Station') == "barenaked ladies"

    def test_at_least_one_with_tag_in_multiple_slots(self):
        """One entity type ("temperature") appears in two one_of groups."""
        for phrase in ("temperature", "living room", "what is"):
            self.trie.insert(phrase, (phrase, phrase))

        intent = (IntentBuilder("test intent")
                  .one_of("what is")
                  .one_of("temperature", "living room")
                  .one_of("temperature")
                  .build())

        utterance = "what is the temperature in the living room"
        for validated in self._validated(intent, utterance):
            assert validated.get("confidence") > 0.0
            assert validated.get("temperature") == "temperature"
            assert validated.get("living room") == "living room"
            assert validated.get("what is") == "what is"

    def test_at_least_on_no_required(self):
        """A one_of group without any required entity."""
        intent = (IntentBuilder("play intent")
                  .one_of("Television Show", "Radio Station")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Television Show') == "the big bang theory"

        for validated in self._validated(intent, "play the barenaked ladies"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Radio Station') == "barenaked ladies"

    def test_at_least_one_alone(self):
        """A one_of group matched by a single-word utterance."""
        intent = (IntentBuilder("OptionsForLunch")
                  .one_of("Question", "Command")
                  .build())

        for validated in self._validated(intent, "show"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Command') == "show"

    def test_basic_intent_with_alternate_names(self):
        """Required entities are reported under their attribute names."""
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Play Verb') == 'play'
            assert validated.get('series') == "the big bang theory"

    def test_intent_with_regex_entity(self):
        """An entity captured by a regex named group satisfies a requirement."""
        # This test swaps in a fresh trie, tagger and parser of its own.
        self.trie = Trie()
        self.tagger = EntityTagger(
            self.trie, self.tokenizer, self.regex_entities)
        self.parser = Parser(self.tokenizer, self.tagger)
        self.trie.insert("theory", ("theory", "Concept"))
        event_pattern = re.compile(r"the (?P<Event>.*)")
        self.regex_entities.append(event_pattern)
        intent = (IntentBuilder("mock intent")
                  .require("Event")
                  .require("Concept")
                  .build())

        for validated in self._validated(intent, "the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Event') == 'big bang'
            assert validated.get('Concept') == "theory"

    def test_intent_using_alias(self):
        """The key "big bang" carries the data of "the big bang theory"."""
        self.trie.insert("big bang",
                         ("the big bang theory", "Television Show"))
        intent = (IntentBuilder("play television intent")
                  .require("PlayVerb", "Play Verb")
                  .require("Television Show", "series")
                  .build())
        for validated in self._validated(intent, "play the big bang theory"):
            assert validated.get('confidence') > 0.0
            assert validated.get('Play Verb') == 'play'
            assert validated.get('series') == "the big bang theory"

    def test_resolve_one_of(self):
        """resolve_one_of maps each chosen entity type to the tag that supplies it."""

        # Each factory returns a brand-new structure on every call, so the
        # input tags and the expected result never share objects.
        def what_is_tag():
            return {
                "confidence": 1.0,
                "end_token": 1,
                "entities": [{
                    "confidence": 1.0,
                    "data": [["what is", "skill_iot_controlINFORMATION_QUERY"]],
                    "key": "what is",
                    "match": "what is",
                }],
                "from_context": False,
                "key": "what is",
                "match": "what is",
                "start_token": 0,
            }

        def temperature_tag():
            # This tag carries no top-level "confidence" entry.
            return {
                "end_token": 3,
                "entities": [{
                    "confidence": 1.0,
                    "data": [["temperature", "skill_weatherTemperature"],
                             ["temperature", "skill_iot_controlTEMPERATURE"]],
                    "key": "temperature",
                    "match": "temperature",
                }],
                "from_context": False,
                "key": "temperature",
                "match": "temperature",
                "start_token": 3,
            }

        def living_room_tag():
            return {
                "confidence": 1.0,
                "end_token": 7,
                "entities": [{
                    "confidence": 1.0,
                    "data": [["living room", "skill_iot_controlENTITY"]],
                    "key": "living room",
                    "match": "living room",
                }],
                "from_context": False,
                "key": "living room",
                "match": "living room",
                "start_token": 6,
            }

        tags = [what_is_tag(), temperature_tag(), living_room_tag()]

        at_least_one = [
            ["skill_iot_controlINFORMATION_QUERY"],
            ["skill_iot_controlTEMPERATURE", "skill_iot_controlENTITY"],
            ["skill_iot_controlTEMPERATURE"],
        ]

        result = {
            "skill_iot_controlENTITY": [living_room_tag()],
            "skill_iot_controlINFORMATION_QUERY": [what_is_tag()],
            "skill_iot_controlTEMPERATURE": [temperature_tag()],
        }

        assert resolve_one_of(tags, at_least_one) == result
Beispiel #51
0
    def parse(self, utterance, context=None, N=1):
        """Used to find tags within utterance with a given confidence

        Args:
            utterance(str): conversational piece given by the user
            context(list): a list of entities; it is read but not modified
            N(int): number of results
        Returns: yield an object with the following fields
            utterance(str): the value passed in
            tags(list) : a list of tags found in utterance
            time(float) : seconds elapsed since expansion of the tags began
            confidence(float) : float indicating how confident of a match to the
                utterance. This might be used to determine the most likely intent.

        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # sort by confidence in ascending order, so
            # highest confidence for an entity is last.
            # see comment on TrieNode ctor
            # sorted() works on a copy, leaving the caller's list order intact.
            ordered_context = sorted(context, key=lambda x: x.get('confidence'))

            context_trie = Trie()
            for entity in ordered_context:
                entity_value, entity_type = entity.get('data')[0]
                context_trie.insert(entity_value.lower(),
                                    data=(entity_value, entity_type),
                                    weight=entity.get('confidence'))

        tagged = self._tagger.tag(utterance.lower(), context_trie=context_trie)
        self.emit("tagged_entities",
                  {
                      'utterance': utterance,
                      'tags': list(tagged),
                      'time': time.time() - start
                  })
        # Restart the clock: the 'time' reported per result covers expansion only.
        start = time.time()
        bke = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Sum, over the clique, the confidence of each tag's first entity
            # weighted by match length relative to the utterance length.
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(tagged_entity.get('entities', [{'match': ''}])[0].get('match')) / (
                    len(utterance) + 1)
            return score

        parse_results = bke.expand(tagged, clique_scoring_func=score_clique)
        for count, result in enumerate(parse_results, 1):
            parse_confidence = 0.0
            for tag in result:
                # Only the first entity of each tag contributes to the confidence.
                sample_entity = tag['entities'][0]
                entity_confidence = sample_entity.get('confidence', 0.0) * float(
                    len(sample_entity.get('match'))) / len(utterance)
                parse_confidence += entity_confidence
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            # The limit is checked after yielding, so at least one result is
            # produced whenever the expander returns any.
            if count >= N:
                break