Example #1
0
    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory play season one of the big bang theory
        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Each member contributes its confidence weighted by the share of
            # the utterance its match covers (+1 avoids a zero denominator).
            denominator = len(utterance) + 1
            total = 0.0
            for member in clique:
                confidence = member.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                match_len = len(member.get('entities', [{
                    'match': ''
                }])[0].get('match'))
                total += confidence * match_len / denominator
            return total

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        best = parse_results[0]
        result_text = ' '.join(
            tag.get('entities')[0].get('key') for tag in best)
        # Evaluated as a smoke check on the tag data shape; not asserted on.
        result_parse = ', '.join(
            tag.get('entities')[0].get('data')[0][1] for tag in best)

        assert result_text == 'play season 1 the big bang theory'
Example #2
0
    def parse(self, utterance, context=None, N=1):
        """Tag an utterance and yield up to N scored parse results.

        :param utterance: text to parse
        :param context: a list of entities
        :param N: maximum number of results to yield
        :return: generator of dicts with utterance, tags, time and confidence
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # sort by confidence in ascending order, so
            # highest confidence for an entity is last.
            # see comment on TrieNode ctor
            context.sort(key=lambda entry: entry.get('confidence'))

            context_trie = Trie()
            for entity in context:
                value, kind = entity.get('data')[0]
                context_trie.insert(value.lower(),
                                    data=(value, kind),
                                    weight=entity.get('confidence'))

        tagged = self._tagger.tag(utterance.lower(), context_trie=context_trie)
        self.emit("tagged_entities", {
            'utterance': utterance,
            'tags': list(tagged),
            'time': time.time() - start
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # confidence weighted by match coverage of the utterance
            denominator = len(utterance) + 1
            total = 0.0
            for member in clique:
                confidence = member.get('entities', [{'confidence': 0.0}])[0].get('confidence')
                match_len = len(member.get('entities', [{'match': ''}])[0].get('match'))
                total += confidence * match_len / denominator
            return total

        parse_results = expander.expand(tagged, clique_scoring_func=score_clique)
        for index, result in enumerate(parse_results):
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                # entity confidence scaled by how much of the utterance it matched
                parse_confidence += (sample_entity.get('confidence', 0.0) *
                                     float(len(sample_entity.get('match'))) /
                                     len(utterance))
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            if index + 1 >= N:
                break
Example #3
0
    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory play season one of the big bang theory
        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Confidence weighted by fraction of the utterance each match covers.
            norm = len(utterance) + 1
            return sum(
                (t.get('entities', [{'confidence': 0.0}])[0].get('confidence') *
                 len(t.get('entities', [{'match': ''}])[0].get('match')) / norm
                 for t in clique), 0.0)

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        top_parse = parse_results[0]
        result_text = ' '.join(
            tag.get('entities')[0].get('key') for tag in top_parse)
        # Computed as an implicit check that entity data is well-formed.
        result_parse = ', '.join(
            tag.get('entities')[0].get('data')[0][1] for tag in top_parse)

        assert result_text == 'play season 1 the big bang theory'
Example #4
0
    def parse(self, utterance, relevance_store=None, N=1):
        """Tag the utterance and yield up to N scored parse results.

        relevance_store is accepted for interface compatibility; this
        implementation does not consult it.
        """
        start = time.time()
        tagged = self._tagger.tag(utterance.lower())
        self.emit("tagged_entities", {
            "utterance": utterance,
            "tags": list(tagged),
            "time": time.time() - start,
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Sum each member's confidence weighted by the share of the
            # utterance its match covers (+1 avoids a zero denominator).
            norm = len(utterance) + 1
            return sum(
                (member.get("entities", [{"confidence": 0.0}])[0].get("confidence")
                 * len(member.get("entities", [{"match": ""}])[0].get("match"))
                 / norm
                 for member in clique), 0.0)

        emitted = 0
        for result in expander.expand(tagged, clique_scoring_func=score_clique):
            parse_confidence = 0.0
            for tag in result:
                entity = tag["entities"][0]
                # scale entity confidence by its coverage of the utterance
                parse_confidence += (entity.get("confidence", 0.0)
                                     * float(len(entity.get("match")))
                                     / len(utterance))
            yield {
                "utterance": utterance,
                "tags": result,
                "time": time.time() - start,
                "confidence": parse_confidence,
            }

            emitted += 1
            if emitted >= N:
                break
Example #5
0
    def testExpandWithRegexAndLiteralTokenMatch(self):
        """Two tags on one token with different confidences expand to two cliques."""
        def make_tag(confidence, data_value):
            # Both tags cover token 0 with the same key/match text.
            return {'end_token': 0, 'start_token': 0, 'key': u'spell',
                    'match': u'spell',
                    'entities': [{'confidence': confidence,
                                  'data': [data_value],
                                  'match': u'spell', 'key': u'spell'}]}

        tags = [make_tag(0.5, u'SearchTerms'),
                make_tag(1.0, u'SpellingKeyword')]

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
Example #6
0
    def testExpandWithRegexAndLiteralTokenMatch(self):
        # Two tags covering the same token with different confidences
        # should expand into two separate cliques.
        search_tag = {
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{'confidence': 0.5, 'data': [u'SearchTerms'],
                          'match': u'spell', 'key': u'spell'}],
        }
        keyword_tag = {
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{'confidence': 1.0, 'data': [u'SpellingKeyword'],
                          'match': u'spell', 'key': u'spell'}],
        }

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand([search_tag, keyword_tag]))
        assert len(cliques) == 2
Example #7
0
    def parse(self, utterance, relevance_store=None, N=1):
        """Yield up to N scored parse results for the utterance.

        relevance_store exists for interface compatibility and is unused
        by this implementation.
        """
        start = time.time()
        tagged = self._tagger.tag(utterance.lower())
        self.emit("tagged_entities", {
            'utterance': utterance,
            'tags': list(tagged),
            'time': time.time() - start
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Each member contributes confidence * (match length / norm),
            # where norm is the utterance length + 1 (never zero).
            norm = len(utterance) + 1
            score = 0.0
            for member in clique:
                confidence = member.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                match_text = member.get('entities', [{
                    'match': ''
                }])[0].get('match')
                score += confidence * len(match_text) / norm
            return score

        results = expander.expand(tagged, clique_scoring_func=score_clique)
        for position, result in enumerate(results, start=1):
            # Overall confidence: sum of entity confidences, each scaled by
            # the fraction of the utterance that entity matched.
            parse_confidence = sum(
                (tag['entities'][0].get('confidence', 0.0) *
                 float(len(tag['entities'][0].get('match'))) / len(utterance)
                 for tag in result), 0.0)
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            if position >= N:
                break
Example #8
0
    def parse(self, utterance, context=None, N=1):
        """Tag the utterance, optionally biased by context entities, and
        yield up to N scored parse results.

        :param utterance: text to parse
        :param context: a list of entities
        :param N: maximum number of results to yield
        :return: generator of dicts with utterance, tags, time and confidence
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # Ascending confidence sort so the highest-confidence entry for
            # an entity is inserted last (see comment on TrieNode ctor).
            context.sort(key=lambda entry: entry.get('confidence'))

            context_trie = Trie()
            for entity in context:
                value, kind, metadata = entity.get('data')[0]
                context_trie.insert(value.lower(),
                                    data=(value, kind, metadata),
                                    weight=entity.get('confidence'))

        tagged = self._tagger.tag(utterance.lower(), context_trie=context_trie)
        self.emit("tagged_entities", {
            'utterance': utterance,
            'tags': list(tagged),
            'time': time.time() - start
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # confidence weighted by how much of the utterance each match spans
            norm = len(utterance) + 1
            total = 0.0
            for member in clique:
                confidence = member.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                match_len = len(member.get('entities', [{
                    'match': ''
                }])[0].get('match'))
                total += confidence * match_len / norm
            return total

        produced = 0
        for result in expander.expand(tagged, clique_scoring_func=score_clique):
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                # entity confidence scaled by its coverage of the utterance
                parse_confidence += (sample_entity.get('confidence', 0.0) *
                                     float(len(sample_entity.get('match'))) /
                                     len(utterance))
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            produced += 1
            if produced >= N:
                break
Example #9
0
 def testExpandedResult(self):
     """Expanding the tags for "season 1" yields one parse of two tags."""
     expander = BronKerboschExpander(self.tokenizer)
     tagged = self.tagger.tag("season 1")
     parse_results = list(expander.expand(tagged))
     assert len(parse_results) == 1
     assert len(parse_results[0]) == 2
Example #10
0
 def testExpander(self):
     """With fuzzy matching disabled, the sample utterance expands to six parses."""
     # Disable edit-distance (fuzzy) matching so tagging is exact.
     self.tagger.trie.max_edit_distance = 0
     tagged = self.tagger.tag("play season 1 of the big bang theory")
     expander = BronKerboschExpander(self.tokenizer)
     assert len(list(expander.expand(tagged))) == 6
Example #11
0
 def testExpandedResult(self):
     # "season 1" should expand to exactly one parse containing two tags.
     tagged = self.tagger.tag("season 1")
     results = list(BronKerboschExpander(self.tokenizer).expand(tagged))
     assert len(results) == 1
     assert len(results[0]) == 2
Example #12
0
 def testExpander(self):
     # Turn off fuzzy matching, then expand the tagged sample utterance.
     self.tagger.trie.max_edit_distance = 0
     tagged = self.tagger.tag("play season 1 of the big bang theory")
     results = list(BronKerboschExpander(self.tokenizer).expand(tagged))
     assert len(results) == 6
Example #13
0
    def parse(self, utterance, context=None, N=1):
        """Find tags within an utterance, with confidences.

        Args:
            utterance(str): conversational piece given by the user
            context(list): a list of entities
            N(int): number of results

        Yields:
            dict with the following fields:
                utterance(str): the value passed in
                tags(list): a list of tags found in utterance
                time(float): duration since call of function
                confidence(float): how confident of a match to the utterance;
                    may be used to determine the most likely intent
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # sort by confidence in ascending order, so
            # highest confidence for an entity is last.
            # see comment on TrieNode ctor
            context.sort(key=lambda entry: entry.get('confidence'))

            context_trie = Trie()
            for entity in context:
                value, kind = entity.get('data')[0]
                context_trie.insert(value.lower(),
                                    data=(value, kind),
                                    weight=entity.get('confidence'))

        tagged = self._tagger.tag(utterance.lower(), context_trie=context_trie)
        self.emit("tagged_entities", {
            'utterance': utterance,
            'tags': list(tagged),
            'time': time.time() - start
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Confidence weighted by match coverage; +1 keeps the
            # denominator nonzero.
            norm = len(utterance) + 1
            total = 0.0
            for member in clique:
                confidence = member.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                match_len = len(member.get('entities', [{
                    'match': ''
                }])[0].get('match'))
                total += confidence * match_len / norm
            return total

        remaining = N
        for result in expander.expand(tagged, clique_scoring_func=score_clique):
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                # entity confidence scaled by its coverage of the utterance
                parse_confidence += (sample_entity.get('confidence', 0.0) *
                                     float(len(sample_entity.get('match'))) /
                                     len(utterance))
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            remaining -= 1
            if remaining <= 0:
                break
Example #14
0
    def parse(self, utterance, context=None, N=1):
        """Locate tags within an utterance and score each candidate parse.

        Args:
            utterance(str): conversational piece given by the user
            context(list): a list of entities
            N(int): number of results

        Yields:
            dict containing:
                utterance(str): the value passed in
                tags(list): a list of tags found in utterance
                time(float): duration since call of function
                confidence(float): how confident of a match to the utterance;
                    may be used to determine the most likely intent
        """
        start = time.time()
        context_trie = None
        if context and isinstance(context, list):
            # sort by confidence in ascending order, so
            # highest confidence for an entity is last.
            # see comment on TrieNode ctor
            context.sort(key=lambda entry: entry.get('confidence'))

            context_trie = Trie()
            for entity in context:
                value, kind = entity.get('data')[0]
                context_trie.insert(value.lower(),
                                    data=(value, kind),
                                    weight=entity.get('confidence'))

        tagged = self._tagger.tag(utterance.lower(), context_trie=context_trie)
        self.emit("tagged_entities", {
            'utterance': utterance,
            'tags': list(tagged),
            'time': time.time() - start
        })
        start = time.time()
        expander = BronKerboschExpander(self._tokenizer)

        def score_clique(clique):
            # Each member contributes confidence weighted by the share of the
            # utterance its match covers (+1 guards against division by zero).
            norm = len(utterance) + 1
            return sum(
                (member.get('entities', [{'confidence': 0.0}])[0].get('confidence') *
                 len(member.get('entities', [{'match': ''}])[0].get('match')) / norm
                 for member in clique), 0.0)

        results = expander.expand(tagged, clique_scoring_func=score_clique)
        for position, result in enumerate(results, start=1):
            parse_confidence = 0.0
            for tag in result:
                sample_entity = tag['entities'][0]
                # entity confidence scaled by its coverage of the utterance
                parse_confidence += (sample_entity.get('confidence', 0.0) *
                                     float(len(sample_entity.get('match'))) /
                                     len(utterance))
            yield {
                'utterance': utterance,
                'tags': result,
                'time': time.time() - start,
                'confidence': parse_confidence
            }

            if position >= N:
                break