Example #1
import re
import unittest

from adapt.entity_tagger import EntityTagger
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie


class EntityTaggerTest(unittest.TestCase):

    def setUp(self):
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        self.trie.insert("play", "PlayVerb")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Not a Thing")

    def tearDown(self):
        pass

    def test_tag(self):
        tags = list(self.tagger.tag("play season 1 of the big bang theory"))
        assert len(tags) == 3

    def test_regex_tag(self):
        regex = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[regex])
        tags = list(tagger.tag("the big bang theory"))
        assert len(tags) == 3
        event_tags = [tag for tag in tags if tag.get('match') == 'big bang']
        assert len(event_tags) == 1
        assert len(event_tags[0].get('entities')) == 1
        assert len(event_tags[0].get('entities')[0].get('data')) == 1
        assert 'Event' in event_tags[0].get('entities')[0].get('data')
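Each item yielded by tag() above is a plain dict describing one match. The sketch below shows the shape, with field names borrowed from the repro payload in Example #2; the concrete values are illustrative, not something this test asserts:

# Illustrative only; field names follow the payload in Example #2.
example_tag = {
    "start_token": 4,   # index of the first token covered by the match
    "end_token": 7,     # index of the last token covered by the match
    "match": "the big bang theory",
    "key": "the big bang theory",
    "entities": [{
        "key": "the big bang theory",
        "match": "the big bang theory",
        "data": [["the big bang theory", "Television Show"]],  # [value, tag] pairs
        "confidence": 1.0,  # exact trie matches carry full confidence
    }],
}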
Example #2
import re
import unittest

from adapt.entity_tagger import EntityTagger
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie


class EntityTaggerTest(unittest.TestCase):

    def setUp(self):
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        self.trie.insert("play", "PlayVerb")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Not a Thing")

    def tearDown(self):
        pass

    def test_tag(self):
        tags = list(self.tagger.tag("play season 1 of the big bang theory"))
        assert len(tags) == 3

    def test_regex_tag(self):
        regex = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[regex])
        tags = list(tagger.tag("the big bang theory"))
        assert len(tags) == 3
        event_tags = [tag for tag in tags if tag.get('match') == 'big bang']
        assert len(event_tags) == 1
        assert len(event_tags[0].get('entities')) == 1
        assert len(event_tags[0].get('entities')[0].get('data')) == 1
        assert ('big bang', 'Event') in event_tags[0].get('entities')[0].get('data')

    def test_start_end_token_match_when_sorting_tagged_entities(self):
        repro_payload = [{"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"}, {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"}, {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"}, {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"}, {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "SnoozeTime"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"}, {"end_token": 4, "key": "20 minutes", "entities": [{"key": "20 minutes", "data": [["20 minutes", "SnoozeTime"]], "confidence": 0.5, "match": "20 minutes"}], "start_token": 3, "match": "20 minutes"}, {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"}, {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"}, {"end_token": 0, "key": "snooze", "entities": [{"key": "snooze", "data": [["snooze", "SnoozeKeyword"]], "confidence": 1.0, "match": "snooze"}], "start_token": 0, "match": "snooze"}, {"end_token": 2, "key": "for", "entities": [{"key": "for", "data": [["for", "SnoozeFiller"]], "confidence": 1.0, "match": "for"}], "start_token": 2, "match": "for"}]
        # just asserting that the sort does not crash in py3
        self.tagger._sort_and_merge_tags(repro_payload)
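The guard here is specific: the payload contains several tags with identical start_token/end_token values, and under Python 3 any sort that ends up comparing the tag dicts themselves (for example, sort tuples that tie on the numeric fields) raises TypeError, because dicts define no ordering. A minimal sketch of a py3-safe ordering, assuming nothing beyond the field names visible in the payload:

# Keying on the token span alone never compares the dicts, so ties stay harmless.
ordered = sorted(repro_payload, key=lambda t: (t["start_token"], t["end_token"]))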
Example #3
    def test_regex_tag(self):
        regex = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[regex])
        tags = list(tagger.tag("the big bang theory"))
        assert len(tags) == 3
        event_tags = [tag for tag in tags if tag.get('match') == 'big bang']
        assert len(event_tags) == 1
        assert len(event_tags[0].get('entities')) == 1
        assert len(event_tags[0].get('entities')[0].get('data')) == 1
        assert 'Event' in event_tags[0].get('entities')[0].get('data')
Example #4
import unittest

from adapt.entity_tagger import EntityTagger
from adapt.expander import BronKerboschExpander  # assumed upstream module path
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie


class BronKerboschExpanderTest(unittest.TestCase):
    def setUp(self):
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        self.trie.insert("x-play", "Television Show")
        self.trie.insert("play", "Play Verb")
        self.trie.insert("play season", "Time Period")
        self.trie.insert("play", "Player Control")
        self.trie.insert("season", "Season Prefix")
        self.trie.insert("1", "Number")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Television Show")
        self.trie.insert("big bang", "event")
        self.trie.insert("bang theory", "Scientific Theory")
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory play season one of the big bang theory
        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(tagged_entity.get('entities', [{'match': ''}])[0].get('match')) / (len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join(
            [tag.get('entities')[0].get('key') for tag in parse_results[0]])
        result_parse = ', '.join([
            tag.get('entities')[0].get('data')[0][1]
            for tag in parse_results[0]
        ])

        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        # two tags for the same token, different confidence, should expand to two cliques
        tags = [{
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{
                'confidence': 0.5,
                'data': [u'SearchTerms'],
                'match': u'spell',
                'key': u'spell'
            }]
        }, {
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{
                'confidence': 1.0,
                'data': [u'SpellingKeyword'],
                'match': u'spell',
                'key': u'spell'
            }]
        }]

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
Example #5
import unittest

from adapt.entity_tagger import EntityTagger
from adapt.expander import BronKerboschExpander  # assumed upstream module path
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie


class BronKerboschExpanderTest(unittest.TestCase):
    def setUp(self):
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        self.trie.insert("x-play", "Television Show")
        self.trie.insert("play", "Play Verb")
        self.trie.insert("play season", "Time Period")
        self.trie.insert("play", "Player Control")
        self.trie.insert("season", "Season Prefix")
        self.trie.insert("1", "Number")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Television Show")
        self.trie.insert("big bang", "event")
        self.trie.insert("bang theory", "Scientific Theory")
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2


    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory play season one of the big bang theory
        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(tagged_entity.get('entities', [{'match': ''}])[0].get('match')) / (
                    len(utterance) + 1)
            return score
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join([tag.get('entities')[0].get('key') for tag in parse_results[0]])
        result_parse = ', '.join(
            [tag.get('entities')[0].get('data')[0][1] for tag in parse_results[0]]
        )

        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        # two tags for the same token, different confidence, should expand to two cliques
        tags = [{'end_token': 0, 'start_token': 0, 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 0.5, 'data': [u'SearchTerms'], 'match': u'spell', 'key': u'spell'}]},
                {'end_token': 0, 'start_token': 0, 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 1.0, 'data': [u'SpellingKeyword'], 'match': u'spell', 'key': u'spell'}]}]

        expander = BronKerboschExpander(self.tokenizer)

        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
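score_clique() above weighs each tag's confidence by the share of the utterance its match covers, so cliques built from longer matches outrank ones built from shorter overlapping matches. A quick worked check with assumed inputs:

utterance = "play season 1 of the big bang theory"
matches = [("play", 1.0), ("the big bang theory", 1.0)]  # (match, confidence) pairs
score = sum(conf * len(m) / (len(utterance) + 1) for m, conf in matches)
print(round(score, 3))  # 0.622; swapping in the shorter "the big" would score lower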
Example #6
# Walk through tagging and Bron-Kerbosch expansion end to end.
import pprint

from adapt.entity_tagger import EntityTagger
from adapt.expander import BronKerboschExpander  # assumed upstream module path
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie

tokenizer = EnglishTokenizer()
trie = Trie()
trie.insert("x-play", "Television Show")
trie.insert("play", "Play Verb")
trie.insert("play season", "Time Period")
trie.insert("play", "Player Control")
trie.insert("season", "Season Prefix")
trie.insert("1", "Number")
trie.insert("the big bang theory", "Television Show")
trie.insert("the big", "Television Show")
trie.insert("big bang", "event")
trie.insert("bang theory", "Scientific Theory")
tagger = EntityTagger(trie, tokenizer)
tags = tagger.tag("play season 2 the big 1 of the big bang theory")

# Inspect the compatibility graph, then expand it into candidate parses.
Bke = BronKerboschExpander(tokenizer)
graphA = Bke._build_graph(tags)
for v in graphA.vertex_set():
    print("vertex", v, list(graphA.get_neighbors_of(v)))

parse_results = list(Bke.expand(tags))
for r in parse_results:
    print("Bke ----- ")
    for x in r:
        pprint.pprint(x)