import re
import unittest

from adapt.entity_tagger import EntityTagger
from adapt.expander import BronKerboschExpander
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie


class EntityTaggerTest(unittest.TestCase):
    def setUp(self):
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        self.trie.insert("play", "PlayVerb")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Not a Thing")

    def tearDown(self):
        pass

    def test_tag(self):
        tags = list(self.tagger.tag("play season 1 of the big bang theory"))
        assert len(tags) == 3

    def test_regex_tag(self):
        regex = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(),
                              regex_entities=[regex])
        tags = tagger.tag("the big bang theory")
        assert len(tags) == 3
        event_tags = [tag for tag in tags if tag.get('match') == 'big bang']
        assert len(event_tags) == 1
        assert len(event_tags[0].get('entities')) == 1
        assert len(event_tags[0].get('entities')[0].get('data')) == 1
        assert ('big bang', 'Event') in \
            event_tags[0].get('entities')[0].get('data')

    def test_start_end_token_match_when_sorting_tagged_entities(self):
        repro_payload = [
            {"end_token": 1, "key": "1",
             "entities": [{"key": "1", "data": [["1", "Which"]],
                           "confidence": 0.5, "match": "1"}],
             "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1",
             "entities": [{"key": "1", "data": [["1", "Which"]],
                           "confidence": 0.5, "match": "1"}],
             "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1",
             "entities": [{"key": "1", "data": [["1", "Which"]],
                           "confidence": 0.5, "match": "1"}],
             "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1",
             "entities": [{"key": "1", "data": [["1", "Which"]],
                           "confidence": 0.5, "match": "1"}],
             "start_token": 1, "match": "1"},
            {"end_token": 3, "key": "20",
             "entities": [{"key": "20", "data": [["20", "SnoozeTime"]],
                           "confidence": 0.5, "match": "20"}],
             "start_token": 3, "match": "20"},
            {"end_token": 4, "key": "20 minutes",
             "entities": [{"key": "20 minutes",
                           "data": [["20 minutes", "SnoozeTime"]],
                           "confidence": 0.5, "match": "20 minutes"}],
             "start_token": 3, "match": "20 minutes"},
            {"end_token": 3, "key": "20",
             "entities": [{"key": "20", "data": [["20", "Which"]],
                           "confidence": 0.5, "match": "20"}],
             "start_token": 3, "match": "20"},
            {"end_token": 3, "key": "20",
             "entities": [{"key": "20", "data": [["20", "Which"]],
                           "confidence": 0.5, "match": "20"}],
             "start_token": 3, "match": "20"},
            {"end_token": 0, "key": "snooze",
             "entities": [{"key": "snooze",
                           "data": [["snooze", "SnoozeKeyword"]],
                           "confidence": 1.0, "match": "snooze"}],
             "start_token": 0, "match": "snooze"},
            {"end_token": 2, "key": "for",
             "entities": [{"key": "for", "data": [["for", "SnoozeFiller"]],
                           "confidence": 1.0, "match": "for"}],
             "start_token": 2, "match": "for"},
        ]
        # just asserting that the sort does not crash in py3
        self.tagger._sort_and_merge_tags(repro_payload)
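# For reference, the shape of a single tag produced by EntityTagger.tag(), as
# inferred from the repro_payload and the assertions above. This is an
# illustrative sketch, not a schema definition:
#
#     {
#         "start_token": 3,        # index of the first token covered
#         "end_token": 4,          # index of the last token covered
#         "match": "20 minutes",   # text span that matched
#         "key": "20 minutes",     # trie key (or regex group) that matched
#         "entities": [{
#             "key": "20 minutes",
#             "match": "20 minutes",
#             "confidence": 0.5,   # 1.0 for the exact matches in the payload
#                                  # above, 0.5 for the fuzzy/regex matches
#             "data": [["20 minutes", "SnoozeTime"]],  # (value, type) pairs
#         }],
#     }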
class BronKerboschExpanderTest(unittest.TestCase):
    def setUp(self):
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        self.trie.insert("x-play", "Television Show")
        self.trie.insert("play", "Play Verb")
        self.trie.insert("play season", "Time Period")
        self.trie.insert("play", "Player Control")
        self.trie.insert("season", "Season Prefix")
        self.trie.insert("1", "Number")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Television Show")
        self.trie.insert("big bang", "event")
        self.trie.insert("bang theory", "Scientific Theory")
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory
                 play season one of the big bang theory

        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get(
                    'entities', [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(
                    tagged_entity.get('entities', [{'match': ''}])[0]
                    .get('match')) / (len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join(
            [tag.get('entities')[0].get('key') for tag in parse_results[0]])
        result_parse = ', '.join([
            tag.get('entities')[0].get('data')[0][1]
            for tag in parse_results[0]
        ])
        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        # two tags for the same token, different confidence,
        # should expand to two cliques
        tags = [{'end_token': 0, 'start_token': 0,
                 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 0.5, 'data': [u'SearchTerms'],
                               'match': u'spell', 'key': u'spell'}]},
                {'end_token': 0, 'start_token': 0,
                 'key': u'spell', 'match': u'spell',
                 'entities': [{'confidence': 1.0, 'data': [u'SpellingKeyword'],
                               'match': u'spell', 'key': u'spell'}]}]

        expander = BronKerboschExpander(self.tokenizer)
        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
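# The expander takes its name from the Bron-Kerbosch maximal-clique algorithm:
# tags become vertices, mutually compatible tags share an edge, and each
# maximal clique is one candidate parse. Below is a minimal standalone sketch
# of the classic recursion on a toy graph. It is illustrative only; adapt's
# actual implementation lives in adapt.expander.

def bron_kerbosch(r, p, x, neighbors, cliques):
    """Collect every maximal clique of the graph described by `neighbors`.

    r -- vertices in the clique being built
    p -- candidates that could still extend r
    x -- vertices already exhausted (avoids emitting duplicate cliques)
    """
    if not p and not x:
        cliques.append(r)
        return
    for v in list(p):
        bron_kerbosch(r | {v}, p & neighbors[v], x & neighbors[v],
                      neighbors, cliques)
        p.remove(v)
        x.add(v)


# Toy conflict graph: "the big" overlaps "the big bang theory", so there is
# no edge between them; "play" is compatible with both.
toy_neighbors = {
    'play': {'the big', 'the big bang theory'},
    'the big': {'play'},
    'the big bang theory': {'play'},
}
toy_cliques = []
bron_kerbosch(set(), set(toy_neighbors), set(), toy_neighbors, toy_cliques)
# toy_cliques -> [{'play', 'the big'}, {'play', 'the big bang theory'}]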
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.entity_tagger import EntityTagger
from adapt.expander import BronKerboschExpander
from adapt.tools.text.trie import Trie
import pprint

tokenizer = EnglishTokenizer()
trie = Trie()
trie.insert("x-play", "Television Show")
trie.insert("play", "Play Verb")
trie.insert("play season", "Time Period")
trie.insert("play", "Player Control")
trie.insert("season", "Season Prefix")
trie.insert("1", "Number")
trie.insert("the big bang theory", "Television Show")
trie.insert("the big", "Television Show")
trie.insert("big bang", "event")
trie.insert("bang theory", "Scientific Theory")
tagger = EntityTagger(trie, tokenizer)

tags = tagger.tag("play season 2 the big 1 of the big bang theory")

bke = BronKerboschExpander(tokenizer)

# Dump the compatibility graph: each vertex alongside its neighbors.
graph = bke._build_graph(tags)
for v in graph.vertex_set():
    print("vertex", v, list(graph.get_neighbors_of(v)))

# Expand the tags into candidate parses, one per maximal clique.
parse_results = list(bke.expand(tags))
for r in parse_results:
    print("Bke ----- ")
    for x in r:
        pprint.pprint(x)
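# A hypothetical sketch of the compatibility rule _build_graph appears to use,
# based on the start_token/end_token fields carried by every tag: two tags can
# share a clique only if their token spans do not intersect. The helper below
# is an assumption for illustration, not adapt's actual code.

def spans_conflict(tag_a, tag_b):
    """True when two tags claim overlapping token ranges."""
    return not (tag_a['end_token'] < tag_b['start_token']
                or tag_b['end_token'] < tag_a['start_token'])


assert spans_conflict({'start_token': 3, 'end_token': 4},
                      {'start_token': 4, 'end_token': 4})
assert not spans_conflict({'start_token': 0, 'end_token': 0},
                          {'start_token': 2, 'end_token': 2})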