def get_tokens(self, tokenizer: Tokenizer) -> List[Token]:
    """
    Tokenize this Extractable, caching the result.

    If the value is a string it is tokenized directly; otherwise it is first
    converted with get_string(). Because multiple extractors commonly need
    the same tokens, results are memoized in ``self.tokenize_results`` keyed
    by ``(segment, tokenizer)``: the same segment/tokenizer pair returns the
    cached tokens, while a different tokenizer on the same segment is cached
    under its own key.

    Args:
        tokenizer (Tokenizer): tokenizer to apply to this segment's string.

    Returns:
        a sequence of tokens.
    """
    cache_key = (self, tokenizer)
    try:
        # EAFP: after warm-up the common case is a cache hit.
        return self.tokenize_results[cache_key]
    except KeyError:
        tokens = tokenizer.tokenize(self.get_string())
        self.tokenize_results[cache_key] = tokens
        return tokens
def test_tokenizer(self) -> None:
    """
    Tokenize a mixed string (email address, signed decimals, phone number,
    hash-number, trailing newline runs) and verify each token's text, its
    character offset, and its full shape, then check text reconstruction.
    """
    # NOTE(review): this literal previously read "[email protected] ...", which
    # cannot produce the expected tokens/offsets below ('dsa' at 0, '@' at 3,
    # 'isi' at 4, ...); restored to the address the expectations were written
    # against — every offset through 44 matches this string exactly.
    text = "dsa@isi.edu 32.4 -32.1 (123)-345-6789, #1 \n \n "
    # Collapse runs of spaces to mirror keep_multi_space=False behavior.
    reconstruct_text = re.sub(' +', ' ', text)
    t = Tokenizer()
    t.keep_multi_space = False
    tokens = t.tokenize(text)
    token_attrs = []
    for i in tokens:
        token_attrs.append({"orth": i.orth_, "offset": i.idx, "full_shape": i._.full_shape})
    expected = [
        {'orth': 'dsa', 'offset': 0, 'full_shape': 'xxx'},
        {'orth': '@', 'offset': 3, 'full_shape': '@'},
        {'orth': 'isi', 'offset': 4, 'full_shape': 'xxx'},
        {'orth': '.', 'offset': 7, 'full_shape': '.'},
        {'orth': 'edu', 'offset': 8, 'full_shape': 'xxx'},
        {'orth': '32.4', 'offset': 12, 'full_shape': 'dd.d'},
        {'orth': '-', 'offset': 17, 'full_shape': '-'},
        {'orth': '32.1', 'offset': 18, 'full_shape': 'dd.d'},
        {'orth': '(', 'offset': 23, 'full_shape': '('},
        {'orth': '123', 'offset': 24, 'full_shape': 'ddd'},
        {'orth': ')', 'offset': 27, 'full_shape': ')'},
        {'orth': '-', 'offset': 28, 'full_shape': '-'},
        {'orth': '345', 'offset': 29, 'full_shape': 'ddd'},
        {'orth': '-', 'offset': 32, 'full_shape': '-'},
        {'orth': '6789', 'offset': 33, 'full_shape': 'dddd'},
        {'orth': ',', 'offset': 37, 'full_shape': ','},
        {'orth': '#', 'offset': 39, 'full_shape': '#'},
        {'orth': '1', 'offset': 40, 'full_shape': 'd'},
        {'orth': '\n ', 'offset': 42, 'full_shape': '\n '},
        {'orth': '\n ', 'offset': 44, 'full_shape': '\n '}
    ]
    self.assertEqual(token_attrs, expected)
    self.assertEqual(t.reconstruct_text(tokens), reconstruct_text)
def test_glossary_extractor(self) -> None:
    """
    Case-insensitive glossary extraction with a 2-gram window: both the
    lowercase 'los angeles' and the capitalized 'Beijing' should match.
    """
    tokenizer = Tokenizer()
    glossary = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
    extractor = GlossaryExtractor(glossary, 'test_glossary', tokenizer, 2, False)
    text = 'i live in los angeles. my hometown is Beijing'
    extractions = extractor.extract(tokenizer.tokenize(text))
    test_result = [extraction.value for extraction in extractions]
    expected = ["Beijing", "Los Angeles"]
    self.assertEqual(test_result, expected)
def test_glossary_extractor(self) -> None:
    """
    Case-insensitive extraction against self.glossary_1 with a 3-gram window.
    """
    # NOTE(review): another test method in this file shares this exact name;
    # if both are defined on the same TestCase, one silently overrides the
    # other and only one runs -- confirm and rename if so.
    tokenizer = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    extractor = GlossaryExtractor(self.glossary_1, 'test_glossary', tokenizer, 3, False)
    matches = extractor.extract(tokenizer.tokenize(text))
    results = [match.value for match in matches]
    expected = ['Beijing', 'los angeles', 'New York']
    self.assertEqual(results, expected)
def test_case_sensitive(self) -> None:
    """
    Case-sensitive extraction (last arg True): lowercase 'los angeles' must
    NOT match the glossary entry 'Los Angeles'.
    """
    tokenizer = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    extractor = GlossaryExtractor(glossary, 'test_glossary', tokenizer, 2, True)
    found = [extraction.value for extraction in extractor.extract(tokenizer.tokenize(text))]
    expected = ['Beijing', 'New York']
    self.assertEqual(found, expected)
def setUp(self):
    """
    Prepare two result sets for the test methods: spaCy NER extractions over
    a Napoleon paragraph (self.results) and glossary extractions over a short
    sentence (self.results2).
    """
    self.text = (
        'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the '
        'French Revolution and led several successful campaigns during the French Revolutionary Wars. '
        'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during '
        'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while '
        'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars '
        'and the vast majority of his battles, building a large empire that ruled over continental Europe '
        'before its final collapse in 1815. He is considered one of the greatest commanders in history, '
        'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and '
        'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
    )
    ner_extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
    self.results = ner_extractor.extract(self.text)

    glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    tokenizer = Tokenizer()
    sentence = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    glossary_extractor = GlossaryExtractor(glossary, 'test_glossary', tokenizer, 3, False)
    self.results2 = glossary_extractor.extract(tokenizer.tokenize(sentence))