import unittest

# Import paths are assumptions based on the usual etk package layout.
from etk.tokenizer import Tokenizer
from etk.extractors.glossary_extractor import GlossaryExtractor
from etk.extractors.spacy_ner_extractor import SpacyNerExtractor


# Wrapper class (name assumed) so the test methods below are runnable as-is.
class TestExtractors(unittest.TestCase):

    def test_glossary_extractor_bigrams(self) -> None:
        # Case-insensitive matching over n-grams of up to 2 tokens,
        # using a local glossary rather than the one from setUp().
        t = Tokenizer()
        g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
        text = 'i live in los angeles. my hometown is Beijing'
        tokens = t.tokenize(text)
        test_result = [i.value for i in ge.extract(tokens)]
        expected = ["Beijing", "Los Angeles"]
        self.assertEqual(test_result, expected)
    def test_glossary_extractor(self) -> None:
        # n-grams of up to 3 tokens; case-insensitive, so the lowercase
        # 'los angeles' in the text still matches 'Los Angeles'.
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'los angeles', 'New York']
        self.assertEqual(results, expected)
    def test_case_sensitive(self) -> None:
        # With case_sensitive=True, the lowercase 'los angeles' in the
        # text no longer matches the glossary entry 'Los Angeles'.
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'New York']
        self.assertEqual(results, expected)
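    # A minimal extra sketch (not from the original suite), assuming
    # GlossaryExtractor.extract() simply yields no extractions when nothing
    # matches: with case_sensitive=True, a case mismatch in the other
    # direction (lowercase glossary entry, capitalized text) should also
    # fail to match.
    def test_case_sensitive_no_match(self) -> None:
        t = Tokenizer()
        ge = GlossaryExtractor(['beijing'], 'test_glossary', t, 1, True)
        tokens = t.tokenize('my hometown is Beijing')
        self.assertEqual([i.value for i in ge.extract(tokens)], [])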
    def setUp(self):
        # Shared fixtures: a spaCy NER run over a long passage, and a
        # glossary extraction whose results are reused by the tests above.
        self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                    'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                    'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                    'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                    'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                    'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                    'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                    'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                    'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
        extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
        self.results = extractor.extract(self.text)
        # Stored on self so test_glossary_extractor() can reference it.
        self.glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
        self.results2 = ge.extract(tokens)
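    # A minimal smoke test over the setUp() fixtures (not from the original
    # suite). It assumes extract() returns objects with a .value attribute,
    # as the glossary tests above do, and deliberately avoids hard-coding
    # spaCy's exact entity output, which depends on the installed model.
    def test_setup_fixtures(self) -> None:
        self.assertGreater(len(list(self.results)), 0)
        self.assertEqual([i.value for i in self.results2],
                         ['Beijing', 'los angeles', 'New York'])


# Standard unittest entry point so the module can be run directly.
if __name__ == '__main__':
    unittest.main()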