Ejemplo n.º 1
0
    def get_tokens(self, tokenizer: Tokenizer) -> List[Token]:
        """
        Tokenize this Extractable.

        If the value is a string, it returns the tokenized version of the string. Else, convert to string with
        get_string method

        As it is common to need the same tokens for multiple extractors, the Extractable should cache the
        tokenize results, keyed by segment and tokenizer so that given the same segment and tokenizer,
        the same results are returned. If the same segment is given, but different tokenizer, the different
        results are cached separately.

        Args:
            tokenizer (Tokenizer)

        Returns: a sequence of tokens.
        """

        if (self, tokenizer) in self.tokenize_results:
            return self.tokenize_results[(self, tokenizer)]
        else:
            segment_value_for_tokenize = self.get_string()
            tokens = tokenizer.tokenize(segment_value_for_tokenize)
            self.tokenize_results[(self, tokenizer)] = tokens
            return tokens
Ejemplo n.º 2
0
    def test_tokenizer(self) -> None:
        text = "[email protected] 32.4 -32.1 (123)-345-6789, #1  \n \n   "
        reconstruct_text = re.sub(' +', ' ', text)
        t = Tokenizer()
        t.keep_multi_space = False
        tokens = t.tokenize(text)
        token_attrs = []
        for i in tokens:
            token_attrs.append({"orth": i.orth_, "offset": i.idx, "full_shape": i._.full_shape})
        expected = [
            {'orth': 'dsa', 'offset': 0, 'full_shape': 'xxx'},
            {'orth': '@', 'offset': 3, 'full_shape': '@'},
            {'orth': 'isi', 'offset': 4, 'full_shape': 'xxx'},
            {'orth': '.', 'offset': 7, 'full_shape': '.'},
            {'orth': 'edu', 'offset': 8, 'full_shape': 'xxx'},
            {'orth': '32.4', 'offset': 12, 'full_shape': 'dd.d'},
            {'orth': '-', 'offset': 17, 'full_shape': '-'},
            {'orth': '32.1', 'offset': 18, 'full_shape': 'dd.d'},
            {'orth': '(', 'offset': 23, 'full_shape': '('},
            {'orth': '123', 'offset': 24, 'full_shape': 'ddd'},
            {'orth': ')', 'offset': 27, 'full_shape': ')'},
            {'orth': '-', 'offset': 28, 'full_shape': '-'},
            {'orth': '345', 'offset': 29, 'full_shape': 'ddd'},
            {'orth': '-', 'offset': 32, 'full_shape': '-'},
            {'orth': '6789', 'offset': 33, 'full_shape': 'dddd'},
            {'orth': ',', 'offset': 37, 'full_shape': ','},
            {'orth': '#', 'offset': 39, 'full_shape': '#'},
            {'orth': '1', 'offset': 40, 'full_shape': 'd'},
            {'orth': '\n ', 'offset': 42, 'full_shape': '\n '},
            {'orth': '\n ', 'offset': 44, 'full_shape': '\n '}
        ]

        self.assertEqual(token_attrs, expected)
        self.assertEqual(t.reconstruct_text(tokens), reconstruct_text)
Ejemplo n.º 3
0
 def test_glossary_extractor(self) -> None:
     t = Tokenizer()
     g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
     ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
     text = 'i live in los angeles. my hometown is Beijing'
     tokens = t.tokenize(text)
     test_result = [i.value for i in ge.extract(tokens)]
     expected = ["Beijing", "Los Angeles"]
     self.assertEqual(test_result, expected)
Ejemplo n.º 4
0
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'los angeles', 'New York']

        self.assertEqual(results, expected)
Ejemplo n.º 5
0
    def test_case_sensitive(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)

        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'New York']

        self.assertEqual(results, expected)
Ejemplo n.º 6
0
    def setUp(self):
        self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                    'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                    'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                    'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                    'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                    'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                    'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                    'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                    'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
        extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
        self.results = extractor.extract(self.text)

        glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(glossary_1, 'test_glossary', t, 3, False)
        self.results2 = ge.extract(tokens)