Example #1
0
    def __init__(self, kg_schema=None, modules=None, extract_error_policy="process", logger=None,
                 logger_path=os.path.join(TEMP_DIR, 'etk.log'), ontology=None, generate_json_ld=False,
                 output_kg_only=False, use_spacy_tokenizer=False):

        self.generate_json_ld = generate_json_ld
        self.output_kg_only = output_kg_only

        if logger:
            self.logger = logger
        else:
            logging.basicConfig(
                level=logging.DEBUG,
                format='%(asctime)s %(name)-6s %(levelname)s %(message)s',
                datefmt='%m-%d %H:%M',
                filename=logger_path,
                filemode='w'
            )
            self.logger = logging.getLogger('ETK')

        self.parser = jsonpath_ng.ext.parse
        self.default_nlp = spacy.load('en_core_web_sm')
        if use_spacy_tokenizer:
            self.default_tokenizer = Tokenizer(copy.deepcopy(self.default_nlp))
        else:
            self.default_tokenizer = CrfTokenizer()
        self.parsed = dict()
        self.kg_schema = kg_schema
        self.ontology = ontology
        self.em_lst = list()
        if modules:
            if isinstance(modules, list):
                for module in modules:
                    if isinstance(module, str):
                        self.em_lst.extend(self.load_ems(module))
                    elif issubclass(module, ETKModule):
                        self.em_lst.append(module(self))
            elif issubclass(modules, ETKModule):
                self.em_lst = [modules(self)]
            else:
                raise NotGetETKModuleError("Not getting extraction module")

        if extract_error_policy.lower() == "throw_extraction":
            self.error_policy = ErrorPolicy.THROW_EXTRACTION
        elif extract_error_policy.lower() == "throw_document":
            self.error_policy = ErrorPolicy.THROW_DOCUMENT
        elif extract_error_policy.lower() == "raise_error":
            self.error_policy = ErrorPolicy.RAISE
        else:
            self.error_policy = ErrorPolicy.PROCESS
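
A minimal instantiation sketch for the constructor above. The enclosing class name ETK and the module subclass MyModule are assumptions for illustration; only the keyword arguments come from the signature shown:

    # Hypothetical usage; ETK and MyModule are illustrative names, and
    # MyModule is assumed to subclass ETKModule.
    etk = ETK(kg_schema=None,
              modules=[MyModule],
              extract_error_policy="raise_error",
              use_spacy_tokenizer=True)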
Example #2
0
    def test_tokenizer(self) -> None:
        text = "dsa@isi.edu 32.4 -32.1 (123)-345-6789, #1  \n \n   "
        reconstruct_text = re.sub(' +', ' ', text)
        t = Tokenizer()
        t.keep_multi_space = False
        tokens = t.tokenize(text)
        token_attrs = []
        for i in tokens:
            token_attrs.append({"orth": i.orth_, "offset": i.idx, "full_shape": i._.full_shape})
        expected = [
            {'orth': 'dsa', 'offset': 0, 'full_shape': 'xxx'},
            {'orth': '@', 'offset': 3, 'full_shape': '@'},
            {'orth': 'isi', 'offset': 4, 'full_shape': 'xxx'},
            {'orth': '.', 'offset': 7, 'full_shape': '.'},
            {'orth': 'edu', 'offset': 8, 'full_shape': 'xxx'},
            {'orth': '32.4', 'offset': 12, 'full_shape': 'dd.d'},
            {'orth': '-', 'offset': 17, 'full_shape': '-'},
            {'orth': '32.1', 'offset': 18, 'full_shape': 'dd.d'},
            {'orth': '(', 'offset': 23, 'full_shape': '('},
            {'orth': '123', 'offset': 24, 'full_shape': 'ddd'},
            {'orth': ')', 'offset': 27, 'full_shape': ')'},
            {'orth': '-', 'offset': 28, 'full_shape': '-'},
            {'orth': '345', 'offset': 29, 'full_shape': 'ddd'},
            {'orth': '-', 'offset': 32, 'full_shape': '-'},
            {'orth': '6789', 'offset': 33, 'full_shape': 'dddd'},
            {'orth': ',', 'offset': 37, 'full_shape': ','},
            {'orth': '#', 'offset': 39, 'full_shape': '#'},
            {'orth': '1', 'offset': 40, 'full_shape': 'd'},
            {'orth': '\n ', 'offset': 42, 'full_shape': '\n '},
            {'orth': '\n ', 'offset': 44, 'full_shape': '\n '}
        ]

        self.assertEqual(token_attrs, expected)
        self.assertEqual(t.reconstruct_text(tokens), reconstruct_text)
Example #3
0
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules
        Args:
            nlp
            rules (Dict): spacy rules
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"] if "field_name" in rules else extractor_name
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" + str(idx)] = this_rule
Example #4
0
    def __init__(self,
                 nlp,
                 rules: Dict,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules
        Args:
            nlp
            rules: Dict
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"]
        self.rule_lst = []
        for a_rule in self.rules:
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst.append(this_rule)
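
Judging only from the keys the constructors above read (rules["rules"] and rules["field_name"]), the rules argument is a dict of roughly the following shape; the per-rule fields are assumptions for illustration:

    # Hypothetical rules dict; only the top-level "field_name" and "rules"
    # keys are implied by the code above, and "field_name" is optional in
    # the first variant. The contents of each rule entry are assumed.
    sample_rules = {
        "field_name": "person_name",
        "rules": [
            {"identifier": "rule_1", "pattern": []}  # each entry is wrapped as Rule(a_rule, nlp)
        ]
    }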
Example #5
0
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
        text = 'i live in los angeles. my hometown is Beijing'
        tokens = t.tokenize(text)
        test_result = [i.value for i in ge.extract(tokens)]
        expected = ["Beijing", "Los Angeles"]
        self.assertEqual(test_result, expected)
Example #6
0
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'los angeles', 'New York']

        self.assertEqual(results, expected)
Example #7
0
    def test_case_sensitive(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)

        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'New York']

        self.assertEqual(results, expected)
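
For reference, a hedged reading of the positional arguments used in the two glossary tests above; the parameter names are inferred from how the values are used, not taken from library documentation:

    # Hypothetical signature reading, inferred from the calls above:
    #   GlossaryExtractor(glossary, extractor_name, tokenizer, ngrams, case_sensitive)
    # The final flag lines up with the behavior exercised by test_case_sensitive;
    # the "ngrams" name for the fourth argument is an assumption.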
Example #8
0
    def setUp(self):
        self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                    'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                    'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                    'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                    'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                    'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                    'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                    'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                    'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
        extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
        self.results = extractor.extract(self.text)

        glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(glossary_1, 'test_glossary', t, 3, False)
        self.results2 = ge.extract(tokens)
Example #9
0
    def __init__(self):
        self.parser = jsonpath_ng.parse
        self.default_nlp = spacy.load('en_core_web_sm')
        self.default_tokenizer = Tokenizer(self.default_nlp)
        self.parsed = dict()
Example #10
0
    def test_Extractable(self) -> None:
        e = Extractable({
            'extracted_value': [{
                1: 2,
                'das': [1, 2, 3]
            }],
            'confidence': 2.3
        })
        t = Tokenizer()
        tokens = e.get_tokens(t)
        token_attrs = []
        for i in tokens:
            token_attrs.append({
                "orth": i.orth_,
                "offset": i.idx,
                "full_shape": i._.full_shape
            })
        expected_token = [{
            'orth': 'extracted',
            'offset': 0,
            'full_shape': 'xxxxxxxxx'
        }, {
            'orth': '_',
            'offset': 9,
            'full_shape': '_'
        }, {
            'orth': 'value',
            'offset': 10,
            'full_shape': 'xxxxx'
        }, {
            'orth': ':',
            'offset': 16,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 18,
            'full_shape': 'd'
        }, {
            'orth': ':',
            'offset': 20,
            'full_shape': ':'
        }, {
            'orth': '2',
            'offset': 22,
            'full_shape': 'd'
        }, {
            'orth': 'das',
            'offset': 24,
            'full_shape': 'xxx'
        }, {
            'orth': ':',
            'offset': 28,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 30,
            'full_shape': 'd'
        }, {
            'orth': '2',
            'offset': 32,
            'full_shape': 'd'
        }, {
            'orth': '3',
            'offset': 34,
            'full_shape': 'd'
        }, {
            'orth': 'confidence',
            'offset': 36,
            'full_shape': 'xxxxxxxxxx'
        }, {
            'orth': ':',
            'offset': 47,
            'full_shape': ':'
        }, {
            'orth': '2.3',
            'offset': 49,
            'full_shape': 'd.d'
        }]

        self.assertEqual(token_attrs, expected_token)
        text = e.get_string()
        expected_str = "extracted_value : 1 : 2  das : 1  2  3        confidence : 2.3  "

        self.assertEqual(text, expected_str)