def __init__(self, kg_schema=None, modules=None, extract_error_policy="process", logger=None,
             logger_path=os.path.join(TEMP_DIR, 'etk.log'), ontology=None, generate_json_ld=False,
             output_kg_only=False, use_spacy_tokenizer=False):
    self.generate_json_ld = generate_json_ld
    self.output_kg_only = output_kg_only
    if logger:
        self.logger = logger
    else:
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(name)-6s %(levelname)s %(message)s',
            datefmt='%m-%d %H:%M',
            filename=logger_path,
            filemode='w'
        )
        self.logger = logging.getLogger('ETK')
    self.parser = jsonpath_ng.ext.parse
    self.default_nlp = spacy.load('en_core_web_sm')
    if use_spacy_tokenizer:
        self.default_tokenizer = Tokenizer(copy.deepcopy(self.default_nlp))
    else:
        self.default_tokenizer = CrfTokenizer()
    self.parsed = dict()
    self.kg_schema = kg_schema
    self.ontology = ontology
    self.em_lst = list()
    if modules:
        if isinstance(modules, list):
            for module in modules:
                if isinstance(module, str):
                    # Load only the current module path, not the whole list on every iteration.
                    self.em_lst.extend(self.load_ems([module]))
                elif issubclass(module, ETKModule):
                    self.em_lst.append(module(self))
        elif issubclass(modules, ETKModule):
            self.em_lst = [modules(self)]
        else:
            raise NotGetETKModuleError("Not getting extraction module")
    # elif chain: with separate ifs, the trailing else reset every policy to PROCESS.
    if extract_error_policy.lower() == "throw_extraction":
        self.error_policy = ErrorPolicy.THROW_EXTRACTION
    elif extract_error_policy.lower() == "throw_document":
        self.error_policy = ErrorPolicy.THROW_DOCUMENT
    elif extract_error_policy.lower() == "raise_error":
        self.error_policy = ErrorPolicy.RAISE
    else:
        self.error_policy = ErrorPolicy.PROCESS
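A minimal usage sketch, assuming the enclosing class is named ETK (the snippet shows only its __init__): the constructor accepts either module paths (str) or ETKModule subclasses, so a module can be wired in directly. `MyModule` is a hypothetical placeholder; the ETKModule interface itself is not shown in this section.

# Hypothetical module; the real ETKModule interface is not shown here.
class MyModule(ETKModule):
    pass  # a real module would implement its extraction logic

etk = ETK(kg_schema=None, modules=MyModule, extract_error_policy="raise_error")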
def test_tokenizer(self) -> None:
    text = "dsa@isi.edu 32.4 -32.1 (123)-345-6789, #1 \n \n "
    reconstruct_text = re.sub(' +', ' ', text)
    t = Tokenizer()
    t.keep_multi_space = False
    tokens = t.tokenize(text)
    token_attrs = []
    for i in tokens:
        token_attrs.append({"orth": i.orth_, "offset": i.idx, "full_shape": i._.full_shape})
    expected = [
        {'orth': 'dsa', 'offset': 0, 'full_shape': 'xxx'},
        {'orth': '@', 'offset': 3, 'full_shape': '@'},
        {'orth': 'isi', 'offset': 4, 'full_shape': 'xxx'},
        {'orth': '.', 'offset': 7, 'full_shape': '.'},
        {'orth': 'edu', 'offset': 8, 'full_shape': 'xxx'},
        {'orth': '32.4', 'offset': 12, 'full_shape': 'dd.d'},
        {'orth': '-', 'offset': 17, 'full_shape': '-'},
        {'orth': '32.1', 'offset': 18, 'full_shape': 'dd.d'},
        {'orth': '(', 'offset': 23, 'full_shape': '('},
        {'orth': '123', 'offset': 24, 'full_shape': 'ddd'},
        {'orth': ')', 'offset': 27, 'full_shape': ')'},
        {'orth': '-', 'offset': 28, 'full_shape': '-'},
        {'orth': '345', 'offset': 29, 'full_shape': 'ddd'},
        {'orth': '-', 'offset': 32, 'full_shape': '-'},
        {'orth': '6789', 'offset': 33, 'full_shape': 'dddd'},
        {'orth': ',', 'offset': 37, 'full_shape': ','},
        {'orth': '#', 'offset': 39, 'full_shape': '#'},
        {'orth': '1', 'offset': 40, 'full_shape': 'd'},
        {'orth': '\n ', 'offset': 42, 'full_shape': '\n '},
        {'orth': '\n ', 'offset': 44, 'full_shape': '\n '}
    ]
    self.assertEqual(token_attrs, expected)
    self.assertEqual(t.reconstruct_text(tokens), reconstruct_text)
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the rule information and constructing the spacy rules.

    Args:
        nlp: a spacy language model
        rules (Dict): spacy rules, with a "rules" key and an optional "field_name" key
        extractor_name (str): name of the extractor

    Returns:
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    self.field_name = rules["field_name"] if "field_name" in rules else extractor_name
    self.rule_lst = {}
    self.hash_map = {}
    for idx, a_rule in enumerate(self.rules):
        this_rule = Rule(a_rule, self.nlp)
        self.rule_lst[this_rule.identifier + "rule_id##" + str(idx)] = this_rule
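A sketch of the rules dict this constructor expects, inferred from the key accesses above; the per-rule structure is handed opaquely to Rule, so the fields shown inside each rule are assumptions, as is the SpacyRuleExtractor class name (the snippet shows only __init__).

# Hypothetical rules payload; only "rules" and "field_name" are confirmed keys.
sample_rules = {
    "field_name": "test_field",  # optional; falls back to extractor_name
    "rules": [
        # each entry is passed unchanged to Rule(a_rule, self.nlp);
        # "identifier" is assumed from the this_rule.identifier access above
        {"identifier": "rule_0", "pattern": []},
    ]
}
extractor = SpacyRuleExtractor(nlp, sample_rules, "test_extractor")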
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the rule information and constructing the spacy rules.

    Args:
        nlp: a spacy language model
        rules (Dict): spacy rules, with "rules" and "field_name" keys
        extractor_name (str): name of the extractor

    Returns:
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    self.field_name = rules["field_name"]
    self.rule_lst = []
    for a_rule in self.rules:
        this_rule = Rule(a_rule, self.nlp)
        self.rule_lst.append(this_rule)
def test_glossary_extractor(self) -> None:
    t = Tokenizer()
    g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
    ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
    text = 'i live in los angeles. my hometown is Beijing'
    tokens = t.tokenize(text)
    test_result = [i.value for i in ge.extract(tokens)]
    expected = ["Beijing", "Los Angeles"]
    self.assertEqual(test_result, expected)
def test_glossary_extractor(self) -> None:
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
    results = [i.value for i in ge.extract(tokens)]
    expected = ['Beijing', 'los angeles', 'New York']
    self.assertEqual(results, expected)
def test_case_sensitive(self) -> None:
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)
    results = [i.value for i in ge.extract(tokens)]
    expected = ['Beijing', 'New York']
    self.assertEqual(results, expected)
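A standalone sketch pulling the tests above together. The fourth and fifth positional arguments appear to be the maximum n-gram length and a case-sensitivity flag; that reading is an inference from the tests, not a documented API.

# Sketch under the assumptions above: case-insensitive city lookup.
t = Tokenizer()
glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
ge = GlossaryExtractor(glossary, 'cities', t, 2, False)  # 2-grams, case-insensitive
for extraction in ge.extract(t.tokenize('my hometown is Beijing')):
    print(extraction.value)  # -> 'Beijing'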
def setUp(self):
    self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
    extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
    self.results = extractor.extract(self.text)

    # Stored on self so tests such as test_glossary_extractor can reference self.glossary_1.
    self.glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
    self.results2 = ge.extract(tokens)
def __init__(self):
    self.parser = jsonpath_ng.parse
    self.default_nlp = spacy.load('en_core_web_sm')
    self.default_tokenizer = Tokenizer(self.default_nlp)
    self.parsed = dict()
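For context, a sketch of how this lightweight initializer might be exercised; `Core` is a hypothetical name for the enclosing class, which the snippet does not show.

# Assumed class name; only __init__ appears in the source.
core = Core()
tokens = core.default_tokenizer.tokenize('i live in los angeles')  # Tokenizer.tokenize as used in the tests above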
def test_Extractable(self) -> None:
    e = Extractable({
        'extracted_value': [{
            1: 2,
            'das': [1, 2, 3]
        }],
        'confidence': 2.3
    })
    t = Tokenizer()
    tokens = e.get_tokens(t)
    token_attrs = []
    for i in tokens:
        token_attrs.append({
            "orth": i.orth_,
            "offset": i.idx,
            "full_shape": i._.full_shape
        })
    expected_token = [
        {'orth': 'extracted', 'offset': 0, 'full_shape': 'xxxxxxxxx'},
        {'orth': '_', 'offset': 9, 'full_shape': '_'},
        {'orth': 'value', 'offset': 10, 'full_shape': 'xxxxx'},
        {'orth': ':', 'offset': 16, 'full_shape': ':'},
        {'orth': '1', 'offset': 18, 'full_shape': 'd'},
        {'orth': ':', 'offset': 20, 'full_shape': ':'},
        {'orth': '2', 'offset': 22, 'full_shape': 'd'},
        {'orth': 'das', 'offset': 24, 'full_shape': 'xxx'},
        {'orth': ':', 'offset': 28, 'full_shape': ':'},
        {'orth': '1', 'offset': 30, 'full_shape': 'd'},
        {'orth': '2', 'offset': 32, 'full_shape': 'd'},
        {'orth': '3', 'offset': 34, 'full_shape': 'd'},
        {'orth': 'confidence', 'offset': 36, 'full_shape': 'xxxxxxxxxx'},
        {'orth': ':', 'offset': 47, 'full_shape': ':'},
        {'orth': '2.3', 'offset': 49, 'full_shape': 'd.d'}
    ]
    self.assertEqual(token_attrs, expected_token)
    text = e.get_string()
    expected_str = "extracted_value : 1 : 2 das : 1 2 3 confidence : 2.3 "
    self.assertEqual(text, expected_str)