Example #1
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and constructing the spaCy rules
        Args:
            nlp
            rules (Dict): spacy rules
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules[
            "field_name"] if "field_name" in rules else extractor_name
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" +
                          str(idx)] = this_rule
Example #2
    def test_tokenizer(self) -> None:
        text = "dsa@isi.edu 32.4 -32.1 (123)-345-6789, #1  \n \n   "
        reconstruct_text = re.sub(' +', ' ', text)
        t = Tokenizer()
        t.keep_multi_space = False
        tokens = t.tokenize(text)
        token_attrs = []
        for i in tokens:
            token_attrs.append({"orth": i.orth_, "offset": i.idx, "full_shape": i._.full_shape})
        expected = [
            {'orth': 'dsa', 'offset': 0, 'full_shape': 'xxx'},
            {'orth': '@', 'offset': 3, 'full_shape': '@'},
            {'orth': 'isi', 'offset': 4, 'full_shape': 'xxx'},
            {'orth': '.', 'offset': 7, 'full_shape': '.'},
            {'orth': 'edu', 'offset': 8, 'full_shape': 'xxx'},
            {'orth': '32.4', 'offset': 12, 'full_shape': 'dd.d'},
            {'orth': '-', 'offset': 17, 'full_shape': '-'},
            {'orth': '32.1', 'offset': 18, 'full_shape': 'dd.d'},
            {'orth': '(', 'offset': 23, 'full_shape': '('},
            {'orth': '123', 'offset': 24, 'full_shape': 'ddd'},
            {'orth': ')', 'offset': 27, 'full_shape': ')'},
            {'orth': '-', 'offset': 28, 'full_shape': '-'},
            {'orth': '345', 'offset': 29, 'full_shape': 'ddd'},
            {'orth': '-', 'offset': 32, 'full_shape': '-'},
            {'orth': '6789', 'offset': 33, 'full_shape': 'dddd'},
            {'orth': ',', 'offset': 37, 'full_shape': ','},
            {'orth': '#', 'offset': 39, 'full_shape': '#'},
            {'orth': '1', 'offset': 40, 'full_shape': 'd'},
            {'orth': '\n ', 'offset': 42, 'full_shape': '\n '},
            {'orth': '\n ', 'offset': 44, 'full_shape': '\n '}
        ]

        self.assertEqual(token_attrs, expected)
        self.assertEqual(t.reconstruct_text(tokens), reconstruct_text)
Example #3
    def __init__(self,
                 nlp,
                 rules: Dict,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and constructing the spaCy rules
        Args:
            nlp
            rules: Dict
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"]
        self.rule_lst = []
        for a_rule in self.rules:
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst.append(this_rule)
Example #4
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
        text = 'i live in los angeles. my hometown is Beijing'
        tokens = t.tokenize(text)
        test_result = [i.value for i in ge.extract(tokens)]
        expected = ["Beijing", "Los Angeles"]
        self.assertEqual(test_result, expected)
Example #5
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'los angeles', 'New York']

        self.assertEqual(results, expected)
Example #6
    def test_case_sensitive(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)

        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'New York']

        self.assertEqual(results, expected)
Example #7
    def __init__(self, kg_schema=None, modules=None, extract_error_policy="process", logger=None,
                 logger_path=os.path.join(TEMP_DIR, 'etk.log'), ontology=None, generate_json_ld=False,
                 output_kg_only=False, use_spacy_tokenizer=False):

        self.generate_json_ld = generate_json_ld
        self.output_kg_only = output_kg_only

        if logger:
            self.logger = logger
        else:
            logging.basicConfig(
                level=logging.DEBUG,
                format='%(asctime)s %(name)-6s %(levelname)s %(message)s',
                datefmt='%m-%d %H:%M',
                filename=logger_path,
                filemode='w'
            )
            self.logger = logging.getLogger('ETK')

        self.parser = jsonpath_ng.ext.parse
        self.default_nlp = spacy.load('en_core_web_sm')
        if use_spacy_tokenizer:
            self.default_tokenizer = Tokenizer(copy.deepcopy(self.default_nlp))
        else:
            self.default_tokenizer = CrfTokenizer()
        self.parsed = dict()
        self.kg_schema = kg_schema
        self.ontology = ontology
        self.em_lst = list()
        if modules:
            if isinstance(modules, list):
                for module in modules:
                    if isinstance(module, str):
                        self.em_lst.extend(self.load_ems(modules))
                    elif issubclass(module, ETKModule):
                        self.em_lst.append(module(self))
            elif issubclass(modules, ETKModule):
                self.em_lst = [modules(self)]
            else:
                raise NotGetETKModuleError("Not getting extraction module")

        if extract_error_policy.lower() == "throw_extraction":
            self.error_policy = ErrorPolicy.THROW_EXTRACTION
        elif extract_error_policy.lower() == "throw_document":
            self.error_policy = ErrorPolicy.THROW_DOCUMENT
        elif extract_error_policy.lower() == "raise_error":
            self.error_policy = ErrorPolicy.RAISE
        else:
            self.error_policy = ErrorPolicy.PROCESS
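A construction sketch for the signature above; MyEtkModule is a hypothetical ETKModule subclass defined elsewhere, and only the keyword names, which come from the constructor itself, are taken from the source.

# MyEtkModule stands in for an ETKModule subclass; modules also accepts a
# list of subclasses or strings (handled by load_ems), or may be omitted.
etk = ETK(modules=MyEtkModule,
          extract_error_policy="raise_error",
          use_spacy_tokenizer=True)
tokenizer = etk.default_tokenizer  # spaCy-backed Tokenizer when use_spacy_tokenizer=True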
Example #8
    def get_tokens(self, tokenizer: Tokenizer) -> List[Token]:
        """
        Tokenize this Extractable.

        If the value is a string, it returns the tokenized version of that string. Otherwise, the value is first
        converted to a string with the get_string method.

        As it is common to need the same tokens for multiple extractors, the Extractable caches the tokenization
        results, keyed by segment and tokenizer, so that the same segment and tokenizer always return the same
        results. If the same segment is tokenized with a different tokenizer, the results are cached separately.

        Args:
            tokenizer (Tokenizer)

        Returns: a sequence of tokens.
        """

        if (self, tokenizer) in self.tokenize_results:
            return self.tokenize_results[(self, tokenizer)]
        else:
            segment_value_for_tokenize = self.get_string()
            tokens = tokenizer.tokenize(segment_value_for_tokenize)
            self.tokenize_results[(self, tokenizer)] = tokens
            return tokens
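A small sketch of the caching contract described in the docstring; constructing an Extractable directly from a plain value follows the test further down this page, and the sample text is arbitrary.

t = Tokenizer()
e = Extractable("i live in los angeles")
first = e.get_tokens(t)
second = e.get_tokens(t)
assert first is second  # the second call returns the cached token list for (segment, tokenizer)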
Example #9
class SpacyRuleExtractor(Extractor):
    def __init__(self,
                 nlp,
                 rules: Dict,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and constructing the spaCy rules
        Args:
            nlp
            rules: Dict
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"]
        self.rule_lst = []
        for a_rule in self.rules:
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst.append(this_rule)

    def extract(self, text: str) -> List[Extraction]:
        """
        Extract from text
        Args:
            text: str

        Returns: List[Extraction]
        """
        doc = self.tokenizer.tokenize_to_spacy_doc(text)
        self.load_matcher()
        """TODO: add callback function to filter custom constrains
        1. Prefix, suffix
        2. min, max
        3. full shape
        4. is in output
        5. filter overlap
        """
        for idx, start, end in self.matcher(doc):
            print(idx, doc[start:end])

    def load_matcher(self) -> None:
        for idx, a_rule in enumerate(self.rule_lst):
            pattern_flat_lst = [a_pattern.spacy_token_lst for a_pattern in a_rule.patterns]
            for element in itertools.product(*pattern_flat_lst):
                x = list(element)
                print(x)
                self.matcher.add(idx, None, x)
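To make the expansion in load_matcher concrete, here is a standalone sketch of what itertools.product does to the per-pattern lists of alternative token specs (the specs themselves are invented for illustration):

import itertools

pattern_flat_lst = [
    [{"LOWER": "new"}],                       # pattern slot 1: one alternative
    [{"LOWER": "york"}, {"LOWER": "delhi"}],  # pattern slot 2: two alternatives
]
for element in itertools.product(*pattern_flat_lst):
    print(list(element))
# [{'LOWER': 'new'}, {'LOWER': 'york'}]
# [{'LOWER': 'new'}, {'LOWER': 'delhi'}]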
Example #10
    def setUp(self):
        self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                    'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                    'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                    'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                    'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                    'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                    'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                    'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                    'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
        extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
        self.results = extractor.extract(self.text)

        glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(glossary_1, 'test_glossary', t, 3, False)
        self.results2 = ge.extract(tokens)
Example #11
    def __init__(self):
        self.parser = jsonpath_ng.parse
        self.default_nlp = spacy.load('en_core_web_sm')
        self.default_tokenizer = Tokenizer(self.default_nlp)
        self.parsed = dict()
Example #12
class SpacyRuleExtractor(Extractor):
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and constructing the spaCy rules
        Args:
            nlp
            rules (Dict): spacy rules
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules[
            "field_name"] if "field_name" in rules else extractor_name
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" +
                          str(idx)] = this_rule

    def extract(self, text: str) -> List[Extraction]:
        """
        Extract from text
        Args:
            text: str

        Returns: List[Extraction]
        """

        doc = self.tokenizer.tokenize_to_spacy_doc(text)
        self.load_matcher()

        matches = [x for x in self.matcher(doc) if x[1] != x[2]]
        pos_filtered_matches = []
        neg_filtered_matches = []
        for idx, start, end in matches:
            span_doc = self.tokenizer.tokenize_to_spacy_doc(
                doc[start:end].text)
            this_spacy_rule = self.matcher.get(idx)
            relations = self.find_relation(span_doc, this_spacy_rule)
            rule_id, _ = self.hash_map[idx]
            this_rule = self.rule_lst[rule_id]
            if self.filter_match(doc[start:end], relations,
                                 this_rule.patterns):
                value = self.form_output(doc[start:end],
                                         this_rule.output_format, relations,
                                         this_rule.patterns)
                if this_rule.polarity:
                    pos_filtered_matches.append(
                        (start, end, value, rule_id, relations))
                else:
                    neg_filtered_matches.append(
                        (start, end, value, rule_id, relations))

        return_lst = []
        if pos_filtered_matches:
            longest_lst_pos = self.get_longest(pos_filtered_matches)
            if neg_filtered_matches:
                longest_lst_neg = self.get_longest(neg_filtered_matches)
                return_lst = self.reject_neg(longest_lst_pos, longest_lst_neg)
            else:
                return_lst = longest_lst_pos

        extractions = []
        for (start, end, value, rule_id, relation) in return_lst:
            this_extraction = Extraction(value=value,
                                         extractor_name=self.name,
                                         start_token=start,
                                         end_token=end,
                                         start_char=doc[start].idx,
                                         end_char=doc[end - 1].idx +
                                         len(doc[end - 1]),
                                         rule_id=rule_id.split("rule_id##")[0],
                                         match_mapping=relation)
            extractions.append(this_extraction)

        return extractions

    def load_matcher(self) -> None:
        """
        Add constructed spacy rule to Matcher
        """
        for id_key in self.rule_lst:
            if self.rule_lst[id_key].active:
                pattern_lst = [
                    a_pattern.spacy_token_lst
                    for a_pattern in self.rule_lst[id_key].patterns
                ]

                for spacy_rule_id, spacy_rule in enumerate(
                        itertools.product(*pattern_lst)):
                    self.matcher.add(self.construct_key(id_key, spacy_rule_id),
                                     None, list(spacy_rule))

    def filter_match(self, span: span, relations: Dict,
                     patterns: List) -> bool:
        """
        Filter the match result according to prefix, suffix, min, max ...
        Args:
            span: span
            relations: Dict
            patterns: List of pattern

        Returns: bool
        """

        for pattern_id, a_pattern in enumerate(patterns):
            token_range = relations[pattern_id]
            if token_range:
                tokens = [x for x in span[token_range[0]:token_range[1]]]
                if a_pattern.type == "word":
                    if not self.pre_suf_fix_filter(tokens, a_pattern.prefix,
                                                   a_pattern.suffix):
                        return False
                if a_pattern.type == "shape":
                    if not (self.full_shape_filter(tokens,
                                                   a_pattern.full_shape)
                            and self.pre_suf_fix_filter(
                                tokens, a_pattern.prefix, a_pattern.suffix)):
                        return False
                if a_pattern.type == "number":
                    if not self.min_max_filter(tokens, a_pattern.min,
                                               a_pattern.max):
                        return False
        return True

    @staticmethod
    def get_longest(value_lst: List) -> List:
        """
        Get the longest match among overlapping matches
        Args:
            value_lst: List

        Returns: List
        """

        value_lst.sort()
        result = []
        pivot = value_lst[0]
        start, end = pivot[0], pivot[1]
        pivot_e = end
        pivot_s = start
        for idx, (s, e, v, rule_id, _) in enumerate(value_lst):
            if s == pivot_s and pivot_e < e:
                pivot_e = e
                pivot = value_lst[idx]
            elif s != pivot_s and pivot_e < e:
                result.append(pivot)
                pivot = value_lst[idx]
                pivot_e = e
                pivot_s = s
        result.append(pivot)
        return result

    @staticmethod
    def reject_neg(pos_lst: List, neg_lst: List) -> List:
        """
        Reject positive matches that overlap with negative matches
        Args:
            pos_lst: List
            neg_lst: List

        Returns: List
        """

        pos_lst.sort()
        neg_lst.sort()
        result = []
        pivot_pos = pos_lst[0]
        pivot_neg = neg_lst[0]
        while pos_lst:
            if pivot_pos[1] <= pivot_neg[0]:
                result.append(pivot_pos)
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
            elif pivot_pos[0] >= pivot_neg[1]:
                neg_lst.pop(0)
                if not neg_lst:
                    result += pos_lst
                    break
                else:
                    pivot_neg = neg_lst[0]
            else:
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
        return result

    @staticmethod
    def pre_suf_fix_filter(t: List, prefix: str, suffix: str) -> bool:
        """
        Prefix and Suffix filter
        Args:
            t: List, list of tokens
            prefix: str
            suffix: str

        Returns: bool
        """

        if prefix:
            for a_token in t:
                if a_token._.n_prefix(len(prefix)) != prefix:
                    return False
        if suffix:
            for a_token in t:
                if a_token._.n_suffix(len(suffix)) != suffix:
                    return False

        return True

    @staticmethod
    def min_max_filter(t: List, min_v: str, max_v: str) -> bool:
        """
        Min and Max filter
        Args:
            t: List, list of tokens
            min_v: str
            max_v: str

        Returns: bool
        """
        def tofloat(value):
            try:
                return float(value)
            except ValueError:
                return False

        for a_token in t:
            if not tofloat(a_token.text):
                return False
            else:
                if min_v and tofloat(min_v):
                    this_v = tofloat(a_token.text)
                    if this_v < tofloat(min_v):
                        return False
                if max_v and tofloat(max_v):
                    this_v = tofloat(a_token.text)
                    if this_v > tofloat(max_v):
                        return False

        return True

    @staticmethod
    def full_shape_filter(t: List, shapes: List) -> bool:
        """
        Shape filter
        Args:
            t: List, list of tokens
            shapes: List

        Returns: bool
        """

        if shapes:
            for a_token in t:
                if a_token._.full_shape not in shapes:
                    return False

        return True

    @staticmethod
    def form_output(span_doc: span, output_format: str, relations: Dict,
                    patterns: List) -> str:
        """
        Form the output value according to the user-specified output_format
        Args:
            span_doc: span
            output_format: str
            relations: Dict
            patterns: List

        Returns: str
        """

        format_value = []
        output_inf = [a_pattern.in_output for a_pattern in patterns]
        for i in range(len(output_inf)):
            token_range = relations[i]
            if token_range and output_inf[i]:
                format_value.append(
                    span_doc[token_range[0]:token_range[1]].text)

        if not output_format:
            return " ".join(format_value)

        result_str = ""
        s = list(output_format)
        t1 = s.pop(0)
        t2 = s.pop(0)
        while 1:
            t3 = s.pop(0)
            if t1 == '{' and t2.isdigit() and t3 == '}':
                if int(t2) > len(format_value):
                    return result_str + t1 + t2 + t3 + "".join(s)
                result_str += format_value[int(t2) - 1]
                if not s:
                    break
                t1 = s.pop(0)
                if not s:
                    result_str += t1
                    break
                t2 = s.pop(0)
                if not s:
                    result_str += t2
                    break
            else:
                result_str += t1
                t1 = t2
                t2 = t3
                if not s:
                    result_str += t1
                    result_str += t2
                    break
        return result_str

    def construct_key(self, rule_id: str, spacy_rule_id: int) -> int:
        """
        Use a mapping to store the rule_id information for each match; the mapping key is created here
        Args:
            rule_id: str
            spacy_rule_id:int

        Returns: int
        """

        hash_key = (rule_id, spacy_rule_id)
        hash_v = hash(hash_key) + sys.maxsize + 1
        self.hash_map[hash_v] = hash_key
        return hash_v

    def find_relation(self, span_doc: doc, r: List) -> Dict:
        """
        Get the relations between each pattern in the spaCy rule and the matches
        Args:
            span_doc: doc
            r: List

        Returns: Dict
        """

        rule = r[1][0]
        span_pivot = 0
        relation = {}
        for e_id, element in enumerate(rule):
            if not span_doc[span_pivot:]:
                for extra_id, _, in enumerate(rule[e_id:]):
                    relation[e_id + extra_id] = None
                break
            new_doc = self.tokenizer.tokenize_to_spacy_doc(
                span_doc[span_pivot:].text)
            if "OP" not in element:
                relation[e_id] = (span_pivot, span_pivot + 1)
                span_pivot += 1
            else:
                if e_id < len(rule) - 1:
                    tmp_rule_1 = [rule[e_id]]
                    tmp_rule_2 = [rule[e_id + 1]]
                    tmp_matcher = Matcher(self.nlp.vocab)
                    tmp_matcher.add(0, None, tmp_rule_1)
                    tmp_matcher.add(1, None, tmp_rule_2)
                    tmp_matches = sorted(
                        [x for x in tmp_matcher(new_doc) if x[1] != x[2]],
                        key=lambda a: a[1])

                    if not tmp_matches:
                        relation[e_id] = None
                    else:
                        matches_1 = [
                            x for x in tmp_matches if x[0] == 0 and x[1] == 0
                        ]
                        if not matches_1:
                            relation[e_id] = None
                        else:
                            _, s1, e1 = matches_1[0]
                            matches_2 = [x for x in tmp_matches if x[0] == 1]
                            if not matches_2:
                                relation[e_id] = (span_pivot, span_pivot + e1)
                                span_pivot += e1
                            else:
                                _, s2, e2 = matches_2[0]
                                if e1 <= s2:
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + e1)
                                    span_pivot += e1
                                else:
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + s2)
                                    span_pivot += s2
                else:
                    relation[e_id] = (span_pivot, len(span_doc))

        return relation
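Since the helper methods above are static, their behaviour can be checked in isolation. A minimal sketch, assuming SpacyRuleExtractor and Tokenizer are the classes shown on this page and en_core_web_sm is installed; the match tuples, token ranges, and pattern stand-ins are invented for illustration.

from types import SimpleNamespace
import spacy

# get_longest keeps only the longest of overlapping (start, end, ...) matches.
matches = [(0, 2, "New York", "r1", {}),
           (0, 3, "New York City", "r1", {}),
           (4, 5, "Beijing", "r2", {})]
print(SpacyRuleExtractor.get_longest(matches))
# [(0, 3, 'New York City', 'r1', {}), (4, 5, 'Beijing', 'r2', {})]

# reject_neg drops positive matches that overlap a negative match.
pos = [(0, 2, "v1", "r1", {}), (5, 7, "v2", "r1", {})]
neg = [(1, 3, "n1", "r2", {})]
print(SpacyRuleExtractor.reject_neg(pos, neg))
# [(5, 7, 'v2', 'r1', {})]

# min_max_filter and full_shape_filter work on tokens from the etk Tokenizer,
# which provides the custom full_shape attribute seen in the tests above.
t = Tokenizer()
tokens = list(t.tokenize("12.5"))
print(SpacyRuleExtractor.min_max_filter(tokens, "10", "20"))   # True
print(SpacyRuleExtractor.full_shape_filter(tokens, ["dd.d"]))  # True

# form_output resolves "{n}" placeholders against the captured pattern texts.
# A plain spaCy doc stands in for the matched span; SimpleNamespace objects
# stand in for Pattern objects (only in_output is read here).
nlp = spacy.load("en_core_web_sm")
span_doc = nlp("Los Angeles CA")
patterns = [SimpleNamespace(in_output=True), SimpleNamespace(in_output=True)]
relations = {0: (0, 2), 1: (2, 3)}
print(SpacyRuleExtractor.form_output(span_doc, "{2}, {1}", relations, patterns))
# CA, Los Angeles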
Example #13
    def test_Extractable(self) -> None:
        e = Extractable({
            'extracted_value': [{
                1: 2,
                'das': [1, 2, 3]
            }],
            'confidence': 2.3
        })
        t = Tokenizer()
        tokens = e.get_tokens(t)
        token_attrs = []
        for i in tokens:
            token_attrs.append({
                "orth": i.orth_,
                "offset": i.idx,
                "full_shape": i._.full_shape
            })
        expected_token = [{
            'orth': 'extracted',
            'offset': 0,
            'full_shape': 'xxxxxxxxx'
        }, {
            'orth': '_',
            'offset': 9,
            'full_shape': '_'
        }, {
            'orth': 'value',
            'offset': 10,
            'full_shape': 'xxxxx'
        }, {
            'orth': ':',
            'offset': 16,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 18,
            'full_shape': 'd'
        }, {
            'orth': ':',
            'offset': 20,
            'full_shape': ':'
        }, {
            'orth': '2',
            'offset': 22,
            'full_shape': 'd'
        }, {
            'orth': 'das',
            'offset': 24,
            'full_shape': 'xxx'
        }, {
            'orth': ':',
            'offset': 28,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 30,
            'full_shape': 'd'
        }, {
            'orth': '2',
            'offset': 32,
            'full_shape': 'd'
        }, {
            'orth': '3',
            'offset': 34,
            'full_shape': 'd'
        }, {
            'orth': 'confidence',
            'offset': 36,
            'full_shape': 'xxxxxxxxxx'
        }, {
            'orth': ':',
            'offset': 47,
            'full_shape': ':'
        }, {
            'orth': '2.3',
            'offset': 49,
            'full_shape': 'd.d'
        }]

        self.assertEqual(token_attrs, expected_token)
        text = e.get_string()
        expected_str = "extracted_value : 1 : 2  das : 1  2  3        confidence : 2.3  "

        self.assertEqual(text, expected_str)