Example #1
from spacy.matcher import Matcher


def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matcher.remove("Rule")
    assert "Rule" not in matcher
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
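
A minimal usage sketch for the API exercised above (assuming spaCy v2.x, where Matcher.add takes an on_match callback as its second argument); the "this is a test" text is made up for illustration:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("Rule", None, [{"ORTH": "test"}])

doc = nlp("this is a test")
for match_id, start, end in matcher(doc):
    # Prints the rule name and the matched span, e.g. "Rule test"
    print(nlp.vocab.strings[match_id], doc[start:end].text)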
Example #2
import re

import spacy
from spacy.matcher import Matcher


class nerutils:
    '''
    Class to implement NER on the text input

    Attributes
    ------------
    self.nlp: NLP object
    self.matcher: spaCy's Matcher object
    self.pattern_name, self.pattern_period_stay, self.pattern_residential_addr:
                Patterns used by the self.matcher.add() call
    self.pattern_relationship: List
                Contains the possible values of relationship which are used to match
                the relationship field in the loan document.
    self.dict_regex: Dictionary
                Python dictionary containing the regular expressions which are used
                to parse structured fields such as email id, mobile no. and PAN no.
    self.dict_cond: Dictionary
                Contains boolean values corresponding to the fields (keys) in the loan
                document, indicating whether each result is valid or not.
    self.key: String
                Name of the field currently being processed; used to set the
                corresponding entry of self.dict_cond to True if valid.
    '''

    def __init__(self):

        self.nlp = spacy.load("en_core_web_sm")
        #Creating a matcher object
        self.matcher = Matcher(self.nlp.vocab)

        person_label = self.nlp.vocab.strings["PERSON"]
        date_label = self.nlp.vocab.strings["DATE"]
        country_label = self.nlp.vocab.strings["GPE"]


        #Patterns which are to be fed to the spacy matcher
        self.pattern_name = [{"ENT_TYPE":person_label}]
        self.pattern_period_stay = [{"ENT_TYPE":date_label}]
        self.pattern_residential_addr = [{"ENT_TYPE":country_label}]

        
        self.pattern_relationship = ["wife", "husband", "father", "mother",
                                     "grandfather", "grandmother", "brother",
                                     "sister", "uncle", "aunt"]


        self.dict_pattern = {'name': self.pattern_name,
                             'pan_no': [],
                             'father_name': self.pattern_name,
                             'relationship': [],
                             'residential_addr': self.pattern_residential_addr,
                             'period_stay': self.pattern_period_stay,
                             'tel_no': [],
                             'mobile_no': [],
                             'email': []}
        
        #Regex for parsing pan_no, mobile_no and email
        self.dict_regex = {'pan_no':re.compile(r'(?P<pan_no>^([a-zA-Z]){5}([0-9]){4}([a-zA-Z]){1}?$)'),
                           "mobile_no":re.compile(r'(?P<mobile_no>^91[-\s]??\d{10}$)'),
                           "email":re.compile(r'(?P<email>^([a-zA-Z0-9_\-\.]+)(@\s)?([a-zA-Z0-9_\-\.]+)(\.\s)?([a-zA-Z]{2,5})$)')}

        
        self.dict_cond = {}
        self.key = None

    def callback_fn(self, matcher, doc, i, matches):
        '''
        Callback function for the matcher object.
        Sets the value corresponding to self.key in self.dict_cond to True.

        Parameters
        ---------------
        matcher: Matcher object
        doc: Doc object the matcher was run over
        i:  Integer
            Index of the current match
        matches: List
                List of (match_id, start, end) tuples for all matches in the doc
        '''
        self.dict_cond[self.key] = True

    def check_ocr(self, dict_ocr):
        '''
        Function to check the output of OCR using rule matching and NER

        Parameters
        -----------------
        dict_ocr: Dictionary
                Dictionary with field names as keys and lists of OCR output
                strings as the corresponding values

        Returns
        -------------------
        self.dict_cond: Dict
                Dictionary with field names as keys and bool values indicating
                whether the corresponding fields are valid or not.
        '''
        print("dict_ocr:",dict_ocr)

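        # The loop variable is deliberately the instance attribute self.key so
        # that callback_fn, which receives no field name from the Matcher, can
        # record which field is currently being validated.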
        for self.key, value_ls in dict_ocr.items():

            #For the keys below we use simple list membership / regex based matching
            if self.key in ('relationship', 'pan_no', 'mobile_no', 'email'):
                for value in value_ls:
                    value = value.lower()  #Converting to lowercase
                    if self.key == 'relationship':
                        if value in self.pattern_relationship:
                            self.dict_cond[self.key] = True
                    else:
                        match = self.dict_regex[self.key].search(value)
                        if match is not None:
                            self.dict_cond[self.key] = True

            #For the keys below we make use of NER via spaCy's matcher object
            if self.key in ('name', 'period_stay', 'residential_addr', 'father_name'):
                print("self.key", self.key)
                self.matcher.add(str(self.key), self.callback_fn, self.dict_pattern[self.key])
            


            if self.matcher:
                for value in value_ls:
                    print("value:", value)
                    doc = self.nlp(value)
                    #For debug
                    # ents = list(doc.ents)
                    # print("ents", ents)
                    # for i in range(0, len(ents)):
                    #     print("{},{}".format(ents[i].text, ents[i].label_))
                    # print([t.text for t in doc])

                    matches = self.matcher(doc)
                    print("matches:", matches)

                # Only the key currently being processed can have been added to
                # the matcher; fetch its patterns, then reinitialize the matcher
                # so its rules do not leak into the next field.
                if self.key in self.matcher:
                    print("self.key", self.key)
                    on_match, patterns = self.matcher.get(self.key)
                    print("patterns:", patterns)
                    #Clear the self.matcher object and reinitialize it
                    self.matcher = Matcher(self.nlp.vocab)

        # print("self.dict_cond",self.dict_cond)
        return self.dict_cond
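
A hypothetical invocation of the class above; the field names mirror the keys that check_ocr() expects, but the OCR strings themselves are made up, so the exact contents of the returned dictionary will depend on the model's entity predictions:

ner = nerutils()
result = ner.check_ocr({
    "name": ["John Smith"],
    "relationship": ["father"],
    "pan_no": ["ABCDE1234F"],
})
print(result)  # e.g. {'name': True, 'relationship': True, 'pan_no': True}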
Example #3
import spacy
from spacy.matcher import Matcher

import nlp_pb2
import nlp_pb2_grpc
import utils


class NlpService(nlp_pb2_grpc.NlpServicer):
    def __init__(self):
        self.modelName = None
        self.nlp = None
        self.matcher = None

    def LoadModel(self, request, context):
        self.modelName = request.text
        self.nlp = spacy.load(request.text)
        response = nlp_pb2.TextResponse()
        response.message = "Model loaded '{}'".format(request.text)
        return response

    def NlpProcess(self, request, context):
        doc = self.nlp(request.text)
        response = utils.doc2proto(doc, self.modelName)
        return response

    def DocSimilarity(self, request, context):
        docA = self.nlp(request.texta)
        docB = self.nlp(request.textb)
        response = nlp_pb2.TextSimilarity()
        response.similarity = docA.similarity(docB)
        return response

    def AddRule(self, request, context):
        if self.matcher is None:
            self.matcher = Matcher(self.nlp.vocab)
        matcher_id = request.id
        patterns = [{pat.key: pat.value} for pat in request.patterns]
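        # Each request pattern entry becomes one token spec; together they form
        # a single pattern in spaCy v2's add(key, on_match, *patterns) API.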
        self.matcher.add(matcher_id, None, patterns)
        response = nlp_pb2.TextResponse()
        response.message = "Rule with id '{}' added to matcher.".format(
            matcher_id)
        return response

    def RemoveRule(self, request, context):
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exist in matcher")
        self.matcher.remove(request.text)
        return nlp_pb2.TextResponse(
            message="Rule with id '{}' removed from matcher.".format(
                request.text))

    def GetRule(self, request, context):
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exist in matcher")
        _, patterns = self.matcher.get(request.text)
        return nlp_pb2.Rule(
            id=request.text,
            patterns=[
                nlp_pb2.Pattern(key=list(pat.keys())[0],
                                value=list(pat.values())[0])
                for pat in patterns[0]
            ],
        )

    def GetMatches(self, request, context):
        if self.matcher is None:
            return nlp_pb2.Matches()
        doc = self.nlp(request.text)
        matches = self.matcher(doc)
        response = nlp_pb2.Matches(matches=[
            nlp_pb2.Match(id=str(i[0]), start=i[1], end=i[2]) for i in matches
        ])
        return response

    def ResetMatcher(self, request, context):
        self.matcher = None
        return nlp_pb2.TextResponse(message="Matcher object reset successful.")
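
A hypothetical server wiring for the servicer above; nlp_pb2_grpc comes from the source, and add_NlpServicer_to_server is the registration function grpcio conventionally generates for a service named "Nlp":

import grpc
from concurrent import futures

server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
nlp_pb2_grpc.add_NlpServicer_to_server(NlpService(), server)
server.add_insecure_port("[::]:50051")
server.start()
server.wait_for_termination()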
Example #4
import copy
import itertools
import sys
from typing import Dict, List

from spacy.matcher import Matcher
# Doc/Span aliases assumed here for the lowercase "doc"/"span" annotations below
from spacy.tokens import Doc as doc, Span as span

# Extractor, InputType, Extraction, Rule and Tokenizer are provided by the
# surrounding package this class was excerpted from.


class SpacyRuleExtractor(Extractor):
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and constructing spaCy rules
        Args:
            nlp: a loaded spaCy language object
            rules (Dict): spacy rules
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"] if "field_name" in rules else extractor_name
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" +
                          str(idx)] = this_rule

    def extract(self, text: str) -> List[Extraction]:
        """
        Extract from text
        Args:
            text: str

        Returns: List[Extraction]
        """

        doc = self.tokenizer.tokenize_to_spacy_doc(text)
        self.load_matcher()

        matches = [x for x in self.matcher(doc) if x[1] != x[2]]
        pos_filtered_matches = []
        neg_filtered_matches = []
        for idx, start, end in matches:
            span_doc = self.tokenizer.tokenize_to_spacy_doc(
                doc[start:end].text)
            this_spacy_rule = self.matcher.get(idx)
            relations = self.find_relation(span_doc, this_spacy_rule)
            rule_id, _ = self.hash_map[idx]
            this_rule = self.rule_lst[rule_id]
            if self.filter_match(doc[start:end], relations,
                                 this_rule.patterns):
                value = self.form_output(doc[start:end],
                                         this_rule.output_format, relations,
                                         this_rule.patterns)
                if this_rule.polarity:
                    pos_filtered_matches.append(
                        (start, end, value, rule_id, relations))
                else:
                    neg_filtered_matches.append(
                        (start, end, value, rule_id, relations))

        return_lst = []
        if pos_filtered_matches:
            longest_lst_pos = self.get_longest(pos_filtered_matches)
            if neg_filtered_matches:
                longest_lst_neg = self.get_longest(neg_filtered_matches)
                return_lst = self.reject_neg(longest_lst_pos, longest_lst_neg)
            else:
                return_lst = longest_lst_pos

        extractions = []
        for (start, end, value, rule_id, relation) in return_lst:
            this_extraction = Extraction(value=value,
                                         extractor_name=self.name,
                                         start_token=start,
                                         end_token=end,
                                         start_char=doc[start].idx,
                                         end_char=doc[end - 1].idx +
                                         len(doc[end - 1]),
                                         rule_id=rule_id.split("rule_id##")[0],
                                         match_mapping=relation)
            extractions.append(this_extraction)

        return extractions

    def load_matcher(self) -> None:
        """
        Add constructed spacy rule to Matcher
        """
        for id_key in self.rule_lst:
            if self.rule_lst[id_key].active:
                pattern_lst = [
                    a_pattern.spacy_token_lst
                    for a_pattern in self.rule_lst[id_key].patterns
                ]

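                # itertools.product expands per-slot alternatives into every
                # concrete token sequence, so one rule may register several
                # spaCy patterns, each under its own hashed key.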
                for spacy_rule_id, spacy_rule in enumerate(
                        itertools.product(*pattern_lst)):
                    self.matcher.add(self.construct_key(id_key, spacy_rule_id),
                                     None, list(spacy_rule))

    def filter_match(self, span: span, relations: Dict,
                     patterns: List) -> bool:
        """
        Filter the match result according to prefix, suffix, min, max ...
        Args:
            span: span
            relations: Dict
            patterns: List of pattern

        Returns: bool
        """

        for pattern_id, a_pattern in enumerate(patterns):
            token_range = relations[pattern_id]
            if token_range:
                tokens = [x for x in span[token_range[0]:token_range[1]]]
                if a_pattern.type == "word":
                    if not self.pre_suf_fix_filter(tokens, a_pattern.prefix,
                                                   a_pattern.suffix):
                        return False
                if a_pattern.type == "shape":
                    if not (self.full_shape_filter(tokens,
                                                   a_pattern.full_shape)
                            and self.pre_suf_fix_filter(
                                tokens, a_pattern.prefix, a_pattern.suffix)):
                        return False
                if a_pattern.type == "number":
                    if not self.min_max_filter(tokens, a_pattern.min,
                                               a_pattern.max):
                        return False
        return True

    @staticmethod
    def get_longest(value_lst: List) -> List:
        """
        Get the longest match for overlap
        Args:
            value_lst: List

        Returns: List
        """

        value_lst.sort()
        result = []
        pivot = value_lst[0]
        start, end = pivot[0], pivot[1]
        pivot_e = end
        pivot_s = start
        for idx, (s, e, v, rule_id, _) in enumerate(value_lst):
            if s == pivot_s and pivot_e < e:
                pivot_e = e
                pivot = value_lst[idx]
            elif s != pivot_s and pivot_e < e:
                result.append(pivot)
                pivot = value_lst[idx]
                pivot_e = e
                pivot_s = s
        result.append(pivot)
        return result

    @staticmethod
    def reject_neg(pos_lst: List, neg_lst: List) -> List:
        """
        Reject some positive matches according to negative matches
        Args:
            pos_lst: List
            neg_lst: List

        Returns: List
        """

        pos_lst.sort()
        neg_lst.sort()
        result = []
        pivot_pos = pos_lst[0]
        pivot_neg = neg_lst[0]
        while pos_lst:
            if pivot_pos[1] <= pivot_neg[0]:
                result.append(pivot_pos)
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
            elif pivot_pos[0] >= pivot_neg[1]:
                neg_lst.pop(0)
                if not neg_lst:
                    result += pos_lst
                    break
                else:
                    pivot_neg = neg_lst[0]
            else:
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
        return result

    @staticmethod
    def pre_suf_fix_filter(t: List, prefix: str, suffix: str) -> bool:
        """
        Prefix and Suffix filter
        Args:
            t: List, list of tokens
            prefix: str
            suffix: str

        Returns: bool
        """

        if prefix:
            for a_token in t:
                if a_token._.n_prefix(len(prefix)) != prefix:
                    return False
        if suffix:
            for a_token in t:
                if a_token._.n_suffix(len(suffix)) != suffix:
                    return False

        return True

    @staticmethod
    def min_max_filter(t: List, min_v: str, max_v: str) -> bool:
        """
        Min and Max filter
        Args:
            t: List, list of tokens
            min_v: str
            max_v: str

        Returns: bool
        """
        def tofloat(value):
            # Return None (not False) on failure so that a legitimate value
            # of 0.0 is not mistaken for an unparseable token.
            try:
                return float(value)
            except ValueError:
                return None

        for a_token in t:
            this_v = tofloat(a_token.text)
            if this_v is None:
                return False
            if min_v and tofloat(min_v) is not None and this_v < tofloat(min_v):
                return False
            if max_v and tofloat(max_v) is not None and this_v > tofloat(max_v):
                return False

        return True

    @staticmethod
    def full_shape_filter(t: List, shapes: List) -> bool:
        """
        Shape filter
        Args:
            t: List, list of tokens
            shapes: List

        Returns: bool
        """

        if shapes:
            for a_token in t:
                if a_token._.full_shape not in shapes:
                    return False

        return True

    @staticmethod
    def form_output(span_doc: span, output_format: str, relations: Dict,
                    patterns: List) -> str:
        """
        Form an output value according to user input of output_format
        Args:
            span_doc: span
            format: str
            relations: Dict
            patterns: List

        Returns: str
        """

        format_value = []
        output_inf = [a_pattern.in_output for a_pattern in patterns]
        for i in range(len(output_inf)):
            token_range = relations[i]
            if token_range and output_inf[i]:
                format_value.append(
                    span_doc[token_range[0]:token_range[1]].text)

        if not output_format:
            return " ".join(format_value)

        result_str = ""
        s = list(output_format)
        t1 = s.pop(0)
        t2 = s.pop(0)
        while True:
            t3 = s.pop(0)
            if t1 == '{' and t2.isdigit() and t3 == '}':
                if int(t2) > len(format_value):
                    return result_str + t1 + t2 + t3 + "".join(s)
                result_str += format_value[int(t2) - 1]
                if not s:
                    break
                t1 = s.pop(0)
                if not s:
                    result_str += t1
                    break
                t2 = s.pop(0)
                if not s:
                    result_str += t2
                    break
            else:
                result_str += t1
                t1 = t2
                t2 = t3
                if not s:
                    result_str += t1
                    result_str += t2
                    break
        return result_str
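
    # Illustrative behavior of form_output (values invented): with
    # format_value == ["John", "2021"] and output_format == "{1} since {2}",
    # the scanner above returns "John since 2021"; a placeholder whose index
    # exceeds len(format_value), such as "{3}", is left in the output verbatim.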

    def construct_key(self, rule_id: str, spacy_rule_id: int) -> int:
        """
        Use a mapping to store the information about rule_id for each matches, create the mapping key here
        Args:
            rule_id: str
            spacy_rule_id:int

        Returns: int
        """

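        # hash() can be negative; shifting by sys.maxsize + 1 yields a
        # non-negative integer that can be used both as a Matcher key and as
        # the lookup key into self.hash_map when matches come back.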
        hash_key = (rule_id, spacy_rule_id)
        hash_v = hash(hash_key) + sys.maxsize + 1
        self.hash_map[hash_v] = hash_key
        return hash_v

    def find_relation(self, span_doc: doc, r: List) -> Dict:
        """
        Get the relations between the each pattern in the spacy rule and the matches
        Args:
            span_doc: doc
            r: List

        Returns: Dict
        """

        rule = r[1][0]
        span_pivot = 0
        relation = {}
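        # Walk the rule token-by-token: elements without an "OP" consume exactly
        # one token of the span, while "OP" elements are re-matched locally
        # (together with the next element) to find how many tokens they consumed.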
        for e_id, element in enumerate(rule):
            if not span_doc[span_pivot:]:
                for extra_id, _ in enumerate(rule[e_id:]):
                    relation[e_id + extra_id] = None
                break
            new_doc = self.tokenizer.tokenize_to_spacy_doc(
                span_doc[span_pivot:].text)
            if "OP" not in element:
                relation[e_id] = (span_pivot, span_pivot + 1)
                span_pivot += 1
            else:
                if e_id < len(rule) - 1:
                    tmp_rule_1 = [rule[e_id]]
                    tmp_rule_2 = [rule[e_id + 1]]
                    tmp_matcher = Matcher(self.nlp.vocab)
                    tmp_matcher.add(0, None, tmp_rule_1)
                    tmp_matcher.add(1, None, tmp_rule_2)
                    tmp_matches = sorted(
                        [x for x in tmp_matcher(new_doc) if x[1] != x[2]],
                        key=lambda a: a[1])

                    if not tmp_matches:
                        relation[e_id] = None
                    else:
                        matches_1 = [
                            x for x in tmp_matches if x[0] == 0 and x[1] == 0
                        ]
                        if not matches_1:
                            relation[e_id] = None
                        else:
                            _, s1, e1 = matches_1[0]
                            matches_2 = [x for x in tmp_matches if x[0] == 1]
                            if not matches_2:
                                relation[e_id] = (span_pivot, span_pivot + e1)
                                span_pivot += e1
                            else:
                                _, s2, e2 = matches_2[0]
                                if e1 <= s2:
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + e1)
                                    span_pivot += e1
                                else:
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + s2)
                                    span_pivot += s2
                else:
                    relation[e_id] = (span_pivot, len(span_doc))

        return relation
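
A hypothetical usage sketch for the extractor above; the rules-dict layout ("field_name", "rules") follows the constructor, but the inner rule schema is defined by the Rule class, which is not shown here, so the list is left empty for illustration:

import spacy

nlp = spacy.load("en_core_web_sm")
rules = {
    "field_name": "test_field",
    "rules": [],  # rule dicts understood by Rule(a_rule, nlp) would go here
}
extractor = SpacyRuleExtractor(nlp, rules, "my_spacy_rule_extractor")
for extraction in extractor.extract("some input text"):
    print(extraction.value)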