def test_matcher_from_api_docs(en_vocab):
    """Exercise the add / remove / get / membership API of ``Matcher``.

    The test was previously defined twice with identical bodies; the second
    definition shadowed the first, so a single copy is kept.

    Args:
        en_vocab: vocabulary fixture handed to the ``Matcher`` constructor.
    """
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]

    # A fresh matcher holds no rules.
    assert len(matcher) == 0

    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1

    matcher.remove("Rule")
    assert "Rule" not in matcher

    # Re-adding after removal must make the rule visible again.
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher

    # get() yields the (callback, patterns) pair registered for the key.
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
class nerutils:
    '''
    Class to implement NER on the text input

    Attributes
    ------------
    self.nlp: NLP Object
    self.matcher: spaCy's matcher object
    self.pattern_name,self.pattern_period_stay,self.pattern_residential_addr:
        Patterns used by the self.matcher.add() call
    self.pattern_relationship: List
        Contains the possible values of relationship which are used to match
        the relationship field in the loan document.
    self.dict_pattern: Dictionary
        Maps each field name to the matcher pattern used for it (an empty
        list for fields that are not checked with the matcher).
    self.dict_regex: Dictionary
        Python dictionary containing the regular expressions which are used
        to parse structured fields such as email-id, mobile no and pan_no
    self.dict_cond: Dictionary
        Maps a field name to True once one of its values has been found
        valid. Fields that never validate are simply absent, and the
        dictionary is not cleared between check_ocr() calls.
    self.key: String
        Name of the field currently being checked; read by callback_fn to
        know which entry of self.dict_cond to set.
    '''

    # Fields validated with plain string / regex checks.
    _TEXT_FIELDS = ('relationship', 'pan_no', 'mobile_no', 'email')
    # Fields validated through spaCy NER and the matcher.
    _NER_FIELDS = ('name', 'period_stay', 'residential_addr', 'father_name')

    # Regex for parsing pan_no, mobile_no and email.
    # The email pattern requires the '@' and the '.' before the TLD, each
    # optionally followed by one whitespace character. The previous form,
    # '(@\s)?' / '(\.\s)?', made both separators optional and demanded a
    # whitespace after them, so "a@b.com" was rejected while any plain
    # alphanumeric word was accepted. Group numbering is unchanged.
    _FIELD_REGEX = {
        'pan_no': re.compile(
            r'(?P<pan_no>^([a-zA-Z]){5}([0-9]){4}([a-zA-Z]){1}?$)'),
        'mobile_no': re.compile(
            r'(?P<mobile_no>^91[-\s]??\d{10}$)'),
        'email': re.compile(
            r'(?P<email>^([a-zA-Z0-9_\-\.]+)(@\s?)([a-zA-Z0-9_\-\.]+)'
            r'(\.\s?)([a-zA-Z]{2,5})$)'),
    }

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        # Creating a matcher object
        self.matcher = Matcher(self.nlp.vocab)

        person_label = self.nlp.vocab.strings["PERSON"]
        date_label = self.nlp.vocab.strings["DATE"]
        country_label = self.nlp.vocab.strings["GPE"]

        # Patterns which are to be fed to the spacy matcher
        self.pattern_name = [{"ENT_TYPE": person_label}]
        self.pattern_period_stay = [{"ENT_TYPE": date_label}]
        self.pattern_residential_addr = [{"ENT_TYPE": country_label}]
        self.pattern_relationship = [
            "wife", "husband", "father", "mother", "grandfather",
            "grandmother", "brother", "sister", "uncle", "aunt",
        ]
        self.dict_pattern = {
            'name': self.pattern_name,
            'pan_no': [],
            'father_name': self.pattern_name,
            'relationship': [],
            'residential_addr': self.pattern_residential_addr,
            'period_stay': self.pattern_period_stay,
            'tel_no': [],
            'mobile_no': [],
            'email': [],
        }

        # Per-instance copy so callers may still tweak self.dict_regex.
        self.dict_regex = dict(self._FIELD_REGEX)

        self.dict_cond = {}
        self.key = None

    def callback_fn(self, matcher, doc, i, matches):
        '''
        Callback function for the matcher object
        Sets the value corresponding to self.key in self.dict_cond equal to
        True

        Parameters
        ---------------
        matcher: matcher object
        doc: Doc on which the matcher object operates
        i: Integer
            Index of the current match
        matches: List
            List of all the matches in the sentence
        '''
        self.dict_cond[self.key] = True

    def check_ocr(self, dict_ocr):
        '''
        Function to check the output of OCR using rule matching and NER

        Parameters
        -----------------
        dict_ocr: Dictionary
            Dictionary containing the fields (as keys) and, for each field,
            an iterable of OCR result strings as the corresponding value

        Returns
        -------------------
        self.dict_cond: Dict
            Dictionary whose keys are the fields found valid, each mapped to
            True. Fields that failed validation are not added. The same
            dictionary object is reused across calls.
        '''
        print("dict_ocr:", dict_ocr)
        for field, value_ls in dict_ocr.items():
            # callback_fn reads self.key to know which field matched.
            self.key = field

            if field in self._TEXT_FIELDS:
                # Simple membership / regex based validation.
                for value in value_ls:
                    value = value.lower()  # Converting to lowercase
                    if field == 'relationship':
                        if value in self.pattern_relationship:
                            self.dict_cond[field] = True
                    elif self.dict_regex[field].search(value) is not None:
                        self.dict_cond[field] = True

            elif field in self._NER_FIELDS:
                # NER based validation using spaCy's matcher object.
                print("self.key", self.key)
                self.matcher.add(str(field), self.callback_fn,
                                 self.dict_pattern[field])
                for value in value_ls:
                    print("value:", value)
                    doc = self.nlp(value)
                    matches = self.matcher(doc)
                    print("matches:", matches)

                # The original guard here,
                # `"name" or "pan_no" or ... in self.matcher`, was always
                # true because the first operand is a non-empty string; the
                # rule for this field was added just above, so no guard is
                # needed.
                print("Inside condition make self.matcher=None")
                print("self.key", self.key)
                _on_match, patterns = self.matcher.get(field)
                print("patterns:", patterns)

                # Start the next field from an empty matcher so its rule
                # cannot fire for another field.
                self.matcher = Matcher(self.nlp.vocab)

        return self.dict_cond
class NlpService(nlp_pb2_grpc.NlpServicer):
    """gRPC servicer wrapping a spaCy pipeline and an optional rule matcher.

    State kept on the instance:
      * ``modelName`` - name passed to the last ``LoadModel`` call.
      * ``nlp``       - pipeline returned by ``spacy.load``; ``None`` until
                        ``LoadModel`` has been called.
      * ``matcher``   - ``Matcher`` created lazily by ``AddRule``; ``None``
                        until then and again after ``ResetMatcher``.
    """

    def __init__(self):
        self.modelName = None
        self.nlp = None
        self.matcher = None

    def LoadModel(self, request, context):
        """Load the spaCy model named by ``request.text``."""
        self.modelName = request.text
        self.nlp = spacy.load(request.text)
        response = nlp_pb2.TextResponse()
        response.message = "Model loaded '{}'".format(request.text)
        return response

    def NlpProcess(self, request, context):
        """Run the pipeline on ``request.text`` and return it as a proto."""
        doc = self.nlp(request.text)
        return utils.doc2proto(doc, self.modelName)

    def DocSimilarity(self, request, context):
        """Return the similarity of ``request.texta`` and ``request.textb``."""
        doc_a = self.nlp(request.texta)
        doc_b = self.nlp(request.textb)
        response = nlp_pb2.TextSimilarity()
        response.similarity = doc_a.similarity(doc_b)
        return response

    def AddRule(self, request, context):
        """Register ``request.patterns`` under ``request.id``.

        The matcher is created on first use from the loaded model's vocab.
        Each request pattern contributes one ``{key: value}`` token spec.
        """
        if self.matcher is None:
            self.matcher = Matcher(self.nlp.vocab)
        matcher_id = request.id
        patterns = [{pat.key: pat.value} for pat in request.patterns]
        self.matcher.add(matcher_id, None, patterns)
        response = nlp_pb2.TextResponse()
        response.message = "Rule with id '{}' added to matcher.".format(
            matcher_id)
        return response

    def RemoveRule(self, request, context):
        """Remove the rule whose id is ``request.text``."""
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exists with matcher")
        self.matcher.remove(request.text)
        return nlp_pb2.TextResponse(
            message="Rule with id '{}' removed from matcher.".format(
                request.text))

    def GetRule(self, request, context):
        """Return the first pattern stored under the id ``request.text``.

        When no matcher exists a ``TextResponse`` is returned instead of a
        ``Rule``.
        """
        if self.matcher is None:
            return nlp_pb2.TextResponse(message="No rules exists with matcher")
        # NOTE(review): an unknown id is not handled here - confirm what
        # Matcher.get returns for a missing key before relying on this.
        _, patterns = self.matcher.get(request.text)
        return nlp_pb2.Rule(
            id=request.text,
            patterns=[
                nlp_pb2.Pattern(key=list(pat.keys())[0],
                                value=list(pat.values())[0])
                for pat in patterns[0]
            ],
        )

    def GetMatches(self, request, context):
        """Run the matcher on ``request.text`` and return all matches.

        If no rule has been added yet (or the matcher was reset) there is
        nothing to match, so an empty ``Matches`` message is returned rather
        than failing on a ``None`` matcher.
        """
        if self.matcher is None:
            return nlp_pb2.Matches(matches=[])
        doc = self.nlp(request.text)
        found = self.matcher(doc)
        return nlp_pb2.Matches(matches=[
            nlp_pb2.Match(id=str(match_id), start=start, end=end)
            for match_id, start, end in found
        ])

    def ResetMatcher(self, request, context):
        """Discard the matcher and every rule registered on it."""
        self.matcher = None
        return nlp_pb2.TextResponse(message="Matcher object reset successful.")
class SpacyRuleExtractor(Extractor):
    """Extractor that turns user-defined rules into spaCy ``Matcher`` patterns.

    Every rule is wrapped in a ``Rule`` object and stored in ``rule_lst`` under
    ``<identifier>rule_id##<index>``. For each active rule, every combination
    of its patterns' ``spacy_token_lst`` alternatives is registered with the
    matcher under an integer key; ``hash_map`` maps that key back to
    ``(rule_id, spacy_rule_id)``.
    """

    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct
        spacy rules

        Args:
            nlp: spaCy pipeline; a deep copy of it is kept on the instance
            rules (Dict): must contain "rules" (list of rule specs) and may
                contain "field_name"
            extractor_name: str, also used as field name when "field_name"
                is absent

        Returns:
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules[
            "field_name"] if "field_name" in rules else extractor_name
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" +
                          str(idx)] = this_rule

    def extract(self, text: str) -> List[Extraction]:
        """
        Extract from text

        Args:
            text: str

        Returns: List[Extraction]
        """
        doc = self.tokenizer.tokenize_to_spacy_doc(text)
        self.load_matcher()

        # Zero-length matches carry no tokens and are ignored.
        matches = [x for x in self.matcher(doc) if x[1] != x[2]]
        pos_filtered_matches = []
        neg_filtered_matches = []
        for idx, start, end in matches:
            span_doc = self.tokenizer.tokenize_to_spacy_doc(
                doc[start:end].text)
            this_spacy_rule = self.matcher.get(idx)
            relations = self.find_relation(span_doc, this_spacy_rule)
            rule_id, _ = self.hash_map[idx]
            this_rule = self.rule_lst[rule_id]
            if self.filter_match(doc[start:end], relations,
                                 this_rule.patterns):
                value = self.form_output(doc[start:end],
                                         this_rule.output_format, relations,
                                         this_rule.patterns)
                record = (start, end, value, rule_id, relations)
                if this_rule.polarity:
                    pos_filtered_matches.append(record)
                else:
                    neg_filtered_matches.append(record)

        return_lst = []
        if pos_filtered_matches:
            longest_lst_pos = self.get_longest(pos_filtered_matches)
            if neg_filtered_matches:
                longest_lst_neg = self.get_longest(neg_filtered_matches)
                return_lst = self.reject_neg(longest_lst_pos, longest_lst_neg)
            else:
                return_lst = longest_lst_pos

        extractions = []
        for (start, end, value, rule_id, relation) in return_lst:
            this_extraction = Extraction(
                value=value,
                extractor_name=self.name,
                start_token=start,
                end_token=end,
                start_char=doc[start].idx,
                end_char=doc[end - 1].idx + len(doc[end - 1]),
                rule_id=rule_id.split("rule_id##")[0],
                match_mapping=relation)
            extractions.append(this_extraction)

        return extractions

    def load_matcher(self) -> None:
        """
        Add constructed spacy rule to Matcher

        The matcher and the key mapping are rebuilt from scratch on every
        call. This method runs once per extract(); adding to the existing
        matcher each time made the registered patterns grow without bound.
        """
        self.matcher = Matcher(self.nlp.vocab)
        self.hash_map = {}
        for id_key, rule in self.rule_lst.items():
            if not rule.active:
                continue
            pattern_lst = [
                a_pattern.spacy_token_lst for a_pattern in rule.patterns
            ]
            # One spaCy rule per combination of the per-pattern alternatives.
            for spacy_rule_id, spacy_rule in enumerate(
                    itertools.product(*pattern_lst)):
                self.matcher.add(self.construct_key(id_key, spacy_rule_id),
                                 None, list(spacy_rule))

    def filter_match(self, span: span, relations: Dict,
                     patterns: List) -> bool:
        """
        Filter the match result according to prefix, suffix, min, max ...

        Args:
            span: span
            relations: Dict, pattern index -> (start, end) token range inside
                span, or None when the pattern matched nothing
            patterns: List of pattern

        Returns: bool
        """
        for pattern_id, a_pattern in enumerate(patterns):
            token_range = relations[pattern_id]
            if not token_range:
                continue
            tokens = list(span[token_range[0]:token_range[1]])
            if a_pattern.type == "word":
                if not self.pre_suf_fix_filter(tokens, a_pattern.prefix,
                                               a_pattern.suffix):
                    return False
            if a_pattern.type == "shape":
                if not (self.full_shape_filter(tokens, a_pattern.full_shape)
                        and self.pre_suf_fix_filter(
                            tokens, a_pattern.prefix, a_pattern.suffix)):
                    return False
            if a_pattern.type == "number":
                if not self.min_max_filter(tokens, a_pattern.min,
                                           a_pattern.max):
                    return False
        return True

    @staticmethod
    def get_longest(value_lst: List) -> List:
        """
        Get the longest match for overlap

        The list is sorted in place. Walking it in order, a match sharing the
        pivot's start but ending later replaces the pivot; a match with a
        different start that ends after the pivot closes the pivot and
        becomes the new one; anything else is dropped.

        Args:
            value_lst: List of (start, end, value, rule_id, relations)

        Returns: List
        """
        if not value_lst:
            return []
        # Sort on the first four fields only: the trailing relations dict is
        # not orderable and would raise TypeError if a tie ever reached it.
        value_lst.sort(key=lambda m: m[:4])
        result = []
        pivot = value_lst[0]
        pivot_s, pivot_e = pivot[0], pivot[1]
        for candidate in value_lst:
            s, e = candidate[0], candidate[1]
            if s == pivot_s and pivot_e < e:
                pivot_e = e
                pivot = candidate
            elif s != pivot_s and pivot_e < e:
                result.append(pivot)
                pivot = candidate
                pivot_e = e
                pivot_s = s
        result.append(pivot)
        return result

    @staticmethod
    def reject_neg(pos_lst: List, neg_lst: List) -> List:
        """
        Reject some positive matches according to negative matches

        Both lists are sorted and consumed in place. A positive match is kept
        when it ends at or before the current negative match starts, and
        dropped when it overlaps it.

        Args:
            pos_lst: List
            neg_lst: List

        Returns: List
        """
        # Same sort key as get_longest: never compare the relations dicts.
        pos_lst.sort(key=lambda m: m[:4])
        neg_lst.sort(key=lambda m: m[:4])
        if not pos_lst:
            return []
        if not neg_lst:
            return list(pos_lst)

        result = []
        pivot_pos = pos_lst[0]
        pivot_neg = neg_lst[0]
        while pos_lst:
            if pivot_pos[1] <= pivot_neg[0]:
                # Positive lies entirely before the negative: keep it.
                result.append(pivot_pos)
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
            elif pivot_pos[0] >= pivot_neg[1]:
                # Negative lies entirely before the positive: advance it.
                neg_lst.pop(0)
                if not neg_lst:
                    result += pos_lst
                    break
                pivot_neg = neg_lst[0]
            else:
                # Overlap: the positive match is rejected.
                pos_lst.pop(0)
                if pos_lst:
                    pivot_pos = pos_lst[0]
        return result

    @staticmethod
    def pre_suf_fix_filter(t: List, prefix: str, suffix: str) -> bool:
        """
        Prefix and Suffix filter

        Args:
            t: List, list of tokens
            prefix: str
            suffix: str

        Returns: bool, True when every token has the requested prefix and
            suffix (an empty prefix/suffix is not checked)
        """
        if prefix:
            for a_token in t:
                if a_token._.n_prefix(len(prefix)) != prefix:
                    return False
        if suffix:
            for a_token in t:
                if a_token._.n_suffix(len(suffix)) != suffix:
                    return False
        return True

    @staticmethod
    def min_max_filter(t: List, min_v: str, max_v: str) -> bool:
        """
        Min and Max filter

        Args:
            t: List, list of tokens
            min_v: str, lower bound; a falsy or non-numeric value means
                "no lower bound"
            max_v: str, upper bound; a falsy or non-numeric value means
                "no upper bound"

        Returns: bool, True when every token text parses as a float that lies
            within the bounds
        """

        def tofloat(value):
            # None (not False) marks a parse failure so that a legitimate
            # 0.0 is not mistaken for "not a number".
            try:
                return float(value)
            except ValueError:
                return None

        lower = tofloat(min_v) if min_v else None
        upper = tofloat(max_v) if max_v else None

        for a_token in t:
            this_v = tofloat(a_token.text)
            if this_v is None:
                return False
            if lower is not None and this_v < lower:
                return False
            if upper is not None and this_v > upper:
                return False
        return True

    @staticmethod
    def full_shape_filter(t: List, shapes: List) -> bool:
        """
        Shape filter

        Args:
            t: List, list of tokens
            shapes: List

        Returns: bool, True when shapes is empty or every token's full_shape
            is one of shapes
        """
        if shapes:
            for a_token in t:
                if a_token._.full_shape not in shapes:
                    return False
        return True

    @staticmethod
    def form_output(span_doc: span, output_format: str, relations: Dict,
                    patterns: List) -> str:
        """
        Form an output value according to user input of output_format

        The texts of all patterns flagged ``in_output`` (and that matched)
        are collected in order. Without an output_format they are joined with
        a space. Otherwise each single-digit placeholder ``{d}`` is replaced
        by the d-th collected text (1-based). On the first placeholder whose
        index is out of range, substitution stops and the rest of the format
        is appended unchanged.

        Args:
            span_doc: span
            output_format: str
            relations: Dict
            patterns: List

        Returns: str
        """
        format_value = []
        for i, a_pattern in enumerate(patterns):
            token_range = relations[i]
            if token_range and a_pattern.in_output:
                format_value.append(
                    span_doc[token_range[0]:token_range[1]].text)

        if not output_format:
            return " ".join(format_value)

        # Index-based scan. The previous pop-driven loop raised IndexError on
        # formats shorter than three characters and lost a character when
        # exactly two characters followed a placeholder.
        pieces = []
        pos = 0
        length = len(output_format)
        while pos < length:
            is_placeholder = (pos + 2 < length
                              and output_format[pos] == '{'
                              and output_format[pos + 1].isdecimal()
                              and output_format[pos + 2] == '}')
            if not is_placeholder:
                pieces.append(output_format[pos])
                pos += 1
                continue
            index = int(output_format[pos + 1])
            if not 1 <= index <= len(format_value):
                pieces.append(output_format[pos:])
                break
            pieces.append(format_value[index - 1])
            pos += 3
        return "".join(pieces)

    def construct_key(self, rule_id: str, spacy_rule_id: int) -> int:
        """
        Use a mapping to store the information about rule_id for each matches,
        create the mapping key here

        Args:
            rule_id: str
            spacy_rule_id:int

        Returns: int
        """
        hash_key = (rule_id, spacy_rule_id)
        # Shift the (possibly negative) hash into the non-negative range.
        hash_v = hash(hash_key) + sys.maxsize + 1
        self.hash_map[hash_v] = hash_key
        return hash_v

    def find_relation(self, span_doc: doc, r: List) -> Dict:
        """
        Get the relations between the each pattern in the spacy rule and the
        matches

        Args:
            span_doc: doc
            r: List, the (callback, patterns) pair from Matcher.get; only the
                first pattern is used

        Returns: Dict, element index -> (start, end) token range inside
            span_doc, or None when the element consumed no token
        """
        rule = r[1][0]
        span_pivot = 0
        relation = {}
        for e_id, element in enumerate(rule):
            if not span_doc[span_pivot:]:
                # Span exhausted: all remaining elements matched nothing.
                for extra_id, _, in enumerate(rule[e_id:]):
                    relation[e_id + extra_id] = None
                break
            new_doc = self.tokenizer.tokenize_to_spacy_doc(
                span_doc[span_pivot:].text)
            if "OP" not in element:
                # Without an operator the element consumes exactly one token.
                relation[e_id] = (span_pivot, span_pivot + 1)
                span_pivot += 1
            else:
                if e_id < len(rule) - 1:
                    # Match this element and the next one separately on the
                    # remaining text to find where this element stops.
                    tmp_rule_1 = [rule[e_id]]
                    tmp_rule_2 = [rule[e_id + 1]]
                    tmp_matcher = Matcher(self.nlp.vocab)
                    tmp_matcher.add(0, None, tmp_rule_1)
                    tmp_matcher.add(1, None, tmp_rule_2)
                    tmp_matches = sorted(
                        [x for x in tmp_matcher(new_doc) if x[1] != x[2]],
                        key=lambda a: a[1])

                    if not tmp_matches:
                        relation[e_id] = None
                    else:
                        # Only matches of this element starting at the very
                        # first remaining token count.
                        matches_1 = [
                            x for x in tmp_matches if x[0] == 0 and x[1] == 0
                        ]
                        if not matches_1:
                            relation[e_id] = None
                        else:
                            _, s1, e1 = matches_1[0]
                            matches_2 = [x for x in tmp_matches if x[0] == 1]
                            if not matches_2:
                                relation[e_id] = (span_pivot, span_pivot + e1)
                                span_pivot += e1
                            else:
                                _, s2, e2 = matches_2[0]
                                if e1 <= s2:
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + e1)
                                    span_pivot += e1
                                else:
                                    # Stop where the next element begins.
                                    relation[e_id] = (span_pivot,
                                                      span_pivot + s2)
                                    span_pivot += s2
                else:
                    # The last element takes everything that is left.
                    relation[e_id] = (span_pivot, len(span_doc))

        return relation