def __analyze_patterns(
    self, text: str, flags: int = None
) -> List[RecognizerResult]:
    """
    Evaluate all patterns in the provided text.

    Including words in the provided deny-list.

    Every non-empty regex match becomes a candidate result carrying the
    pattern's own score. ``validate_result`` may then move the score to
    ``EntityRecognizer.MAX_SCORE`` (truthy) or ``EntityRecognizer.MIN_SCORE``
    (falsy, but not ``None``), and a truthy ``invalidate_result`` forces
    ``MIN_SCORE``. Only candidates scoring above ``MIN_SCORE`` are kept.

    :param text: text to analyze
    :param flags: regex flags; a falsy value (``None`` or ``0``) falls back
        to ``re.DOTALL | re.MULTILINE``
    :return: A list of RecognizerResult, passed through
        ``EntityRecognizer.remove_duplicates``
    """
    flags = flags if flags else re.DOTALL | re.MULTILINE
    results = []

    for pattern in self.patterns:
        match_start_time = datetime.datetime.now()
        # re.finditer is lazy: consume it inside the timed region so the
        # logged duration covers the actual scan, not just iterator creation.
        matches = list(re.finditer(pattern.regex, text, flags=flags))
        match_time = datetime.datetime.now() - match_start_time
        # total_seconds() keeps the fractional part correctly padded;
        # printing seconds and microseconds side by side would render
        # 5 microseconds as "0.5 seconds".
        logger.debug(
            "--- match_time[%s]: %.6f seconds",
            pattern.name,
            match_time.total_seconds(),
        )

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == "":
                continue

            score = pattern.score
            validation_result = self.validate_result(current_match)
            # The explanation is built from the pattern score, i.e. before
            # validation/invalidation adjust the result's score below.
            description = self.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score, validation_result
            )
            pattern_result = RecognizerResult(
                self.supported_entities[0], start, end, score, description
            )

            # None means "no validation available": keep the pattern score.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            # Invalidation overrides a successful validation.
            if self.invalidate_result(current_match):
                pattern_result.score = EntityRecognizer.MIN_SCORE

            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    results = EntityRecognizer.remove_duplicates(results)
    return results
def __analyze_patterns(self, text):
    """
    Evaluate all patterns in the provided text, including words in the
    provided blacklist.

    In a sentence we could get a false positive at the end of our regex, where
    we want to find the IBAN but not the false positive at the end of the
    match.

    i.e. "I want my deposit in DE89370400440532013000 2 days from today."

    To handle this, every regex match is evaluated as a series of candidates
    that all begin at the start of the match and end at the end of capture
    group N, N-1, ..., 1 (in that order). The first candidate whose score is
    above ``EntityRecognizer.MIN_SCORE`` is kept and the remaining candidates
    of that match are skipped. A pattern that defines no capture groups is
    evaluated on the whole match.

    Regex flags are read from ``self.flags``.

    :param text: text to analyze
    :return: A list of RecognizerResult
    """
    results = []

    for pattern in self.patterns:
        for match in re.finditer(pattern.regex, text, flags=self.flags):
            start, match_end = match.span(0)
            group_count = len(match.groups())
            # Without capture groups there is nothing to trim, so fall back
            # to group 0 (the whole match) instead of producing no candidate.
            group_numbers = range(group_count, 0, -1) if group_count else (0,)

            for grp_num in group_numbers:
                group_end = match.span(grp_num)[1]
                # A group that did not take part in the match reports -1;
                # use the end of the whole match in that case.
                end = group_end if group_end > 0 else match_end
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score
                validation_result = self.validate_result(current_match)
                # The explanation is built from the pattern score, i.e.
                # before validation adjusts the result's score below.
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description
                )

                # None means "no validation available": keep the pattern score.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)
                    # First acceptable candidate wins for this match.
                    break

    return results
def __analyze_patterns(self, text):
    """
    Evaluate all patterns in the provided text, including words in the
    provided blacklist.

    Patterns are matched with ``re.IGNORECASE | re.DOTALL | re.MULTILINE``.
    Every non-empty match becomes a candidate result carrying the pattern's
    own score; ``validate_result`` may then move the score to
    ``EntityRecognizer.MAX_SCORE`` (truthy) or ``EntityRecognizer.MIN_SCORE``
    (falsy, but not ``None``). Only candidates scoring above ``MIN_SCORE``
    are returned.

    :param text: text to analyze
    :return: A list of RecognizerResult
    """
    flags = re.IGNORECASE | re.DOTALL | re.MULTILINE
    results = []

    for pattern in self.patterns:
        match_start_time = datetime.datetime.now()
        # re.finditer is lazy: consume it inside the timed region so the
        # logged duration covers the actual scan, not just iterator creation.
        matches = list(re.finditer(pattern.regex, text, flags=flags))
        match_time = datetime.datetime.now() - match_start_time
        # total_seconds() keeps the fractional part correctly padded;
        # printing seconds and microseconds side by side would render
        # 5 microseconds as "0.5 seconds".
        self.logger.debug('--- match_time[%s]: %.6f seconds',
                          pattern.name,
                          match_time.total_seconds())

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == '':
                continue

            score = pattern.score
            validation_result = self.validate_result(current_match)
            # The explanation is built from the pattern score, i.e. before
            # validation adjusts the result's score below.
            description = PatternRecognizer.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score,
                validation_result)
            pattern_result = RecognizerResult(self.supported_entities[0],
                                              start, end, score, description)

            # None means "no validation available": keep the pattern score.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    return results