def get_context(self, document: str, match_found: re.match, context_config: dict):
    """
    Build the context window around a match found in the document.

    The matched text is represented in the window by the literal placeholder
    'TARGETWORD'; the surrounding words/lines are joined with single spaces.

    :param document: The document to be scanned
    :param match_found: The match that is to be used as the center of the context window
    :param context_config: The context configuration; ``context_config['type']`` is compared
        against ``ContextType.WORD`` / ``ContextType.PARAGRAPH`` and ``context_config['size']``
        is the window size (words on each side, or paragraphs on each side counting the
        paragraph that contains the match)
    :return: For ``ContextType.WORD`` a tuple ``(match_str, context)``; for
        ``ContextType.PARAGRAPH`` only the context string; ``None`` for any other type.
        NOTE(review): the two branches return different shapes - kept as-is for callers.
    """
    match_str = document[match_found.start():match_found.end()].strip()
    preceding_text = document[:match_found.start()]
    succeeding_text = document[match_found.end():]

    if context_config['type'] == ContextType.WORD:
        size = context_config['size']
        preceding_words = self.trim_boundaries(re.split(r'\s+', preceding_text))
        succeeding_words = self.trim_boundaries(re.split(r'\s+', succeeding_text))
        # Clamp at 0: a negative slice start would wrap around and silently
        # drop words when fewer than `size` words precede the match.
        first = max(0, len(preceding_words) - size)
        return match_str, ' '.join(
            preceding_words[first:] + ['TARGETWORD'] + succeeding_words[:size])

    if context_config['type'] == ContextType.PARAGRAPH:
        size = context_config['size']
        preceding_lines = self.trim_boundaries(
            re.split(self.split_lines_regex, preceding_text))
        succeeding_lines = self.trim_boundaries(
            re.split(self.split_lines_regex, succeeding_text))

        if size < 1:
            # No surrounding paragraphs requested (mirrors the WORD branch with size 0).
            return 'TARGETWORD'

        # Blank lines are treated as the paragraph separators.
        blank_before = [
            index for index, line in enumerate(preceding_lines) if not line.strip()
        ]
        blank_after = [
            index for index, line in enumerate(succeeding_lines) if not line.strip()
        ]

        # Start right after the size-th blank line counting back from the match;
        # when there are fewer separators than requested, take everything.
        if size <= len(blank_before):
            start = blank_before[-size] + 1
        else:
            start = 0

        # Stop at the size-th blank line after the match; when there are fewer
        # separators than requested, take everything.
        if size <= len(blank_after):
            stop = blank_after[size - 1]
        else:
            stop = len(succeeding_lines)

        return ' '.join(
            preceding_lines[start:] + ['TARGETWORD'] + succeeding_lines[:stop])
def is_message_id(self, match: re.match, msg: str) -> bool:
    """
    Tell whether an e-mail-like match is actually part of a message-id line.

    Returns True when the matched text itself starts with ``self.MESSAGE_ID_LINE``
    (the regex can match things like
    "message-id=Issue1649523226559@postfix-mail.mail-system.svc.cluster.local"),
    or when the match is directly preceded - optionally through opening
    bracket/quote characters ``{ < [ " '`` - by an ``=`` that ends an occurrence
    of ``self.MESSAGE_ID_LINE`` in ``msg``.

    :param match: The regex match to check; its start offset is used as an index into ``msg``
    :param msg: The text the match was found in
    :return: True if the match belongs to a message-id line, False otherwise
    """
    if match.group().startswith(self.MESSAGE_ID_LINE):
        return True

    start = match.start()
    # Not enough room before the match for the message-id prefix.
    # (presumably MESSAGE_ID_LINE_LEN == len(MESSAGE_ID_LINE) - confirm on the class)
    if start < self.MESSAGE_ID_LINE_LEN:
        return False

    # Walk back over opening brackets/quotes. The scan is bounded at index 0 so
    # that it never wraps around to the end of the string via negative indexing.
    pos = start - 1
    while pos >= 0 and msg[pos] in '{<["\'':
        pos -= 1
    if pos < 0 or msg[pos] != '=':
        return False

    prefix_start = pos - self.MESSAGE_ID_LINE_LEN + 1
    if prefix_start < 0:
        return False
    return msg[prefix_start:pos + 1] == self.MESSAGE_ID_LINE
def format(self, path: Path, match: re.match):
    """
    Build the new path from the result of the matching.

    Only the span covered by the match is replaced; the text of the matched
    string before and after that span is kept unchanged.
    """
    assert match is not None

    # Split the matched string into the parts around the matched span.
    source = match.string
    begin, end = match.span()
    head = source[:begin]
    tail = source[end:]

    # Positional and named groups are both handed to the formatter.
    replacement = file_formatter.format(
        self.renamer, None, *match.groups(), **match.groupdict())

    root = self.untouched_root(path)
    return root / Path(head + replacement + tail)
def _wrap_date_match(order: str, match: re.match, pattern: str=None) -> dict or None:
    """
    Wrap a date regex match into a plain dictionary.

    Args:
        order: enums['MDY', 'DMY', 'YMD'] - order of the date
        match: re.match - a regex match object (may be falsy / None)
        pattern: str - if user defined the pattern, record it here

    Returns:
        None when ``match`` is falsy; otherwise a dict with the keys
        'value' (whole matched text), 'groups' (captured groups),
        'start' / 'end' (offsets of the match), 'order' and 'pattern'.
    """
    if not match:
        return None

    wrapped = dict(
        value=match.group(),
        groups=match.groups(),
        start=match.start(),
        end=match.end(),
        order=order,
        pattern=pattern,
    )
    return wrapped