Exemple #1
0
 def get_context(self, document: str, match_found: 're.Match',
                 context_config: dict):
     """
     Build the context window surrounding a match found in the document.

     The matched text itself is represented in the returned context by the
     literal placeholder 'TARGETWORD'.

     :param document: The document to be scanned
     :param match_found: The match that is to be used as the center of the context window
     :param context_config: The context configuration; its 'type' entry selects
         the window unit (ContextType.WORD or ContextType.PARAGRAPH) and its
         'size' entry controls how far the window extends on each side
     :return: For ContextType.WORD, a tuple of (stripped matched text, context
         string). For ContextType.PARAGRAPH, the context string only. None for
         any other context type.
     """
     preceding_text = document[:match_found.start()]
     succeeding_text = document[match_found.end():]

     if context_config['type'] == ContextType.WORD:
         match_str = document[match_found.start():match_found.end()].strip()
         words_before = self.trim_boundaries(
             re.split(r'\s+', preceding_text))
         words_after = self.trim_boundaries(
             re.split(r'\s+', succeeding_text))
         size = context_config['size']
         # Clamp at 0: when fewer than `size` words precede the match, a
         # negative start index would count from the end of the list and
         # silently drop the leading words instead of keeping all of them.
         first_kept = max(0, len(words_before) - size)
         return match_str, ' '.join(
             words_before[first_kept:] +
             ['TARGETWORD'] +
             words_after[:size])

     if context_config['type'] == ContextType.PARAGRAPH:
         lines_before = self.trim_boundaries(
             re.split(self.split_lines_regex, preceding_text))
         lines_after = self.trim_boundaries(
             re.split(self.split_lines_regex, succeeding_text))
         # Blank (whitespace-only) lines are used as the window boundaries.
         blanks_before = [
             index for index, item in enumerate(lines_before)
             if not item.strip()
         ]
         blanks_after = [
             index for index, item in enumerate(lines_after)
             if not item.strip()
         ]
         # Sentinels for "no blank line on this side": -1 (+ 1 below) starts
         # at the first preceding line, len(...) ends after the last
         # succeeding line.
         if not blanks_before:
             blanks_before = [-1]
         if not blanks_after:
             blanks_after = [len(lines_after)]
         size = context_config['size']
         # NOTE(review): when size >= the number of blank lines, both
         # positions fall back to -1, i.e. the LAST blank line in each list
         # (the one nearest the match on the preceding side). Confirm this
         # is the intended fallback rather than "use the whole side".
         start_pos = (len(blanks_before) - size
                      if size < len(blanks_before) else -1)
         end_pos = size - 1 if size < len(blanks_after) else -1
         return ' '.join(
             lines_before[blanks_before[start_pos] + 1:] +
             ['TARGETWORD'] +
             lines_after[:blanks_after[end_pos]])

     # Unrecognized context type: no context is produced.
     return None
Exemple #2
0
    def format(self, path: Path, match: re.match):
        """
        Build the renamed path from a successful match.

        Only the matched span of the original string is substituted; the
        text before and after the match is carried over unchanged.
        """
        assert match is not None

        # split the matched string around the matched span
        original = match.string
        span_start, span_end = match.span()
        head = original[:span_start]
        tail = original[span_end:]

        # the formatter receives the groups both positionally and by name
        replacement = file_formatter.format(
            self.renamer, None, *match.groups(), **match.groupdict())

        root = self.untouched_root(path)
        return root / Path(head + replacement + tail)
Exemple #3
0
    def _wrap_date_match(order: str, match: 're.Match | None',
                         pattern: 'str | None' = None) -> 'dict | None':
        """
        Package a regex date match into a plain dictionary.

        Args:
            order: enums['MDY', 'DMY', 'YMD'] - order of the date
            match: a regex match object, or None when nothing matched
            pattern: str - if user defined the pattern, record it here

        Returns:
            None when `match` is falsy. Otherwise a dict with the keys
            'value' (the full matched text), 'groups' (tuple of captured
            groups), 'start' and 'end' (offsets of the match), 'order' and
            'pattern' (both passed through unchanged).
        """
        # The annotations are quoted so they are never evaluated at
        # definition time; the previous `dict or None` evaluated to `dict`.
        if not match:
            return None
        return {
            'value': match.group(),
            'groups': match.groups(),
            'start': match.start(),
            'end': match.end(),
            'order': order,
            'pattern': pattern,
        }