Ejemplo n.º 1
0
    def unescape(result: MarkedUpText) -> None:
        """
        A "private" helper that rewrites ``result.text``, substituting every
        fragment matched by ``_charref`` (HTML character references) with the
        string returned by ``_replace_charref`` for that match.

        Whenever a substitution has a different length than the fragment it
        replaces, the old and new spans are recorded and passed to
        ``result.apply_transformations`` after the text has been rewritten.

        :param result: MarkedUpText containing resulting plain text
        """
        source = result.text
        pieces = []  # type: List[str]
        shifts = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        cursor = 0

        for found in _charref.finditer(source):
            substitute = _replace_charref(found)
            begin, finish = found.span()
            new_finish = begin + len(substitute)
            # only length-changing substitutions need to be reported
            if new_finish != finish:
                shifts.append(((begin, finish), (begin, new_finish)))
            # untouched text since the previous match, then the substitution
            pieces.append(source[cursor:begin])
            pieces.append(substitute)
            cursor = finish

        # tail after the last match (the whole text when nothing matched)
        pieces.append(source[cursor:])
        result.text = ''.join(pieces)
        if shifts:
            result.apply_transformations(shifts)
Ejemplo n.º 2
0
    def process_inner_tag(self, result: MarkedUpText, tag_label: str,
                          tag_regex: Pattern,
                          make_text_function: Callable[[Any], str]) -> None:
        """
        This "private" method replaces every ``tag_regex`` match in
        ``result.text`` with its plain-text rendering, wrapped in the opening
        and closing markers that ``MarkedUpText.get_marker`` returns for
        ``tag_label``.

        A space is put in front of the rendered text unless the match starts
        the text or directly follows a character from ``self.str_spaces``.
        A space is appended unless the match ends the text or is directly
        followed by a character from ``self.str_phrase_separators``.

        :param result: a MarkedUpText variable with plain text to process
        :param tag_label: label passed to MarkedUpText.get_marker
        :param tag_regex: compiled pattern of the tag to find
        :param make_text_function: method that processes the tag found transforming the tag into plain text
        """
        text = result.text
        text_len = len(text)
        parts = []
        last_stop = 0

        for match in tag_regex.finditer(text):
            link_markup = make_text_function(match)
            src_s, src_e = match.span()
            # ensure spaces between text and link text
            starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
            # match.end() is exclusive: text[src_e] is the first character
            # after the tag, and src_e == text_len means the tag ends the text
            ends_space = (src_e >= text_len
                          or text[src_e] in self.str_phrase_separators)
            if not starts_space:
                link_markup = ' ' + link_markup
            if not ends_space:
                link_markup += ' '

            parts.append(text[last_stop:src_s])
            parts.append(MarkedUpText.get_marker(tag_label, True))
            parts.append(link_markup)
            parts.append(MarkedUpText.get_marker(tag_label, False))
            last_stop = src_e

        # tail after the last match (the whole text when nothing matched)
        parts.append(text[last_stop:])
        result.text = ''.join(parts)
Ejemplo n.º 3
0
    def check_ocr_text(self, result: MarkedUpText) -> None:
        """
        A "private" method that decides whether the text obtained from
        embedded images stays in ``result.text`` or is removed.

        Behaviour per ``self.settings.ocr_sets``:

        * NEVER_STORE - return at once, statistics and text untouched;
        * STORE_ALWAYS - only update the statistics;
        * STORE_IF_NO_OTHER_TEXT - remove image text when the remaining text
          length reaches ``self.settings.ocr_vector_text_min_length``;
        * STORE_IF_MORE_TEXT - remove image text when the remaining text is
          longer than the image text.

        The statistics update stores the value of ``count_text_in_images`` in
        ``parse_stat.parsed_ocr_text_len`` and subtracts it from
        ``parse_stat.parsed_text_len``.

        :param result: MarkedUpText, containing resulting plain text
        """
        ocr_mode = self.settings.ocr_sets
        if ocr_mode == OcrTextStoreSettings.NEVER_STORE:
            return

        stat = self.parse_stat
        stat.parsed_ocr_text_len = self.count_text_in_images(result)
        stat.parsed_text_len -= stat.parsed_ocr_text_len
        if ocr_mode == OcrTextStoreSettings.STORE_ALWAYS:
            return

        # enough non-image text is present, so the image text is not needed
        enough_other_text = \
            ocr_mode == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT and \
            stat.parsed_text_len >= self.settings.ocr_vector_text_min_length

        # the non-image text outweighs the image text
        other_text_prevails = \
            ocr_mode == OcrTextStoreSettings.STORE_IF_MORE_TEXT and \
            stat.parsed_text_len > stat.parsed_ocr_text_len

        if enough_other_text or other_text_prevails:
            # do remove text that was obtained from images
            result.text = self.remove_text_in_images(result.text)
0
 def remove_text_in_images(self, result: MarkedUpText):
     """
     Cut every image block out of ``result.text`` in place.

     An image block runs from the opening to the closing marker stored in
     ``MarkedUpText.BLOCK_MARKERS['images']``, both markers included.
     An opening marker without a matching closing marker is left in the
     text together with everything after it.

     :param result: MarkedUpText whose ``text`` attribute is rewritten
     """
     im_op, im_cl = MarkedUpText.BLOCK_MARKERS['images']
     while True:
         p_start = result.text.find(im_op)
         if p_start < 0:
             break
         p_end = result.text.find(im_cl, p_start + len(im_op))
         if p_end < 0:
             # Unterminated block: the text is unchanged, so searching again
             # would find the same opening marker forever - stop here.
             break
         p_end += len(im_cl)
         result.text = result.text[:p_start] + result.text[p_end:]
Ejemplo n.º 5
0
    def process_inner_tag(self, result: MarkedUpText, tag_regex: Pattern,
                          make_text_function: Callable[[Any], str]) -> None:
        """
        This "private" method replaces every ``tag_regex`` match in
        ``result.text`` with its plain-text rendering and records the span of
        each rendering under the ``'a'`` key of ``result.labels``.

        A space is put in front of the rendered text unless the match starts
        the text or directly follows a character from ``self.str_spaces``.
        A space is appended unless the match ends the text or is directly
        followed by a character from ``self.str_phrase_separators``.
        Replacements whose length differs from the matched tag are reported
        through ``result.apply_transformations``.

        :param result: a MarkedUpText variable with plain text to process
        :param tag_regex: compiled pattern of the tag to find
        :param make_text_function: method that processes the tag found transforming the tag into plain text
        """
        text = result.text
        text_len = len(text)
        parts = []
        transformations = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        last_stop = 0
        new_labels = []

        for match in tag_regex.finditer(text):
            link_markup = make_text_function(match)
            src_s, src_e = match.span()
            # ensure spaces between text and link text
            starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
            # match.end() is exclusive: text[src_e] is the first character
            # after the tag, and src_e == text_len means the tag ends the text
            ends_space = (src_e >= text_len
                          or text[src_e] in self.str_phrase_separators)
            if not starts_space:
                link_markup = ' ' + link_markup
            if not ends_space:
                link_markup += ' '

            end_e = src_s + len(link_markup)
            if end_e != src_e:
                transformations.append(((src_s, src_e), (src_s, end_e)))
            # NOTE(review): the label start is an offset in the text *before*
            # replacement - confirm this is the coordinate system that
            # result.labels expects once several tags have been replaced.
            new_labels.append((src_s, end_e))

            parts.append(text[last_stop:src_s])
            parts.append(link_markup)
            last_stop = src_e

        # tail after the last match (the whole text when nothing matched)
        parts.append(text[last_stop:])
        result.text = ''.join(parts)
        if transformations:
            result.apply_transformations(transformations)
        if new_labels:
            if 'a' not in result.labels:
                result.labels['a'] = new_labels
            else:
                result.labels['a'] = result.labels['a'] + new_labels
Ejemplo n.º 6
0
    def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
        """
        Joins the lines of ordinary paragraphs in ``result.text``.

        Every span enclosed in the ``MarkedUpText.BLOCK_MARKERS['paragraphs']``
        markers is examined. Its non-blank lines are checked against
        ``self.re_list_start``; the paragraph is treated as a list, and left
        untouched, unless the number of non-matching lines exceeds one third
        of the non-blank lines (rounded up). In any other paragraph, whatever
        ``self.re_single_newline`` matches is replaced with one space.
        Paragraphs that look like tables are not detected here.

        :param result: MarkedUpText containing resulted plain text
        """
        open_marker, close_marker = MarkedUpText.BLOCK_MARKERS['paragraphs']

        cursor = 0
        while True:
            marker_pos = result.text.find(open_marker, cursor)
            if marker_pos < 0:
                return
            cursor = marker_pos + len(open_marker)
            body_end = result.text.find(close_marker, cursor)
            if body_end < 0:
                continue
            block_end = body_end + len(close_marker)

            # paragraph content between the two markers
            body = result.text[cursor:body_end]
            lines = [line for line in body.split('\n') if line.strip()]
            if lines:
                # a paragraph dominated by list-like lines keeps its breaks
                list_like = sum(
                    1 for line in lines if self.re_list_start.match(line))
                tolerated = math.ceil(len(lines) / 3)
                if len(lines) - list_like > tolerated:
                    joined = self.re_single_newline.sub(' ', body)
                    result.text = (result.text[:cursor] + joined +
                                   result.text[body_end:])
            # NOTE(review): block_end was computed before the substitution;
            # this assumes re_single_newline.sub keeps the length unchanged -
            # confirm against the pattern definition.
            cursor = block_end