Ejemplo n.º 1
0
 def test_apply_transform(self):
     """
     Check how the 'p' labels are remapped by a single transformation that
     maps the source span (0, 9) onto the span (0, 1).
     """
     source = 'a123456789b123456789c123456789d123456789e123456789'
     spans = {'p': [(7, 12), (22, 28)]}
     marked = MarkedUpText(source, labels=spans)
     marked.apply_transformations([((0, 9), (0, 1))])
     # Eight characters disappear in front of / inside the labelled spans,
     # which is what the expected coordinates below reflect.
     expected_spans = [(0, 4), (14, 20)]
     for position, expected in enumerate(expected_spans):
         self.assertEqual(expected, marked.labels['p'][position])
Ejemplo n.º 2
0
 def test_apply_transformations_(self):
     """
     Check how the 'p' labels are remapped by two transformations, each of
     which describes a three-character run of spaces shrinking to one.
     """
     marked = MarkedUpText('A text   with extra   spaces.',
                           labels={'p': [(7, 12), (22, 28)]})
     # Both transformations are given in coordinates of the source text.
     collapse_runs = [
         ((6, 9), (6, 7)),
         ((19, 22), (19, 20)),
     ]
     marked.apply_transformations(collapse_runs)
     remapped = marked.labels['p']
     for position, expected in enumerate([(6, 10), (18, 24)]):
         self.assertEqual(expected, remapped[position])
Ejemplo n.º 3
0
    def unescape(result: MarkedUpText) -> None:
        """
        a "private" method to replace HTML codes like ``&gt;`` with corresponding
        symbols in the resulting plain text

        Every match of ``_charref`` in ``result.text`` is replaced with the string
        returned by ``_replace_charref(match)``. When a replacement differs in
        length from the matched code, the pair
        ``((src_start, src_end), (src_start, new_end))`` is collected (all values
        are offsets in the text as it was before the call) and the collected
        pairs are passed to ``result.apply_transformations`` once the text has
        been rewritten.

        :param result: MarkedUpText containing resulting plain text; its ``text``
                       attribute is replaced by the unescaped text
        """
        source = result.text
        # Collect the pieces and join once instead of growing a string with +=.
        pieces = []  # type: List[str]
        transformations = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        last_stop = 0

        for match in _charref.finditer(source):
            replacement = _replace_charref(match)
            src_s, src_e = match.span()
            end_e = src_s + len(replacement)
            # Same-length replacements do not move anything, so they need no
            # transformation entry.
            if end_e != src_e:
                transformations.append(((src_s, src_e), (src_s, end_e)))
            pieces.append(source[last_stop:src_s])
            pieces.append(replacement)
            last_stop = src_e

        # Tail of the text after the last match (the whole text if none matched).
        pieces.append(source[last_stop:])
        result.text = ''.join(pieces)
        if transformations:
            result.apply_transformations(transformations)
Ejemplo n.º 4
0
    def check_ocr_text(self, result: MarkedUpText) -> None:
        """
        a "private" method that reviews the text obtained from embedded images
        (the spans stored under the 'images' label) and, depending on
        ``self.settings.ocr_sets``, either keeps those spans or collapses each
        of them to an empty span.

        Side effects on ``self.parse_stat`` (skipped for NEVER_STORE and when
        there is no text or no 'images' label): ``parsed_ocr_text_len`` is set
        to the number of non-space characters inside the image spans and the
        same amount is subtracted from ``parsed_text_len``.

        :param result: MarkedUpText, containing resulting plain text
        """
        if not result.text:
            return
        if 'images' not in result.labels:
            return

        store_mode = self.settings.ocr_sets
        # NEVER_STORE quits before touching either the statistics or the labels.
        if store_mode == OcrTextStoreSettings.NEVER_STORE:
            return

        image_spans = result.labels['images']
        stat = self.parse_stat
        stat.parsed_ocr_text_len = sum(
            result.count_non_space_chars(begin, end)
            for begin, end in image_spans)
        # From here on parsed_text_len counts only the non-OCR text.
        stat.parsed_text_len -= stat.parsed_ocr_text_len
        if store_mode == OcrTextStoreSettings.STORE_ALWAYS:
            return

        # OCR spans are dropped when the remaining text reaches the configured
        # minimum length (STORE_IF_NO_OTHER_TEXT) ...
        enough_other_text = (
            store_mode == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT
            and stat.parsed_text_len >= self.settings.ocr_vector_text_min_length)
        # ... or when the remaining text is longer than the OCR text
        # (STORE_IF_MORE_TEXT).
        other_text_is_longer = (
            store_mode == OcrTextStoreSettings.STORE_IF_MORE_TEXT
            and stat.parsed_text_len > stat.parsed_ocr_text_len)
        if not (enough_other_text or other_text_is_longer):
            return

        # Map every image span onto an empty span at its own start.
        # NOTE(review): result.text is not reassigned in this method - confirm
        # that apply_transformations is enough to drop the OCR text itself.
        collapsed = [(span, (span[0], span[0])) for span in image_spans]
        result.apply_transformations(collapsed)
Ejemplo n.º 5
0
    def process_inner_tag(self, result: MarkedUpText, tag_regex: Pattern,
                          make_text_function: Callable[[Any], str]) -> None:
        """
        this "private" method finds every ``tag_regex`` match inside
        ``result.text``, replaces it with plain text and stores the position of
        each replacement under the 'a' label

        The replacement comes from ``make_text_function(match)``. It gets a
        leading space unless the tag starts the text or follows a character from
        ``self.str_spaces``, and a trailing space unless the tag ends the text or
        is followed by a character from ``self.str_phrase_separators``.

        Replacements that change the length are reported to
        ``result.apply_transformations`` as
        ``((src_start, src_end), (src_start, new_end))`` pairs in coordinates of
        the text before the call. The new 'a' spans (padding included) are
        appended afterwards and are expressed in coordinates of the rewritten
        text.

        :param result: a MarkedUpText variable with plain text to process; its
                       ``text`` and ``labels`` are updated
        :param tag_regex: compiled pattern of the tag to find
        :param make_text_function: method that processes the tag found
                                   transforming the tag into plain text
        """
        source = result.text
        source_len = len(source)
        # Collect the pieces and join once instead of growing a string with +=.
        pieces = []  # type: List[str]
        transformations = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        new_labels = []  # type: List[Tuple[int, int]]
        last_stop = 0
        out_len = 0  # length of the rewritten text assembled so far

        for match in tag_regex.finditer(source):
            link_markup = make_text_function(match)
            src_s, src_e = match.span()
            # ensure spaces between text and link text
            starts_space = src_s == 0 or source[src_s - 1] in self.str_spaces
            # match.end() is exclusive: source[src_e] is the first character
            # after the tag, and src_e == source_len means the tag ends the text.
            ends_space = (src_e >= source_len
                          or source[src_e] in self.str_phrase_separators)
            if not starts_space:
                link_markup = ' ' + link_markup
            if not ends_space:
                link_markup += ' '

            end_e = src_s + len(link_markup)
            if end_e != src_e:
                transformations.append(((src_s, src_e), (src_s, end_e)))

            untouched = source[last_stop:src_s]
            pieces.append(untouched)
            out_len += len(untouched)
            # The label is added after the text is rewritten, so it must use the
            # offset in the new text; earlier replacements may have shifted it
            # away from src_s.
            new_labels.append((out_len, out_len + len(link_markup)))
            pieces.append(link_markup)
            out_len += len(link_markup)
            last_stop = src_e

        # Tail of the text after the last match (the whole text if none matched).
        pieces.append(source[last_stop:])
        result.text = ''.join(pieces)
        if transformations:
            result.apply_transformations(transformations)
        if new_labels:
            result.labels['a'] = result.labels.get('a', []) + new_labels