Ejemplo n.º 1
0
    def check_ocr_text(self, result: MarkedUpText) -> None:
        """
        Internal helper: decide what happens to the text taken from embedded images.

        Updates self.parse_stat on the way: parsed_ocr_text_len receives the number
        of non-space characters found inside the 'images' spans, and the same amount
        is subtracted from parsed_text_len. Depending on self.settings.ocr_sets the
        image spans are then either kept as they are or handed over to
        result.apply_transformations as zero-length target ranges.
        :param result: MarkedUpText, containing resulting plain text
        """
        if not result.text or 'images' not in result.labels:
            return

        store_mode = self.settings.ocr_sets
        if store_mode == OcrTextStoreSettings.NEVER_STORE:
            # nothing to measure or to remove in this mode
            return

        image_spans = result.labels['images']
        stat = self.parse_stat
        stat.parsed_ocr_text_len = sum(
            result.count_non_space_chars(begin, finish)
            for begin, finish in image_spans)
        # what is left is the amount of text that lies outside of the image spans
        stat.parsed_text_len -= stat.parsed_ocr_text_len

        if store_mode == OcrTextStoreSettings.STORE_ALWAYS:
            return

        # OCR text is dropped when there is enough other text, or when
        # the other text outweighs the OCR text - depending on the mode
        enough_other_text = (
            store_mode == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT
            and stat.parsed_text_len >= self.settings.ocr_vector_text_min_length)
        other_text_prevails = (
            store_mode == OcrTextStoreSettings.STORE_IF_MORE_TEXT
            and stat.parsed_text_len > stat.parsed_ocr_text_len)

        if enough_other_text or other_text_prevails:
            # each span (start, end) is paired with the empty span (start, start)
            result.apply_transformations(
                [(span, (span[0], span[0])) for span in image_spans])
Ejemplo n.º 2
0
    def count_text_in_images(self, result: MarkedUpText) -> int:
        """
        Count non-space characters enclosed between the 'images' block markers.

        The text is scanned for every occurrence of the opening marker; for each
        one the nearest closing marker that follows it is looked up and the range
        in between is measured with result.count_non_space_chars.
        An opening marker that has no closing marker after it adds nothing.
        :param result: MarkedUpText whose text contains the block markers
        :return: total number of non-space characters found inside image blocks
        """
        opening, closing = MarkedUpText.BLOCK_MARKERS['images']
        total = 0
        content_start = 0
        marker_pos = result.text.find(opening, content_start)
        while marker_pos >= 0:
            # the block content begins right after the opening marker
            content_start = marker_pos + len(opening)
            content_end = result.text.find(closing, content_start)
            if content_end >= 0:
                total += result.count_non_space_chars(content_start, content_end)
            # the next search resumes right after the current opening marker
            marker_pos = result.text.find(opening, content_start)

        return total
Ejemplo n.º 3
0
    def parse_text(self,
                   markup: str,
                   detect_tables: bool = True) -> MarkedUpText:
        """
        The only method to call by external code. Transforms "markup" (XHTML string)
        into plain text with some formatting and extra information stored into MarkedUpText structure.

        While the tags are being read a page label only knows where the page starts.
        After the loop each page label becomes a (start, end) pair: a page ends where
        the next page starts, the last page ends at the end of the collected text.
        :param markup: string containing XHTML
        :param detect_tables: whether or not to parse and store tables (MarkedUpTable) markup information
        :return: MarkedUpText - resulted text, paragraphs, pages, headings and tables markup information
        """
        result = MarkedUpText('', {'pages': [], 'paragraphs': []})
        if detect_tables:
            result.tables = self.detect_tables(markup)

        cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]
        for tg in self.re_tag.finditer(markup):
            tag_text = tg.group(0)
            tag_name = self.get_tag_name(tag_text)
            tag_start, tag_end = (tg.start(), tg.end())
            if not tag_name:
                continue

            if tag_name == 'meta':
                self.process_meta(tag_text, result)
                continue

            tag_type = self.get_tag_type(tag_text)

            if tag_name == 'div' and 'class="page"' in tag_text:
                # only the page start is known here, the end is filled in after the loop
                result.labels['pages'].append((len(result.text), 0))
                continue

            block_type = 'image' if tag_name == 'div' and 'class="ocr"' in tag_text \
                else 'paragraph' if tag_name == 'p' \
                else 'heading' if self.re_tag_h.match(tag_name) else tag_name
            is_text_block = block_type in ('paragraph', 'heading', 'image')

            if is_text_block and tag_type == 'o':
                if block_type == 'image' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                    continue
                # the block content starts right after the opening tag
                cur_block = TikaXhtmlParser.BlockProps(tag_end, block_type)
                if block_type == 'paragraph':
                    cur_block.label_name = 'paragraphs'
                elif block_type == 'heading':
                    # the label keeps whatever follows the first character of the tag name
                    cur_block.label_name = 'heading_' + tag_name[1:]
                elif block_type == 'image':
                    cur_block.label_name = 'images'
                continue

            if (is_text_block
                    or block_type == 'div') and tag_type == 'c' and cur_block:
                # the block content is everything between the opening and this closing tag
                line = markup[cur_block.start:tag_start]
                p_start = len(result.text)
                p_end = p_start + len(line)
                result.text += line
                if not cur_block.is_inline:
                    result.text += '\n\n'
                if cur_block.label_name:
                    if cur_block.label_name not in result.labels:
                        result.labels[cur_block.label_name] = []
                    result.labels[cur_block.label_name].append(
                        (p_start, p_end))

                # check if the block belongs to a table
                self.update_tables_content(result.tables, tag_start, tag_end,
                                           p_start, p_end)

                cur_block = None

        l_pages = result.labels['pages']
        if l_pages:
            # replace the (start, 0) placeholders with real (start, end) ranges;
            # the computed ranges used to be built and then thrown away
            page_starts = [page[0] for page in l_pages]
            page_ends = page_starts[1:] + [len(result.text)]
            result.labels['pages'] = list(zip(page_starts, page_ends))

        self.process_inner_tags(result)
        self.post_process(result)
        self.parse_stat.parsed_text_len = result.count_non_space_chars()
        self.check_ocr_text(result)
        self.update_tables_outer_bounds(result.tables)

        return result
Ejemplo n.º 4
0
    def parse_text(self, markup: str) -> MarkedUpText:
        """
        The only method to call by external code. Transforms "markup" (XHTML string)
        into plain text with some formatting and extra information stored into MarkedUpText structure.

        Pages, paragraphs, headings, OCR-ed images and table parts (table, tr, td)
        are reported to the result through MarkedUpText.add_marker calls.
        :param markup: string containing XHTML
        :return: MarkedUpText - resulted text with the markers added while parsing
        """
        result = MarkedUpText('', {'pages': [], 'paragraphs': []})

        cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]
        for tg in self.re_tag.finditer(markup):
            tag_text = tg.group(0)
            tag_name = self.get_tag_name(tag_text)
            if not tag_name:
                continue

            if tag_name == 'meta':
                self.process_meta(tag_text, result)
                continue

            tag_type = self.get_tag_type(tag_text)

            if tag_name == 'div' and 'class="page"' in tag_text:
                # a page div adds an opening marker immediately followed by a closing one
                result.add_marker('pages', True)
                result.add_marker('pages', False)
                continue

            # <th> is treated as <td>; a tag that fits no rule keeps its own name
            block_type = 'images' if tag_name == 'div' and 'class="ocr"' in tag_text \
                else 'paragraphs' if tag_name == 'p' \
                else 'heading' if self.re_tag_h.match(tag_name) \
                else 'td' if tag_name == 'th' \
                else tag_name
            is_text_block = block_type in {'paragraphs', 'heading', 'images'}

            if is_text_block and tag_type == 'o':
                if block_type == 'images' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                    continue
                # the block content starts right after the opening tag
                cur_block = TikaXhtmlParser.BlockProps(tg.end(), block_type)
                if block_type == 'heading':
                    # the label keeps whatever follows the first character of the tag name
                    cur_block.label_name = 'heading_' + tag_name[1:]
                else:
                    # only 'paragraphs' and 'images' get here, they are label names already
                    cur_block.label_name = block_type
                continue

            if block_type in {'table', 'tr', 'td'}:
                # the flag passed to add_marker is True for an opening tag
                result.add_marker(block_type, tag_type == 'o')

            if (is_text_block
                    or block_type == 'div') and tag_type == 'c' and cur_block:
                if cur_block.label_name:
                    result.add_marker(cur_block.label_name, True)
                # the block content is everything between the opening and this closing tag
                result.text += markup[cur_block.start:tg.start()]
                if not cur_block.is_inline:
                    result.text += '\n\n'
                if cur_block.label_name:
                    result.add_marker(cur_block.label_name, False)
                cur_block = None

        self.process_inner_tags(result)
        self.post_process(result)
        self.parse_stat.parsed_text_len = result.count_non_space_chars()
        self.check_ocr_text(result)
        return result