コード例 #1
0
    def check_ocr_text(self, result: MarkedUpText) -> None:
        """
        Decide whether the text obtained from embedded images (OCR) stays
        in the resulting plain text, and update the parsing statistics.
        Does nothing if the text is empty or carries no 'images' labels.
        :param result: MarkedUpText, containing resulting plain text
        """
        if not result.text:
            return
        if 'images' not in result.labels:
            return

        storing_mode = self.settings.ocr_sets
        if storing_mode == OcrTextStoreSettings.NEVER_STORE:
            return

        image_spans = result.labels['images']
        ocr_len = 0
        for span_start, span_end in image_spans:
            ocr_len += result.count_non_space_chars(span_start, span_end)

        # parsed_text_len is reduced by the OCR-ed part
        stat = self.parse_stat
        stat.parsed_ocr_text_len = ocr_len
        stat.parsed_text_len -= ocr_len
        if storing_mode == OcrTextStoreSettings.STORE_ALWAYS:
            return

        enough_other_text = (
            storing_mode == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT
            and stat.parsed_text_len >= self.settings.ocr_vector_text_min_length)
        other_text_prevails = (
            storing_mode == OcrTextStoreSettings.STORE_IF_MORE_TEXT
            and stat.parsed_text_len > stat.parsed_ocr_text_len)
        if not (enough_other_text or other_text_prevails):
            return

        # map every image span onto an empty span located at its start
        result.apply_transformations(
            [(span, (span[0], span[0])) for span in image_spans])
コード例 #2
0
 def test_apply_transformations_(self):
     """Two shrinking transformations must shift both 'p' labels."""
     source = 'A text   with extra   spaces.'
     marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 28)]})
     shrink_spans = [((6, 9), (6, 7)), ((19, 22), (19, 20))]
     marked.apply_transformations(shrink_spans)

     actual = marked.labels['p']
     for index, expected in enumerate([(6, 10), (18, 24)]):
         self.assertEqual(expected, actual[index])
コード例 #3
0
 def test_apply_transform(self):
     """A single transformation shrinking (0, 9) to (0, 1) must shift both 'p' labels."""
     source = 'a123456789b123456789c123456789d123456789e123456789'
     marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 28)]})
     marked.apply_transformations([((0, 9), (0, 1))])

     for index, expected in enumerate([(0, 4), (14, 20)]):
         self.assertEqual(expected, marked.labels['p'][index])
コード例 #4
0
    def process_inner_tag(self, result: MarkedUpText, tag_label: str,
                          tag_regex: Pattern,
                          make_text_function: Callable[[Any], str]) -> None:
        """
        this "private" method finds tags inside the text and replaces each of them
        with its plain text representation wrapped into opening / closing markers
        :param result: a MarkedUpText variable with plain text to process
        :param tag_label: label name passed to MarkedUpText.get_marker to build the markers
        :param tag_regex: tag to find
        :param make_text_function: method that processes the tag found transforming the tag into plain text
        """
        text = result.text
        text_len = len(text)
        pieces = []
        last_stop = 0

        for match in tag_regex.finditer(text):
            link_markup = make_text_function(match)
            src_s, src_e = match.span()
            # ensure spaces between text and link text;
            # match.end() is exclusive, so text[src_e] is the first character
            # after the tag and src_e == len(text) means the tag closes the text
            starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
            ends_space = src_e >= text_len or \
                text[src_e] in self.str_phrase_separators
            if not starts_space:
                link_markup = ' ' + link_markup
            if not ends_space:
                link_markup += ' '

            pieces.append(text[last_stop:src_s])
            pieces.append(MarkedUpText.get_marker(tag_label, True))
            pieces.append(link_markup)
            pieces.append(MarkedUpText.get_marker(tag_label, False))
            last_stop = src_e

        pieces.append(text[last_stop:])
        result.text = ''.join(pieces)
コード例 #5
0
    def unescape(result: MarkedUpText) -> None:
        """
        a "private" method that substitutes every match of _charref (HTML codes
        like &gt;) in the resulting plain text with what _replace_charref
        returns for it; replacements that change the length are reported to
        result.apply_transformations
        :param result: MarkedUpText containing resulting plain text
        """
        source = result.text
        chunks = []
        shifts = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        cursor = 0

        for found in _charref.finditer(source):
            substitute = _replace_charref(found)
            ref_start, ref_end = found.span()
            new_end = ref_start + len(substitute)
            # only length-changing replacements are recorded
            if new_end != ref_end:
                shifts.append(((ref_start, ref_end), (ref_start, new_end)))
            chunks.append(source[cursor:ref_start])
            chunks.append(substitute)
            cursor = ref_end

        chunks.append(source[cursor:])
        result.text = ''.join(chunks)
        if shifts:
            result.apply_transformations(shifts)
コード例 #6
0
    def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
        """
        Replaces single linebreaks inside a paragraph with a space character,
        unless the paragraph looks like a list: a paragraph is treated as a list
        when the number of non-empty lines that do not match re_list_start is
        at most one third (rounded up) of all its non-empty lines.
        When there are no 'paragraphs' labels the whole text is one paragraph.
        :param result: MarkedUpText containing resulted plain text
        """
        spans = result.labels.get('paragraphs') or [(0, len(result.text))]
        for span_start, span_end in spans:
            lines = [
                ln for ln in result.text[span_start:span_end].split('\n')
                if ln.strip()
            ]
            if not lines:
                continue

            # lines that look like list items
            list_like = sum(1 for ln in lines if self.re_list_start.match(ln))
            plain_limit = math.ceil(len(lines) / 3)
            if len(lines) - list_like <= plain_limit:
                # the lines make a list: keep the line breaks
                continue

            result.replace_by_regex(self.re_single_newline, ' ', span_start,
                                    span_end)
コード例 #7
0
    def test_replace_by_text_extra(self):
        """Replacing triple spaces with single ones must shift the 'p' labels."""
        source = 'A text   with extra   spaces.   '
        marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 28)]})
        marked.replace_by_string('   ', ' ')

        self.assertEqual('A text with extra spaces. ', marked.text)
        actual = marked.labels['p']
        for index, expected in enumerate([(6, 10), (18, 24)]):
            self.assertEqual(expected, actual[index])
コード例 #8
0
    def test_replace_by_regex_extra_end(self):
        """Whitespace runs collapse to one space; the second label reaches the last character."""
        source = 'A text   with extra   spaces.   '
        marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 29)]})
        marked.replace_by_regex(re.compile(r'\s+'), ' ')

        self.assertEqual('A text with extra spaces. ', marked.text)
        actual = marked.labels['p']
        for index, expected in enumerate([(6, 10), (18, 25)]):
            self.assertEqual(expected, actual[index])
コード例 #9
0
    def test_replace_by_regex_extra_longer(self):
        """Whitespace runs collapse to one space; all three labels are shifted."""
        source = 'A text   with extra   spaces, and   more spaces'
        initial_spans = [(7, 12), (22, 32), (41, 46)]
        marked = MarkedUpText(source, labels={'p': initial_spans})
        marked.replace_by_regex(re.compile(r'\s+'), ' ')

        self.assertEqual('A text with extra spaces, and more spaces', marked.text)
        actual = marked.labels['p']
        for index, expected in enumerate([(6, 10), (18, 28), (35, 40)]):
            self.assertEqual(expected, actual[index])
コード例 #10
0
 def try_parse_document(self,
                        ptrs: ParsingTaskParams) -> DocumentParsingResults:
     """
     Tries to read a *.txt file as plain text: the file is accepted when
     its detected MIME type is text/plain and the encoding is detected
     with confidence above 0.9.
     :return: DocumentParsingResults with the text and 'plain text' parser name
              or empty DocumentParsingResults if the file was not parsed
     """
     extension = ptrs.ext.strip(' .').lower()
     if extension != 'txt':
         return DocumentParsingResults()

     log = ptrs.logger
     if log:
         log.info('Trying plain text extract for file: ' +
                  ptrs.original_file_name)
     try:
         import magic
         mime_type = magic.Magic(mime=True).from_file(ptrs.file_path)
         if mime_type == 'text/plain':
             import chardet
             with open(ptrs.file_path, "rb") as source:
                 raw = source.read()
             guess = chardet.detect(raw)
             if guess['confidence'] > 0.9:
                 decoded = raw.decode(guess['encoding'])
                 return DocumentParsingResults(text=MarkedUpText(decoded),
                                               parser='plain text')
     except Exception as ex:
         if log:
             log.info(
                 'Caught exception while trying to parse file '
                 f'with plain text parser: {ptrs.original_file_name}'
                 f'\n{format_exc()}')
         if ptrs.propagate_exceptions:
             raise ex
     # not a plain text file, a doubtful encoding or a suppressed error
     return DocumentParsingResults()
コード例 #11
0
    def check_ocr_text(self, result: MarkedUpText) -> None:
        """
        a "private" method that checks text obtained from embedded images
        The method decides whether to leave or to delete these pieces of text
        and updates the parsing statistics (self.parse_stat) on the way
        :param result: MarkedUpText, containing resulting plain text
        """
        if self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
            return

        # parsed_text_len is reduced by the OCR-ed part
        self.parse_stat.parsed_ocr_text_len = self.count_text_in_images(result)
        self.parse_stat.parsed_text_len -= self.parse_stat.parsed_ocr_text_len
        if self.settings.ocr_sets == OcrTextStoreSettings.STORE_ALWAYS:
            return

        # remove some of OCR-d text fragments or remove all of them or just quit
        remove_ocrs = False
        if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT and \
                self.parse_stat.parsed_text_len >= self.settings.ocr_vector_text_min_length:
            remove_ocrs = True

        if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_MORE_TEXT and \
                self.parse_stat.parsed_text_len > self.parse_stat.parsed_ocr_text_len:
            remove_ocrs = True

        if not remove_ocrs:
            return

        # do remove text that was obtained from images:
        # remove_text_in_images() takes the MarkedUpText itself, edits its text
        # in place and returns nothing, so its result must not be assigned
        self.remove_text_in_images(result)
コード例 #12
0
    def try_parse_document(self,
                           ptrs: ParsingTaskParams) -> DocumentParsingResults:
        """
        Tries to parse the file with XmlWordxExtractor.
        :return: DocumentParsingResults with the text, 'msword' parser name and
                 the extractor's tables, or empty DocumentParsingResults when
                 the file can not be processed or a suppressed error occurred
        """
        def log_func(s):
            # messages are dropped when there is no logger
            return ptrs.logger.info(s) if ptrs.logger else None

        try:
            xtractor = XmlWordxExtractor(log_func=log_func)
            if xtractor.can_process_file(ptrs.original_file_name):
                if ptrs.logger:
                    ptrs.logger.info('Trying MS Word extract for file: ' +
                                     ptrs.original_file_name)
                parsed_text = MarkedUpText(xtractor.parse_file(ptrs.file_path))
                return DocumentParsingResults(parsed_text, 'msword', None,
                                              xtractor.tables)
            return DocumentParsingResults()
        except Exception as ex:
            if ptrs.logger:
                ptrs.logger.info(
                    'Caught exception while trying to parse file '
                    f'with MS Word parser: {ptrs.original_file_name}'
                    f'\n{format_exc()}')
            if not ptrs.propagate_exceptions:
                return DocumentParsingResults()
            raise ex
コード例 #13
0
    def process_inner_tag(self, result: MarkedUpText, tag_regex: Pattern,
                          make_text_function: Callable[[Any], str]) -> None:
        """
        this "private" method finds tags inside the text, replaces each of them
        with its plain text representation and stores a label per tag
        under the 'a' key of result.labels
        :param result: a MarkedUpText variable with plain text to process
        :param tag_regex: tag to find
        :param make_text_function: method that processes the tag found transforming the tag into plain text
        """
        text = result.text
        text_len = len(text)
        pieces = []
        transformations = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        last_stop = 0
        new_labels = []

        for match in tag_regex.finditer(text):
            link_markup = make_text_function(match)
            src_s, src_e = match.span()
            # ensure spaces between text and link text;
            # match.end() is exclusive, so text[src_e] is the first character
            # after the tag and src_e == len(text) means the tag closes the text
            starts_space = src_s == 0 or text[src_s - 1] in self.str_spaces
            ends_space = src_e >= text_len or \
                text[src_e] in self.str_phrase_separators
            if not starts_space:
                link_markup = ' ' + link_markup
            if not ends_space:
                link_markup += ' '

            # only replacements that change the length are recorded as transformations
            end_e = src_s + len(link_markup)
            if end_e != src_e:
                transformations.append(((src_s, src_e), (src_s, end_e)))
            new_labels.append((src_s, end_e))

            pieces.append(text[last_stop:src_s])
            pieces.append(link_markup)
            last_stop = src_e

        pieces.append(text[last_stop:])
        result.text = ''.join(pieces)
        if transformations:
            result.apply_transformations(transformations)
        # the new labels are added after the transformations were applied
        if new_labels:
            if 'a' not in result.labels:
                result.labels['a'] = new_labels
            else:
                result.labels['a'] = result.labels['a'] + new_labels
コード例 #14
0
    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        Sets the TIKA_ENV_VAR_FLAG_MODE and TIKA_PARSER_DETAIL variables in os.environ.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: passed as is to read_output
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings, may be None (nothing is logged then)
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when the Tika output can not be parsed, the original error is chained
        """
        def log_info(message: str) -> None:
            # "logger" defaults to None: don't crash when it's not provided
            if logger is not None:
                logger.info(message)

        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
        # don't use at all TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        os.environ[self.TIKA_PARSER_DETAIL] = ''

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            log_info(f'TIKA parsing {original_file_name}:\n{line}')

        log_info(f'Tika (plain text) args: {", ".join(cmd)}')

        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            # attach the beginning of the output to simplify troubleshooting
            # NOTE(review): the message names parse_default_pdf_ocr, not this method -
            # looks like a copy-paste, kept as is in case something matches on it
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
コード例 #15
0
 def remove_text_in_images(self, result: MarkedUpText):
     """
     Cuts out of result.text every block enclosed in the 'images' markers,
     the markers included. The text is changed in place.
     :param result: MarkedUpText whose text is cleared of the image blocks
     """
     im_op, im_cl = MarkedUpText.BLOCK_MARKERS['images']
     while True:
         p_start = result.text.find(im_op)
         if p_start < 0:
             break
         start = p_start + len(im_op)
         p_end = result.text.find(im_cl, start)
         if p_end < 0:
             # an opening marker without the closing one: nothing changes
             # on the next pass, so "continue" here would loop forever
             break
         p_end += len(im_cl)
         result.text = result.text[:p_start] + result.text[p_end:]
コード例 #16
0
    def count_text_in_images(self, result: MarkedUpText) -> int:
        """
        Sums result.count_non_space_chars() over the spans enclosed
        in the 'images' opening / closing markers.
        :param result: MarkedUpText whose text is scanned for the markers
        :return: the accumulated count
        """
        opener, closer = MarkedUpText.BLOCK_MARKERS['images']
        total = 0
        marker_pos = result.text.find(opener)
        while marker_pos >= 0:
            body_start = marker_pos + len(opener)
            body_end = result.text.find(closer, body_start)
            if body_end >= 0:
                total += result.count_non_space_chars(body_start, body_end)
            # the next opening marker is searched right after the current one
            marker_pos = result.text.find(opener, body_start)
        return total
コード例 #17
0
 def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
     """
     Tries to extract the text with textract2text and the tables with parse_pdf_tables.
     :return: DocumentParsingResults with the text, 'textract' parser name and
              self.tables, or empty DocumentParsingResults on a suppressed error
     """
     log = ptrs.logger
     if log:
         log.info('Trying Textract for file: ' +
                  ptrs.original_file_name)
     try:
         extracted = textract2text(ptrs.file_path, ext=ptrs.ext)
         self.parse_pdf_tables(ptrs)
         marked = MarkedUpText(extracted)
         return DocumentParsingResults(marked, 'textract', None, self.tables)
     except Exception as ex:
         if log:
             log.error('Caught exception while trying to parse file '
                       f'with Textract: {ptrs.original_file_name}'
                       f'\n{format_exc()}')
         if not ptrs.propagate_exceptions:
             return DocumentParsingResults()
         raise ex
コード例 #18
0
    def process_meta(self, tag_text: str, result: MarkedUpText) -> None:
        """
        a "private" method to get metadata from a tag like <meta name="pdf:PDFVersion" content="1.4"/>
        The text of the first re_name_attr match is used as the key and the text
        of the first re_content_attr match (or '') as the value in result.meta.
        Nothing is stored when the name is not found or empty.
        :param tag_text: <meta> tag's full text
        :param result: MarkedUpText variable to store tag's value in
        """
        name_match = next(self.re_name_attr.finditer(tag_text), None)
        meta_name = name_match.group(0) if name_match is not None else ''
        if not meta_name:
            return

        content_match = next(self.re_content_attr.finditer(tag_text), None)
        result.meta[meta_name] = \
            content_match.group(0) if content_match is not None else ''
コード例 #19
0
    def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
        """
        Removes linebreaks in the middle of the sentence. Usually, single linebreaks
        within a paragraph should be deleted and replaced with one space character.
        But we preserve the linebreaks if the paragraph is a list or a table.
        Unfortunately, presently we can't recognize a paragraph as a table (if the
        source is a PDF file).
        Paragraphs are the spans between the 'paragraphs' opening and closing markers.
        :param result: MarkedUpText containing resulted plain text
        """
        paragraph_op, paragraph_cl = MarkedUpText.BLOCK_MARKERS['paragraphs']

        start = 0
        while True:
            p_start = result.text.find(paragraph_op, start)
            if p_start < 0:
                break
            start = p_start + len(paragraph_op)
            p_end = result.text.find(paragraph_cl, start)
            if p_end < 0:
                continue

            # remove extra "\n" between p_start, p_end
            par_text = result.text[start:p_end]
            par_lines = [l for l in par_text.split('\n') if l.strip()]
            if not par_lines:
                start = p_end + len(paragraph_cl)
                continue

            # if lines make a list then don't remove line breaks:
            # at most a third (rounded up) of the lines may be non-list lines
            list_lines = sum(
                1 for line in par_lines if self.re_list_start.match(line))
            max_breaks_allowed = math.ceil(len(par_lines) / 3)
            is_list = len(par_lines) - list_lines <= max_breaks_allowed

            if not is_list:
                par_text = self.re_single_newline.sub(' ', par_text)
                result.text = result.text[:start] + par_text + result.text[
                    p_end:]
            # resume right after the closing marker; its position is derived from
            # the (possibly rewritten) paragraph so it stays valid even if the
            # substitution changed the paragraph's length
            start = start + len(par_text) + len(paragraph_cl)
コード例 #20
0
    def test_replace_by_regex_limited(self):
        """
        Replacing within the (0, len(text)) range must equal the unlimited
        replacement, replacing within the first half only must differ from it.
        """
        text = """
        <p>Here (Improve  text segmentation   (section / page / paragraph / sentence), section 1.1 Use 
        markup from document parser) I described Tika’s   output in XHTML. In short:
        </p>
        """
        spans_by_label = {'p': [(7, 12), (22, 28)]}
        whitespace = re.compile(r'\s+')

        def fresh_markup():
            # every run gets its own copy of the label lists
            return MarkedUpText(
                text,
                labels={name: list(spans)
                        for name, spans in spans_by_label.items()})

        unlimited = fresh_markup()
        unlimited.replace_by_regex(whitespace, ' ')

        whole_range = fresh_markup()
        whole_range.replace_by_regex(whitespace, ' ', 0, len(text))
        self.assertEqual(unlimited.text, whole_range.text)

        first_half = fresh_markup()
        first_half.replace_by_regex(whitespace, ' ', 0, len(text) // 2)
        self.assertNotEqual(unlimited.text, first_half.text)
コード例 #21
0
 def test_replace_by_regex_none(self):
     """A pattern that matches nothing must leave the text untouched."""
     source = 'A text   with extra   spaces.'
     marked = MarkedUpText(source)
     marked.replace_by_regex(re.compile(r'AbC'), ' ')
     self.assertEqual(source, marked.text)
コード例 #22
0
    def parse_text(self,
                   markup: str,
                   detect_tables: bool = True) -> MarkedUpText:
        """
        The only method to call by external code. Transforms "markup" (XHTML string)
        into plain text with some formatting and extra information stored into MarkedUpText structure.
        :param markup: string containing XHTML
        :param detect_tables: whether or not to parse and store tables (MarkedUpTable) markup information
        :return: MarkedUpText - resulted text, paragraphs, pages, headings and tables markup information
        """
        result = MarkedUpText('', {'pages': [], 'paragraphs': []})
        if detect_tables:
            result.tables = self.detect_tables(markup)

        cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]
        for tg in self.re_tag.finditer(markup):
            tag_text = tg.group(0)
            tag_name = self.get_tag_name(tag_text)
            tag_start, tag_end = (tg.start(), tg.end())
            if not tag_name:
                continue

            if tag_name == 'meta':
                self.process_meta(tag_text, result)
                continue

            tag_type = self.get_tag_type(tag_text)

            if tag_name == 'div' and 'class="page"' in tag_text:
                # only the page start is known here, the end (0 is a placeholder)
                # is filled in after the whole markup is processed
                result.labels['pages'].append((len(result.text), 0))
                continue

            block_type = 'image' if tag_name == 'div' and 'class="ocr"' in tag_text \
                else 'paragraph' if tag_name == 'p' \
                else 'heading' if self.re_tag_h.match(tag_name) else tag_name
            is_text_block = block_type == 'paragraph' or block_type == 'heading' or block_type == 'image'

            # 'o' is handled as an opening tag and 'c' as a closing one
            if is_text_block and tag_type == 'o':
                if block_type == 'image' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                    continue
                cur_block = TikaXhtmlParser.BlockProps(tg.end(), block_type)
                if block_type == 'paragraph':
                    cur_block.label_name = 'paragraphs'
                elif block_type == 'heading':
                    cur_block.label_name = 'heading_' + tag_name[1:]
                elif block_type == 'image':
                    cur_block.label_name = 'images'
                continue

            if (is_text_block
                    or block_type == 'div') and tag_type == 'c' and cur_block:
                # the block's content is the markup between its opening tag's end
                # and the closing tag's start
                end = tag_start
                line = markup[cur_block.start:end]
                p_start = len(result.text)
                p_end = p_start + len(line)
                result.text += line
                if not cur_block.is_inline:
                    result.text += '\n\n'
                if cur_block.label_name:
                    if cur_block.label_name not in result.labels:
                        result.labels[cur_block.label_name] = []
                    result.labels[cur_block.label_name].append(
                        (p_start, p_end))

                # check if the block belongs to a table
                self.update_tables_content(result.tables, tag_start, tag_end,
                                           p_start, p_end)

                cur_block = None
                continue

        # each page lasts until the next page starts, the last one - until
        # the end of the text; the ranges are stored back into the labels
        # (they used to be computed and then dropped)
        l_pages = result.labels['pages']
        if l_pages:
            page_starts = [page[0] for page in l_pages]
            page_ends = page_starts[1:] + [len(result.text)]
            result.labels['pages'] = list(zip(page_starts, page_ends))

        self.process_inner_tags(result)
        self.post_process(result)
        self.parse_stat.parsed_text_len = result.count_non_space_chars()
        self.check_ocr_text(result)
        self.update_tables_outer_bounds(result.tables)

        return result
コード例 #23
0
    def parse_text(self, markup: str) -> MarkedUpText:
        """
        The only method to call by external code. Transforms "markup" (XHTML string)
        into plain text with some formatting and extra information stored into MarkedUpText structure.
        Blocks (pages, paragraphs, headings, images, table parts) are registered
        through result.add_marker().
        :param markup: string containing XHTML
        :return: MarkedUpText - resulted text with the blocks' markers and metadata
        """
        result = MarkedUpText('', {'pages': [], 'paragraphs': []})

        cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]
        for tg in self.re_tag.finditer(markup):
            tag_text = tg.group(0)
            tag_name = self.get_tag_name(tag_text)
            tag_start = tg.start()
            if not tag_name:
                continue

            if tag_name == 'meta':
                self.process_meta(tag_text, result)
                continue

            tag_type = self.get_tag_type(tag_text)

            if tag_name == 'div' and 'class="page"' in tag_text:
                # both page markers are added at the same position
                result.add_marker('pages', True)
                result.add_marker('pages', False)
                continue

            # <th> is processed as <td>
            block_type = 'images' if tag_name == 'div' and 'class="ocr"' in tag_text \
                else 'paragraphs' if tag_name == 'p' \
                else 'heading' if self.re_tag_h.match(tag_name) \
                else 'td' if tag_name == 'th' \
                else tag_name
            is_text_block = block_type in {'paragraphs', 'heading',
                                           'images'}  #, 'table', 'tr', 'td'}

            # 'o' is handled as an opening tag and 'c' as a closing one
            if is_text_block and tag_type == 'o':
                if block_type == 'images' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                    continue
                cur_block = TikaXhtmlParser.BlockProps(tg.end(), block_type)
                if block_type in {'paragraphs', 'images', 'table', 'tr', 'td'}:
                    cur_block.label_name = block_type
                elif block_type == 'heading':
                    cur_block.label_name = 'heading_' + tag_name[1:]
                continue

            if block_type in {'table', 'tr', 'td'}:
                result.add_marker(block_type, tag_type == 'o')

            if (is_text_block
                    or block_type == 'div') and tag_type == 'c' and cur_block:
                # the block's content is the markup between its opening tag's end
                # and the closing tag's start
                line = markup[cur_block.start:tag_start]

                if cur_block.label_name:
                    result.add_marker(cur_block.label_name, True)
                result.text += line
                if not cur_block.is_inline:
                    result.text += '\n\n'
                if cur_block.label_name:
                    result.add_marker(cur_block.label_name, False)
                cur_block = None
                continue

        self.process_inner_tags(result)
        self.post_process(result)
        self.parse_stat.parsed_text_len = result.count_non_space_chars()
        self.check_ocr_text(result)
        return result