def check_ocr_text(self, result: MarkedUpText) -> None:
    """
    A "private" method that checks text obtained from embedded images.
    The method decides whether to keep or to delete these pieces of text.
    :param result: MarkedUpText containing the resulting plain text
    """
    if not result.text or 'images' not in result.labels:
        return
    # remove some of the OCR-d text fragments, remove all of them or just quit
    if self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
        return
    images = result.labels['images']
    self.parse_stat.parsed_ocr_text_len = sum(
        [result.count_non_space_chars(l_s, l_e) for l_s, l_e in images])
    self.parse_stat.parsed_text_len -= self.parse_stat.parsed_ocr_text_len
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_ALWAYS:
        return

    remove_ocrs = False
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT and \
            self.parse_stat.parsed_text_len >= self.settings.ocr_vector_text_min_length:
        remove_ocrs = True
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_MORE_TEXT and \
            self.parse_stat.parsed_text_len > self.parse_stat.parsed_ocr_text_len:
        remove_ocrs = True
    if not remove_ocrs:
        return
    transformations = [(l, (l[0], l[0])) for l in images]
    result.apply_transformations(transformations)

def test_apply_transformations_(self):
    text = 'A text with extra spaces.'
    markup = MarkedUpText(text, labels={'p': [(7, 12), (22, 28)]})
    markup.apply_transformations([((6, 9), (6, 7)), ((19, 22), (19, 20))])
    labels = markup.labels['p']
    self.assertEqual((6, 10), labels[0])
    self.assertEqual((18, 24), labels[1])

def test_apply_transform(self):
    text = 'a123456789b123456789c123456789d123456789e123456789'
    markup = MarkedUpText(text, labels={'p': [(7, 12), (22, 28)]})
    trans = [((0, 9), (0, 1))]
    markup.apply_transformations(trans)
    self.assertEqual((0, 4), markup.labels['p'][0])
    self.assertEqual((14, 20), markup.labels['p'][1])

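# A minimal illustration (not part of the original test suite) of the offset
# arithmetic exercised above: apply_transformations() shifts label offsets that
# lie after a replaced span by the span's size difference, and clamps offsets
# that fall inside the replaced span to its new boundaries.
def _demo_apply_transformations():
    markup = MarkedUpText('a123456789b123456789', labels={'p': [(12, 15)]})
    # the span (0, 9) shrinks to (0, 1), i.e. by 8 characters,
    # so the label (12, 15) moves to (4, 7)
    markup.apply_transformations([((0, 9), (0, 1))])
    assert markup.labels['p'][0] == (4, 7)
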
def process_inner_tag(self,
                      result: MarkedUpText,
                      tag_label: str,
                      tag_regex: Pattern,
                      make_text_function: Callable[[Any], str]) -> None:
    """
    This "private" method finds tags inside the text, given their start and end
    positions, and stores the labels found in result.
    :param result: a MarkedUpText variable with plain text to process
    :param tag_label: label name used to mark the found tags in the result
    :param tag_regex: tag to find
    :param make_text_function: method that processes the tag found, transforming the tag into plain text
    """
    new_text = ''
    last_stop = 0
    for match in tag_regex.finditer(result.text):
        link_markup = make_text_function(match)
        src_s, src_e = (match.start(), match.end())
        # ensure spaces between text and link text
        starts_space = src_s == 0 or result.text[src_s - 1] in self.str_spaces
        ends_space = src_e >= len(result.text) or \
            result.text[src_e] in self.str_phrase_separators
        if not starts_space:
            link_markup = ' ' + link_markup
        if not ends_space:
            link_markup += ' '
        new_text += result.text[last_stop:src_s]
        new_text += MarkedUpText.get_marker(tag_label, True)
        new_text += link_markup
        new_text += MarkedUpText.get_marker(tag_label, False)
        last_stop = src_e
    new_text += result.text[last_stop:len(result.text)]
    result.text = new_text

def unescape(result: MarkedUpText) -> None:
    """
    A "private" method to replace HTML codes like &gt; with the corresponding
    symbols in the resulting plain text.
    :param result: MarkedUpText containing the resulting plain text
    """
    new_text = ''
    transformations = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    last_stop = 0
    for match in _charref.finditer(result.text):
        replacement = _replace_charref(match)
        src_s, src_e = (match.start(), match.end())
        end_e = src_s + len(replacement)
        if end_e != src_e:
            transformations.append(((src_s, src_e), (src_s, end_e)))
        new_text += result.text[last_stop:src_s]
        new_text += replacement
        last_stop = src_e
    new_text += result.text[last_stop:len(result.text)]
    result.text = new_text
    if transformations:
        result.apply_transformations(transformations)

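# Worked example for unescape() above: if result.text == 'a &gt; b', the entity
# '&gt;' matches at (2, 6) and _replace_charref() returns '>', so new_text
# becomes 'a > b' and the recorded transformation is ((2, 6), (2, 3));
# apply_transformations() then shifts any label offsets located after
# position 6 three characters to the left.
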
def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
    """
    Removes linebreaks in the middle of a sentence.
    Usually, single linebreaks within a paragraph should be deleted and
    replaced with one space character. But we preserve the linebreaks if the
    paragraph is a list or a table. Unfortunately, presently we can't recognize
    a paragraph as a table (if the source is a PDF file).
    :param result: MarkedUpText containing the resulting plain text
    """
    paragraphs = result.labels.get('paragraphs') or [(0, len(result.text))]
    for par_start, par_end in paragraphs:
        # check the paragraph is not a list and, therefore, can be
        # cleared of extra line breaks
        par_text = result.text[par_start:par_end]
        par_lines = [l for l in par_text.split('\n') if l.strip()]
        if not par_lines:
            continue
        # if lines make a list then don't remove line breaks
        is_list = True
        list_lines = 0
        for line in par_lines:
            if self.re_list_start.match(line):
                list_lines += 1
        max_breaks_allowed = math.ceil(len(par_lines) / 3)
        if len(par_lines) - list_lines > max_breaks_allowed:
            is_list = False
        if not is_list:
            result.replace_by_regex(self.re_single_newline, ' ', par_start, par_end)

def test_replace_by_text_extra(self):
    text = 'A  text  with  extra  spaces.  '
    markup = MarkedUpText(text, labels={'p': [(7, 12), (22, 28)]})
    markup.replace_by_string('  ', ' ')
    self.assertEqual('A text with extra spaces. ', markup.text)
    labels = markup.labels['p']
    self.assertEqual((6, 10), labels[0])
    self.assertEqual((18, 24), labels[1])

def test_replace_by_regex_extra_end(self):
    text = 'A  text  with  extra  spaces.  '
    markup = MarkedUpText(text, labels={'p': [(7, 12), (22, 29)]})
    reg = re.compile(r'\s+')
    markup.replace_by_regex(reg, ' ')
    self.assertEqual('A text with extra spaces. ', markup.text)
    labels = markup.labels['p']
    self.assertEqual((6, 10), labels[0])
    self.assertEqual((18, 25), labels[1])

def test_replace_by_regex_extra_longer(self):
    text = 'A  text  with  extra  spaces, and  more  spaces'
    markup = MarkedUpText(text, labels={'p': [(7, 12), (22, 32), (41, 46)]})
    reg = re.compile(r'\s+')
    markup.replace_by_regex(reg, ' ')
    self.assertEqual('A text with extra spaces, and more spaces', markup.text)
    labels = markup.labels['p']
    self.assertEqual((6, 10), labels[0])
    self.assertEqual((18, 28), labels[1])
    self.assertEqual((35, 40), labels[2])

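# Offset arithmetic behind the expected labels above (given the doubled spaces
# in the source strings): replace_by_regex(r'\s+', ' ') drops one character from
# every two-space run, so a label offset shifts left by the number of runs
# collapsed before it. In test_replace_by_regex_extra_end the end offset 29 has
# four collapsed runs before it and becomes 25; a start offset that falls inside
# a collapsed run (offset 7 here) snaps to the run's new position, 6.
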
def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
    """
    :return: DocumentParsingResults(text, parser='plain text')
    """
    if ptrs.ext.strip(' .').lower() != 'txt':
        return DocumentParsingResults()
    if ptrs.logger:
        ptrs.logger.info('Trying plain text extract for file: ' + ptrs.original_file_name)
    try:
        import magic
        f = magic.Magic(mime=True)
        mime_type = f.from_file(ptrs.file_path)
        if mime_type == 'text/plain':
            import chardet
            with open(ptrs.file_path, "rb") as fr:
                file_bytes = fr.read()
            enc_data = chardet.detect(file_bytes)
            if enc_data['confidence'] > 0.9:
                txt = file_bytes.decode(enc_data['encoding'])
                rst = DocumentParsingResults(text=MarkedUpText(txt),
                                             parser='plain text')
                return rst
    except Exception as ex:
        if ptrs.logger:
            ptrs.logger.info(
                'Caught exception while trying to parse file '
                f'with plain text parser: {ptrs.original_file_name}'
                f'\n{format_exc()}')
        if ptrs.propagate_exceptions:
            raise ex
    return DocumentParsingResults()

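# A standalone sketch of the detection logic used above (python-magic plus
# chardet); 'path' is any text file path, nothing project-specific is assumed.
def _detect_and_decode_plain_text(path: str) -> str:
    import magic
    import chardet
    if magic.Magic(mime=True).from_file(path) != 'text/plain':
        return ''
    with open(path, 'rb') as fr:
        raw = fr.read()
    enc_data = chardet.detect(raw)  # e.g. {'encoding': 'ascii', 'confidence': 1.0, ...}
    if enc_data['confidence'] > 0.9:
        return raw.decode(enc_data['encoding'])
    return ''
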
def check_ocr_text(self, result: MarkedUpText) -> None:
    """
    A "private" method that checks text obtained from embedded images.
    The method decides whether to keep or to delete these pieces of text.
    :param result: MarkedUpText containing the resulting plain text
    """
    if self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
        return
    self.parse_stat.parsed_ocr_text_len = self.count_text_in_images(result)
    self.parse_stat.parsed_text_len -= self.parse_stat.parsed_ocr_text_len
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_ALWAYS:
        return

    # remove some of the OCR-d text fragments, remove all of them or just quit
    remove_ocrs = False
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT and \
            self.parse_stat.parsed_text_len >= self.settings.ocr_vector_text_min_length:
        remove_ocrs = True
    if self.settings.ocr_sets == OcrTextStoreSettings.STORE_IF_MORE_TEXT and \
            self.parse_stat.parsed_text_len > self.parse_stat.parsed_ocr_text_len:
        remove_ocrs = True
    if not remove_ocrs:
        return
    # do remove the text that was obtained from images
    self.remove_text_in_images(result)

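# Summary of the decision implemented by check_ocr_text() above
# (parsed_text_len already excludes the OCR-d characters at this point):
#   NEVER_STORE            -> return immediately, nothing to measure
#   STORE_ALWAYS           -> keep the OCR-d text unconditionally
#   STORE_IF_NO_OTHER_TEXT -> drop the OCR-d text when the remaining text is
#                             already long enough (>= ocr_vector_text_min_length)
#   STORE_IF_MORE_TEXT     -> drop the OCR-d text when the remaining text is
#                             longer than the OCR-d text itself
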
def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
    """
    :return: DocumentParsingResults(text, parser='msword')
    """
    try:
        log_func = lambda s: ptrs.logger.info(s) if ptrs.logger else None
        xtractor = XmlWordxExtractor(log_func=log_func)
        if not xtractor.can_process_file(ptrs.original_file_name):
            return DocumentParsingResults()
        if ptrs.logger:
            ptrs.logger.info('Trying MS Word extract for file: ' + ptrs.original_file_name)
        return DocumentParsingResults(
            MarkedUpText(xtractor.parse_file(ptrs.file_path)),
            'msword', None, xtractor.tables)
    except Exception as ex:
        if ptrs.logger:
            ptrs.logger.info(
                'Caught exception while trying to parse file '
                f'with MS Word parser: {ptrs.original_file_name}'
                f'\n{format_exc()}')
        if ptrs.propagate_exceptions:
            raise ex
    return DocumentParsingResults()

def process_inner_tag(self,
                      result: MarkedUpText,
                      tag_regex: Pattern,
                      make_text_function: Callable[[Any], str]) -> None:
    """
    This "private" method finds tags inside the text, given their start and end
    positions, and stores the labels found in result.
    :param result: a MarkedUpText variable with plain text to process
    :param tag_regex: tag to find
    :param make_text_function: method that processes the tag found, transforming the tag into plain text
    """
    new_text = ''
    transformations = []  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    last_stop = 0
    new_labels = []
    for match in tag_regex.finditer(result.text):
        link_markup = make_text_function(match)
        src_s, src_e = (match.start(), match.end())
        # ensure spaces between text and link text
        starts_space = src_s == 0 or result.text[src_s - 1] in self.str_spaces
        ends_space = src_e >= len(result.text) or \
            result.text[src_e] in self.str_phrase_separators
        if not starts_space:
            link_markup = ' ' + link_markup
        if not ends_space:
            link_markup += ' '
        end_e = src_s + len(link_markup)
        if end_e != src_e:
            transformations.append(((src_s, src_e), (src_s, end_e)))
        new_labels.append((src_s, end_e))
        new_text += result.text[last_stop:src_s]
        new_text += link_markup
        last_stop = src_e
    new_text += result.text[last_stop:len(result.text)]
    result.text = new_text
    if transformations:
        result.apply_transformations(transformations)
    if new_labels:
        if 'a' not in result.labels:
            result.labels['a'] = new_labels
        else:
            result.labels['a'] = result.labels['a'] + new_labels

def parse_file_local_plain_text(self,
                                local_path: str,
                                original_file_name: str,
                                task: Any,
                                timeout: int = 60,
                                encoding_name: str = 'utf-8',
                                logger: ProcessLogger = None,
                                enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a local Java process.
    Tika will use the plain text "stripper" and transform the source document
    into plain text inside its (Java) process.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object passed through to read_output
    :param timeout: timeout to interrupt the Java process, in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
    # don't use TIKA_MODE_PDF_ONLY at all
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
    os.environ[self.TIKA_PARSER_DETAIL] = ''

    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    cmd = tika_default_command_list + ['-J', '-t', f'-e{encoding_name}', local_path]

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

    text = read_output(cmd,
                       stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout,
                       task=task) or ''

    try:
        ptr_val = _parse((200, text))
        return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception('Error in parse_file_local_plain_text -> _parse(). Text:\n' +
                        text_sample) from ex

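# For illustration: with encoding_name='utf-8' the command assembled above looks
# roughly like
#   <tika_lexnlp_default_command_list ...> -J -t -eutf-8 /path/to/source.pdf
# where the leading part (the invocation of the Tika jar) comes from
# configuration that is not shown in this snippet.
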
def remove_text_in_images(self, result: MarkedUpText) -> None:
    im_op, im_cl = MarkedUpText.BLOCK_MARKERS['images']
    while True:
        p_start = result.text.find(im_op)
        if p_start < 0:
            break
        start = p_start + len(im_op)
        p_end = result.text.find(im_cl, start)
        if p_end < 0:
            # no closing marker found: stop scanning
            break
        p_end += len(im_cl)
        result.text = result.text[:p_start] + result.text[p_end:]

def count_text_in_images(self, result: MarkedUpText) -> int:
    text_len = 0
    im_op, im_cl = MarkedUpText.BLOCK_MARKERS['images']
    start = 0
    while True:
        p_start = result.text.find(im_op, start)
        if p_start < 0:
            break
        start = p_start + len(im_op)
        p_end = result.text.find(im_cl, start)
        if p_end < 0:
            continue
        text_len += result.count_non_space_chars(start, p_end)
    return text_len

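# Worked example for count_text_in_images() / remove_text_in_images() above,
# using hypothetical markers im_op == '{{img}}' and im_cl == '{{/img}}' (the
# real values come from MarkedUpText.BLOCK_MARKERS['images']): for the text
#   'before {{img}}scanned text{{/img}} after'
# count_text_in_images() counts the non-space characters between the markers
# ('scannedtext', i.e. 11), and remove_text_in_images() cuts out the whole
# '{{img}}...{{/img}}' block, leaving 'before  after'.
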
def try_parse_document(self, ptrs: ParsingTaskParams) -> DocumentParsingResults:
    if ptrs.logger:
        ptrs.logger.info('Trying Textract for file: ' + ptrs.original_file_name)
    try:
        text = textract2text(ptrs.file_path, ext=ptrs.ext)
        self.parse_pdf_tables(ptrs)
        return DocumentParsingResults(
            MarkedUpText(text), 'textract', None, self.tables)
    except Exception as ex:
        if ptrs.logger:
            ptrs.logger.error('Caught exception while trying to parse file '
                              f'with Textract: {ptrs.original_file_name}'
                              f'\n{format_exc()}')
        if ptrs.propagate_exceptions:
            raise ex
    return DocumentParsingResults()

def process_meta(self, tag_text: str, result: MarkedUpText) -> None:
    """
    A "private" method to get metadata from a tag like
    <meta name="pdf:PDFVersion" content="1.4"/>
    :param tag_text: <meta> tag's full text
    :param result: MarkedUpText variable to store the tag's value in
    """
    meta_name = ''
    for match in self.re_name_attr.finditer(tag_text):
        meta_name = match.group(0)
        break
    if not meta_name:
        return
    meta_val = ''
    for match in self.re_content_attr.finditer(tag_text):
        meta_val = match.group(0)
        break
    result.meta[meta_name] = meta_val

def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
    """
    Removes linebreaks in the middle of a sentence.
    Usually, single linebreaks within a paragraph should be deleted and
    replaced with one space character. But we preserve the linebreaks if the
    paragraph is a list or a table. Unfortunately, presently we can't recognize
    a paragraph as a table (if the source is a PDF file).
    :param result: MarkedUpText containing the resulting plain text
    """
    paragraph_op, paragraph_cl = MarkedUpText.BLOCK_MARKERS['paragraphs']
    start = 0
    while True:
        p_start = result.text.find(paragraph_op, start)
        if p_start < 0:
            break
        start = p_start + len(paragraph_op)
        p_end = result.text.find(paragraph_cl, start)
        if p_end < 0:
            continue
        p_block_end = p_end + len(paragraph_cl)

        # remove extra "\n" between p_start, p_end
        par_text = result.text[start:p_end]
        par_lines = [l for l in par_text.split('\n') if l.strip()]
        if not par_lines:
            start = p_block_end
            continue
        # if lines make a list then don't remove line breaks
        is_list = True
        list_lines = 0
        for line in par_lines:
            if self.re_list_start.match(line):
                list_lines += 1
        max_breaks_allowed = math.ceil(len(par_lines) / 3)
        if len(par_lines) - list_lines > max_breaks_allowed:
            is_list = False
        if not is_list:
            par_text = self.re_single_newline.sub(' ', par_text)
            result.text = result.text[:start] + par_text + result.text[p_end:]
        start = p_block_end

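# Worked example of the "is this paragraph a list?" heuristic above: for a
# paragraph of 5 non-empty lines of which 2 match re_list_start,
# max_breaks_allowed == ceil(5 / 3) == 2 and the remaining 5 - 2 == 3 lines
# exceed it, so the paragraph is not treated as a list and its single
# linebreaks are replaced with spaces.
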
def test_replace_by_regex_limited(self):
    text = """
    <p>Here (Improve text segmentation (section / page / paragraph / sentence),
    section 1.1 Use markup from document parser) I described Tika’s output
    in XHTML. In short:
    </p>
    """
    labels = {'p': [(7, 12), (22, 28)]}
    reg = re.compile(r'\s+')
    markup1 = MarkedUpText(text, labels={l: list(labels[l]) for l in labels})
    markup1.replace_by_regex(reg, ' ')

    markup2 = MarkedUpText(text, labels={l: list(labels[l]) for l in labels})
    markup2.replace_by_regex(reg, ' ', 0, len(text))
    self.assertEqual(markup1.text, markup2.text)

    markup2 = MarkedUpText(text, labels={l: list(labels[l]) for l in labels})
    markup2.replace_by_regex(reg, ' ', 0, len(text) >> 1)
    self.assertNotEqual(markup1.text, markup2.text)

def test_replace_by_regex_none(self):
    text = 'A text with extra spaces.'
    markup = MarkedUpText(text)
    reg = re.compile(r'AbC')
    markup.replace_by_regex(reg, ' ')
    self.assertEqual(text, markup.text)

def parse_text(self, markup: str, detect_tables: bool = True) -> MarkedUpText:
    """
    The only method to be called by external code. Transforms "markup" (an XHTML
    string) into plain text with some formatting and extra information stored
    in a MarkedUpText structure.
    :param markup: string containing XHTML
    :param detect_tables: whether or not to parse and store table (MarkedUpTable) markup information
    :return: MarkedUpText - resulting text plus paragraph, page, heading and table markup information
    """
    result = MarkedUpText('', {'pages': [], 'paragraphs': []})
    if detect_tables:
        result.tables = self.detect_tables(markup)
    cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]

    for tg in self.re_tag.finditer(markup):
        tag_text = tg.group(0)
        tag_name = self.get_tag_name(tag_text)
        tag_start, tag_end = (tg.start(), tg.end())
        if not tag_name:
            continue
        if tag_name == 'meta':
            self.process_meta(tag_text, result)
            continue
        tag_type = self.get_tag_type(tag_text)
        if tag_name == 'div' and 'class="page"' in tag_text:
            result.labels['pages'].append((len(result.text), 0))
            continue

        block_type = 'image' if tag_name == 'div' and 'class="ocr"' in tag_text \
            else 'paragraph' if tag_name == 'p' \
            else 'heading' if self.re_tag_h.match(tag_name) \
            else tag_name
        is_text_block = block_type in {'paragraph', 'heading', 'image'}

        if is_text_block and tag_type == 'o':
            if block_type == 'image' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                continue
            cur_block = TikaXhtmlParser.BlockProps(tg.end(), block_type)
            if block_type == 'paragraph':
                cur_block.label_name = 'paragraphs'
            elif block_type == 'heading':
                cur_block.label_name = 'heading_' + tag_name[1:]
            elif block_type == 'image':
                cur_block.label_name = 'images'
            continue

        if (is_text_block or block_type == 'div') and tag_type == 'c' and cur_block:
            end = tag_start
            line = markup[cur_block.start:end]
            p_start = len(result.text)
            p_end = p_start + len(line)
            result.text += line
            if not cur_block.is_inline:
                result.text += '\n\n'
            if cur_block.label_name:
                if cur_block.label_name not in result.labels:
                    result.labels[cur_block.label_name] = []
                result.labels[cur_block.label_name].append((p_start, p_end))
            # check if the block belongs to a table
            self.update_tables_content(result.tables, tag_start, tag_end, p_start, p_end)
            cur_block = None
            continue

    # turn page start offsets into (start, end) ranges
    l_pages = result.labels['pages']
    if l_pages:
        pages = []
        for i in range(len(l_pages) - 1):
            pages.append((l_pages[i][0], l_pages[i + 1][0]))
        pages.append((l_pages[-1][0], len(result.text)))
        result.labels['pages'] = pages

    self.process_inner_tags(result)
    self.post_process(result)
    self.parse_stat.parsed_text_len = result.count_non_space_chars()
    self.check_ocr_text(result)
    self.update_tables_outer_bounds(result.tables)
    return result

def parse_text(self, markup: str) -> MarkedUpText:
    """
    The only method to be called by external code. Transforms "markup" (an XHTML
    string) into plain text with some formatting and extra information stored
    in a MarkedUpText structure.
    :param markup: string containing XHTML
    :return: MarkedUpText - resulting text plus paragraph, page, heading and table markup information
    """
    result = MarkedUpText('', {'pages': [], 'paragraphs': []})
    cur_block = None  # type: Optional[TikaXhtmlParser.BlockProps]

    for tg in self.re_tag.finditer(markup):
        tag_text = tg.group(0)
        tag_name = self.get_tag_name(tag_text)
        tag_start, tag_end = (tg.start(), tg.end())
        if not tag_name:
            continue
        if tag_name == 'meta':
            self.process_meta(tag_text, result)
            continue
        tag_type = self.get_tag_type(tag_text)
        if tag_name == 'div' and 'class="page"' in tag_text:
            result.add_marker('pages', True)
            result.add_marker('pages', False)
            continue

        block_type = 'images' if tag_name == 'div' and 'class="ocr"' in tag_text \
            else 'paragraphs' if tag_name == 'p' \
            else 'heading' if self.re_tag_h.match(tag_name) \
            else 'td' if tag_name == 'th' \
            else tag_name
        is_text_block = block_type in {'paragraphs', 'heading', 'images'}  # , 'table', 'tr', 'td'

        if is_text_block and tag_type == 'o':
            if block_type == 'images' and self.settings.ocr_sets == OcrTextStoreSettings.NEVER_STORE:
                continue
            cur_block = TikaXhtmlParser.BlockProps(tg.end(), block_type)
            if block_type in {'paragraphs', 'images', 'table', 'tr', 'td'}:
                cur_block.label_name = block_type
            elif block_type == 'heading':
                cur_block.label_name = 'heading_' + tag_name[1:]
            continue

        if block_type in {'table', 'tr', 'td'}:
            result.add_marker(block_type, tag_type == 'o')

        if (is_text_block or block_type == 'div') and tag_type == 'c' and cur_block:
            end = tag_start
            line = markup[cur_block.start:end]
            p_start = len(result.text)
            p_end = p_start + len(line)
            if cur_block.label_name:
                result.add_marker(cur_block.label_name, True)
            result.text += line
            if not cur_block.is_inline:
                result.text += '\n\n'
            if cur_block.label_name:
                result.add_marker(cur_block.label_name, False)
            cur_block = None
            continue

    self.process_inner_tags(result)
    self.post_process(result)
    self.parse_stat.parsed_text_len = result.count_non_space_chars()
    self.check_ocr_text(result)
    return result

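# A minimal usage sketch, assuming TikaXhtmlParser can be constructed with
# default settings (the constructor signature is not shown in this snippet):
#
#   parser = TikaXhtmlParser()
#   result = parser.parse_text(xhtml_string)  # xhtml_string: Tika's XHTML output
#   print(result.text)                        # resulting plain text
#   print(result.labels)                      # e.g. paragraph/page/heading offsets
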