def test_ocr_if_less(self):
    # STORE_ALWAYS: OCR-ed image text is kept, so both images get labeled
    text = load_resource_document('parsing/xhtml_ocr_mixed.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertGreater(len(rst.text), 100)
    self.assertEqual(2, len(rst.labels['images']))

    # STORE_IF_NO_OTHER_TEXT: the document already contains vector text, so no image labels appear
    text = load_resource_document('parsing/xhtml_ocr_mixed.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertGreater(len(rst.text), 100)
    self.assertTrue('images' not in rst.labels or len(rst.labels['images']) == 0)

    # NEVER_STORE: OCR-ed text is never stored
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.NEVER_STORE,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertTrue('images' not in rst.labels or len(rst.labels['images']) == 0)
def test_complex_mixed_pdf(self):
    sets = XhtmlParsingSettings()
    sets.ocr_sets = OcrTextStoreSettings.STORE_ALWAYS
    full_text = load_resource_document('parsing/parsed_mixed_pdf.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(sets)
    markup = parser.parse_text(full_text)
    markup.convert_markers_to_labels()
    proc_text = markup.text
    self.assertEqual(-1, proc_text.find('##'))

    pages = markup.labels['pages']
    self.assertGreater(len(pages), 100)
    pages_texts = []
    for _start, end in pages:
        in_end = min(end, len(markup.text))
        in_start = max(in_end - 50, 0)
        ending = markup.text[in_start:in_end]
        pages_texts.append(ending)

    self.assertTrue('See “RATINGS” herein.' in pages_texts[0])
    self.assertTrue('optional redemption date of November 15, 2027.' in pages_texts[1])
    self.assertTrue('by the IRS.' in pages_texts[54])
def test_process_inner_tags(self):
    text = """
    <p>The pen employed in finishing her story, and making it what you now see it to be,
    has had no little difficulty to put it into a dress fit to be seen, and to make it
    speak language fit to be read. When a woman debauched from her youth, nay, even being
    the offspring of debauchery and vice, comes to give an account of all her vicious
    practices, and even to descend to the particular occasions and circumstances by which
    she ran through in threescore years, an author must be hard put to it wrap it up so
    clean as not to give room, especially for vicious readers, to turn it to his disadvantage.
    <a href="#">This page{{##PGPG##}} ends with semibold text</a>. It is suggested there
    cannot be the same life, the same brightness and</p>
    <p> beauty, in relating the penitent part as is in the criminal part. If there is any
    truth in that suggestion, I must be allowed to say ’tis because there is not the same
    taste and relish in the reading, and indeed it is too true that the difference lies not
    in the real worth of the subject so much as in the gust and palate of the reader.</p>
    """
    parser = TikaXhtmlParser()
    markup = parser.parse_text(text)
    markup.convert_markers_to_labels()
    proc_text = markup.text
    self.assertEqual(-1, proc_text.find('##'))

    pages = markup.labels['pages']
    self.assertEqual(1, len(pages))
    last_page_text = proc_text[pages[0][0] - 30:pages[0][0]].strip()
    self.assertTrue(last_page_text.endswith('This page'))

    paragraphs = markup.labels['paragraphs']
    p_text = proc_text[paragraphs[0][0]:paragraphs[0][1]].strip()
    self.assertTrue(p_text.endswith('the same brightness and'))
def test_parse_recursive_tables(self):
    raw = """
    <table>
        <tr><td><p>Cell 1.1</p></td><td><p>Cell 1.2</p></td></tr>
        <tr>
            <td><p>Cell 2.1</p></td>
            <td>
                <p>
                    <table>
                        <tr><th><p>InCell 1.1</p></th><th><p>InCell 1.2</p></th></tr>
                        <tr><td><p>InCell 2.1</p></td><td><p></p></td></tr>
                    </table>
                </p>
            </td>
        </tr>
        <tr><td><p>Cell 3.1</p></td><td><p>Cell 3.2</p></td></tr>
    </table>
    """
    parser = TikaXhtmlParser()
    rst = parser.parse_text(raw)
    rst.convert_markers_to_labels()

    self.assertEqual(2, len(rst.tables))
    rst.tables.sort(key=lambda t: t.start)

    table_df = rst.tables[1].serialize_in_dataframe(rst.text)
    self.assertEqual('InCell 1.1', table_df.loc[0, 0].strip())
    self.assertEqual('InCell 2.1', table_df.loc[1, 0].strip())
    self.assertTrue(not table_df.loc[1, 1].strip())

    table_df = rst.tables[0].serialize_in_dataframe(rst.text)
    self.assertEqual('Cell 1.1', table_df.loc[0, 0].strip())
    self.assertEqual('Cell 3.2', table_df.loc[2, 1].strip())
def test_parse_table(self):
    raw = """
    <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta name="dc:publisher" content=""/>
    <title/>
    </head>
    <body><p>What is Lorem Ipsum?</p>
    <p><b>Lorem Ipsum</b> is simply dummy text of the printing and typesetting industry.
    Lorem Ipsum has been the industry.</p>
    <p/>
    <table><tbody><tr>
    <td><p>Row 1, column 1</p>
    </td>
    <td><p>Row 1, column 2</p>
    </td>
    <td><p>Row 1, column 3</p>
    </td></tr>
    <tr>
    <td><p>Row 2, column 1</p>
    </td>
    <td><p>Row 2, column 2</p>
    </td>
    <td><p>Row 2, column 3</p>
    </td></tr>
    <tr>
    <td><p>Row 3, column 1</p>
    </td>
    <td><p>Row 3, column 2</p>
    </td>
    <td><p>Row 3, column 3</p>
    </td></tr>
    <tr>
    <td><p>Row 4, column 1</p>
    </td>
    <td><p>Row 4, column 2</p>
    </td>
    <td><p>Row 4, column 3</p>
    </td></tr>
    </tbody></table>
    <p/>
    <h2>Where does it come from?</h2>
    <p class="normal_(Web)">Contrary to popular belief, Lorem Ipsum is not simply random text.</p>
    <p/>
    <table><tbody><tr>
    <td><p>r1c1: Contrary to popular belief, Lorem Ipsum is not simply random text.</p>
    </td>
    <td><p/>
    </td>
    <td><p/>
    </td></tr>
    <tr>
    <td><p/>
    </td>
    <td><p/>
    </td>
    <td><p class="normal_(Web)"><a name="_GoBack"/>r2c3: The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.</p>
    </td></tr>
    </tbody></table>
    <p/>
    </body></html>
    """
    parser = TikaXhtmlParser()
    rst = parser.parse_text(raw, detect_tables=True)
    self.assertGreater(len(rst.text), 100)
    self.assertGreater(len(rst.labels['paragraphs']), 1)
    self.assertEqual(2, len(rst.tables))

    table_df = rst.tables[0].serialize_in_dataframe(rst.text)
    for i_row, row in table_df.iterrows():
        for i_cell in range(len(row)):
            target_str = f'Row {i_row + 1}, column {i_cell + 1}'
            self.assertEqual(target_str, row[i_cell])

    table_df = rst.tables[1].serialize_in_dataframe(rst.text)
    cell_text = table_df.loc[1, 2]
    self.assertEqual('{_GoBack} r2c3: The first line of Lorem Ipsum, "Lorem ' +
                     'ipsum dolor sit amet..", comes from a line in section 1.10.32.\n\n',
                     cell_text)
def test_parse_vector_pdf(self):
    text = load_resource_document('parsing/xhtml_pdf.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser()
    rst = parser.parse_text(text)
    self.assertGreater(len(rst.text), 100)
    self.assertGreater(len(rst.labels['pages']), 1)
    self.assertGreater(len(rst.labels['paragraphs']), 5)
def test_ocr_empty_images(self):
    text = load_resource_document('parsing/xhtml_ocr_emptyimages.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertEqual(len(rst.text), rst.markers_extra_text_length)
def test_ocr_little_text_scanned(self):
    text = load_resource_document('parsing/xhtml_ocr_mixed_long.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertGreater(len(rst.text), 100)
    self.assertEqual(2, len(rst.labels['images']))
    len_with_ocred = len(rst.text)

    text = load_resource_document('parsing/xhtml_ocr_mixed_short.xhtml', encoding='utf-8')
    parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100))
    rst = parser.parse_text(text)
    self.assertTrue('images' not in rst.labels or len(rst.labels['images']) == 0)
    len_wo_ocred = len(rst.text)

    self.assertGreater(len_with_ocred - len_wo_ocred, 100)
def test_list_parsing(self):
    raw = """
    <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta name="pdf:PDFVersion" content="1.4"/>
    <title>Sample Docx with Image.docx</title>
    </head>
    <body><div class="page"><p/>
    <p>Explore XHTML Tika’s output </p>
    <p>JIRA ticket: https://lexpredict.atlassian.net/browse/CS-3966 </p>
    <p>Here (Improve text segmentation (section / page / paragraph / sentence), section 1.1
    Use markup from document parser) I described Tika’s output in XHTML. In short: </p>
    <p>● Tika uses PdfBox for “vector” files, MS Word and OpenOffice files
    ● and Tesseract for scanned files
    ● in both cases Tika returns valid XHTML
    ● XHTML contains almost all information on document’s structure that Tika can get </p>
    <p>see the aforementioned document, section 1.2 Verdict on using Tika markup for segmenting text. </p>
    <p>I’ve implemented a parser that reads Tika’s output in XHTML and extracts:
    1. plain text with or without extra line breaks inside paragraphs, with hyperlinks </p>
    <p> This paragraph contains text with extra line breaks that should have been deleted
    because the text is not formatted as a list. This paragraph contains text with extra
    line breaks that should have been deleted because the text is not formatted as a list.
    This paragraph contains text with extra line breaks that should have been deleted
    because the text is not formatted as a list. This paragraph contains text with extra
    line breaks that should have been deleted because the text is not formatted as a list. </p>
    <p>formatted
    2. paragraphs’ coordinates
    3. pages’ coordinates
    4. headings
    5. tables as Pandas dataframes + anchors to the source text </p>
    </div>
    </body></html>
    """
    parser = TikaXhtmlParser()
    rst = parser.parse_text(raw)
    rst.convert_markers_to_labels()
    self.assertGreater(len(rst.text), 100)
    self.assertGreater(len(rst.labels['paragraphs']), 1)
def test_parse_headings(self):
    raw = """
    <?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta name="date" content="2019-08-08T15:35:00Z"/>
    <title/>
    </head>
    <body><h1>1. Heading One</h1>
    <p class="list_Paragraph"/>
    <p class="list_Paragraph">Contrary to popular belief, Lorem Ipsum is not simply random text.
    It has roots in a piece of classical Latin literature from > 45 BC, making it over 2000 years old.
    This book is a treatise on the theory of ethics, very popular during the Renaissance.
    The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..",
    <a href="en.wikipedia.org/%20s%20s">comes from a line</a> in section 1.10.32.</p>
    <p class="list_Paragraph"/>
    <p class="list_Paragraph">The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for</p>
    <p class="list_Paragraph"/>
    <h2>1.1 Heading One One</h2>
    <p class="list_Paragraph">Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum & et Malorum" by Cicero are also</p>
    <p class="list_Paragraph"/>
    <h2>1.2 Heading one two</h2>
    <h1>2. <a name="_GoBack"/>Heading 2</h1>
    </body></html>
    """
    parser = TikaXhtmlParser()
    rst = parser.parse_text(raw)
    rst.convert_markers_to_labels()
    self.assertGreater(len(rst.text), 100)
    self.assertGreater(len(rst.labels['paragraphs']), 1)
    self.assertGreater(len(rst.labels['heading_1']), 1)
    self.assertGreater(len(rst.labels['heading_2']), 1)

    headings = [rst.text[h_s:h_e] for h_s, h_e in rst.labels['heading_1']]
    self.assertEqual('1. Heading One', headings[0].strip(' \n'))
    self.assertEqual('2. {_GoBack} Heading 2', headings[1].strip(' \n'))
    self.assertGreater(len(rst.labels['a']), 0)

    sections = rst.find_sections()
    self.assertGreater(len(sections), 1)
    self.assertTrue("de Finibus Bonorum & et Malorum" in rst.text)
class TikaParsingWrapper:
    """
    Parses a file (provided by path) either through Tika's local JAR file or by calling
    a Tika server. Can process Tika's XHTML or plain text output.
    """
    # flag that defines how Tika parses the passed file
    TIKA_URL_FLAG_MODE = 'pdf-parse'
    # the same flag as an environment variable
    TIKA_ENV_VAR_FLAG_MODE = 'LEXNLP_TIKA_PARSER_MODE'
    # flag's value - parse both PDF text and scanned images (OCR)
    TIKA_MODE_OCR = 'pdf_ocr'
    # flag's value - parse PDF text only, skip OCR
    TIKA_MODE_PDF_ONLY = 'pdf_only'

    def __init__(self):
        self.xhtml_parser = TikaXhtmlParser(pars_settings=XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
            remove_extra_newlines=False))
        self.tika_files_path = tempfile.gettempdir()
        self.tika_jar_path = tempfile.gettempdir()

        from django.conf import settings
        jar_base_path = settings.JAR_BASE_PATH
        tika_cls_name = 'org.apache.tika.cli.TikaCLI'
        tika_cp = ':'.join([os.path.join(jar_base_path, jar) for jar in settings.TIKA_JARS])
        self.tika_default_command_list = [
            'java',
            '-cp', tika_cp,
            '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider',
            tika_cls_name]
        self.tika_lexnlp_default_command_list = self.tika_default_command_list[:]

        from apps.task.app_vars import TIKA_CUSTOM_CONFIG, TIKA_NOOCR_CUSTOM_CONFIG, TIKA_LEXNLP_CUSTOM_CONFIG
        # optional Tika config that disables OCR
        custom_noocr_tika_config = TIKA_NOOCR_CUSTOM_CONFIG.val
        self.tika_noocr_default_command_list = None
        if custom_noocr_tika_config:
            conf_full_path = os.path.join(jar_base_path, custom_noocr_tika_config)
            self.tika_noocr_default_command_list = self.tika_default_command_list + [
                f'--config={conf_full_path}']

        custom_tika_config = TIKA_CUSTOM_CONFIG.val
        if custom_tika_config:
            conf_full_path = os.path.join(jar_base_path, custom_tika_config)
            self.tika_default_command_list += [f'--config={conf_full_path}']

        # LexNLP (plugin) Tika config path
        custom_lexp_tika_config = TIKA_LEXNLP_CUSTOM_CONFIG.val
        if custom_lexp_tika_config:
            conf_full_path = os.path.join(jar_base_path, custom_lexp_tika_config)
            self.tika_lexnlp_default_command_list += [f'--config={conf_full_path}']

    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a local Java process.
        Tika uses its plain text "stripper" and transforms the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. a temporary file path)
        :param timeout: timeout for interrupting the Java process, in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + ['-J', '-t', f'-e{encoding_name}', local_path]

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

        text = read_output(cmd, stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''
        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception('Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                            text_sample) from ex

    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               task: Any,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a local Java process.
        Tika returns XHTML, and TikaXhtmlParser then parses the XHTML into plain text plus
        extra formatting information plus metadata.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. a temporary file path)
        :param timeout: timeout for interrupting the Java process, in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        # try the LexNLP-specific command first, then fall back to the default Tika command
        parse_commands = [tika_default_command_list, self.tika_default_command_list]

        from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
        ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val

        for cmd_index in range(len(parse_commands)):
            cmd_list = parse_commands[cmd_index]
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
            if ram_limit:
                java_index = cmd.index('java')
                cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'] + cmd[java_index + 1:]
            logger.info(f'Tika (XHTML) args: {", ".join(cmd)}')
            last_try = cmd_index == len(parse_commands) - 1

            text = read_output(cmd, stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout,
                               task=task) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = output.pure_text_length if output else 0
                logger.info(f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}')
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(text, str) else str(text)
                raise Exception('Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                                text_sample) from ex

    def parse_file_on_server(self,
                             option: str,
                             url_or_path: str,
                             server_endpoint: str = None,
                             enable_ocr: bool = True) -> Dict:
        """
        Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a server.
        Tika returns plain text.
        :param option: command line options to send to Tika's server
        :param url_or_path: local path (or URL) to the file being parsed
        :param server_endpoint: Tika server's URL
        :param enable_ocr: allow (True) converting images to text
        :return: dictionary with "content" (text) and "metadata" keys
        """
        parse_mode = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        return self.parse(option, url_or_path, server_endpoint,
                          extra_headers={'pdf-parse': parse_mode})

    def parse(self,
              option: str,
              url_or_path: str,
              server_endpoint: str = None,
              verbose: int = 0,
              tika_server_jar: str = None,
              response_mime_type: str = 'application/json',
              services: dict = None,
              raw_response: bool = False,
              extra_headers: Dict[str, str] = None) -> Dict:
        """
        Called from parse_file_on_server to parse the file by calling Tika as a server.
        :param option: command line options to send to Tika's server
        :param url_or_path: local path (or URL) to the file being parsed
        :param server_endpoint: Tika server's URL
        :param verbose: make Tika produce a verbose log
        :param tika_server_jar: path to Tika's JAR file
        :param response_mime_type: response format (application/json) for plain text + metadata in JSON format
        :param services: mapping of option names ('meta', 'text', 'all') to Tika server endpoints
        :param raw_response: get raw response from Tika (text + metadata + warnings), False by default
        :param extra_headers: extra request headers
        :return: dictionary with "content" (text) and "metadata" (another dictionary) keys
        """
        services = services if services else \
            {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
        server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

        path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
        service = services.get(option, services['all'])
        if service == '/tika':
            response_mime_type = 'text/plain'
        content_path = self.make_content_disposition_header(path)

        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': content_path
        }
        if extra_headers:
            headers = {**headers, **extra_headers}
        status, response = callServer('put', server_endpoint, service,
                                      open(path, 'rb'),
                                      headers, verbose, tika_server_jar,
                                      rawResponse=raw_response)

        if file_type == 'remote':
            os.unlink(path)
        return _parse((status, response))

    def make_content_disposition_header(self, fn):
        return 'attachment; filename=%s' % os.path.basename(fn)
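
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# The helper below shows how TikaParsingWrapper might be driven end to end for a single
# file: build the wrapper, run the local XHTML parse, and read back the plain text and
# the parsing statistics stored under Document.DocumentMetadataKey.KEY_PARSING_STATISTICS.
# The function name, the None task object and the timeout value are hypothetical; real
# callers pass the task that owns the parse and their own ProcessLogger instance.
def _example_parse_with_local_tika(file_path: str, logger: ProcessLogger) -> str:
    """Sketch only: parse one file with the local Tika JAR and return its plain text."""
    wrapper = TikaParsingWrapper()
    marked_up = wrapper.parse_file_local_xhtml(
        local_path=file_path,
        original_file_name=os.path.basename(file_path),
        task=None,              # assumption: real callers pass the task driving the parse
        timeout=120,
        encoding_name='utf-8',
        logger=logger,
        enable_ocr=True)
    stats = marked_up.meta.get(Document.DocumentMetadataKey.KEY_PARSING_STATISTICS)
    logger.info(f'Parsed {file_path}: {len(marked_up.text)} characters, stats: {stats}')
    return marked_up.text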