def pdf2pdfa(task,
             input_file_path,
             output_file_path,
             language='eng',
             output_types=('pdf',),
             logger=None,
             timeout=600):
    """
    Convert scanned PDF into searchable PDF-A or optionally into hOCR or txt
    (see output_types) by running the ``tesseract`` command-line tool.

    :param task: Task, passed through to read_output
    :param input_file_path: path of the source PDF, handed to pdf2img
    :param output_file_path: path of the resulting file; it is also given to
        tesseract as the output base name
    :param language: value of the tesseract ``-l`` option
    :param output_types: iterable of 'pdf', 'hocr', 'txt'
    :param logger: ProcessLogger; when None, tesseract stderr is not logged
    :param timeout: sec
    :return: None
    """
    # Materialize once: the collection is walked twice below, so a one-shot
    # iterable would otherwise be exhausted before the rename step.
    output_types = tuple(output_types)

    with tempfile.TemporaryDirectory() as tmp_dir:
        # pdf2img presumably renders the pages as JPEG files inside tmp_dir
        # and returns the path of a file listing them - TODO confirm.
        img_list_file_path = pdf2img(input_file_path,
                                     work_dir=tmp_dir,
                                     fmt='jpg',
                                     jpegopt={'quality': 50})

        cmd_args = ['tesseract', '-l', str(language),
                    img_list_file_path, output_file_path]
        for output_type in output_types:
            cmd_args += ['-c', f'tessedit_create_{output_type}=1']

        def err(line):
            # logger is optional (defaults to None), so guard the call
            # instead of failing on the first stderr line.
            if logger is not None:
                logger.info(f'tesseract converting {img_list_file_path} '
                            f'images into {output_file_path}:\n{line}')

        read_output(cmd_args, stderr_callback=err,
                    timeout_sec=timeout, task=task)

        # rename file.pdf.alt.pdf into file.pdf.alt
        # NOTE(review): with several output types every result is renamed to
        # the same output_file_path, so only the last one survives - confirm
        # this is what callers expect.
        for output_type in output_types:
            os.rename(output_file_path + f'.{output_type}', output_file_path)
def parse_file_local(self,
                     local_path: str,
                     original_file_name: str,
                     timeout: int = 60,
                     encoding_name: str = 'utf-8',
                     logger: ProcessLogger = None,
                     enable_ocr: bool = True) -> Dict:
    """
    Parses file calling Tika as a local process (self.tika_start_command_list
    with the ``-J -t -e<encoding>`` arguments) and passes the process output
    to _parse().

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, used in log messages only
    :param timeout: timeout in seconds, passed to read_output
    :param encoding_name: encoding to use, is passed to Tika and to read_output
    :param logger: logger object; when None, Tika stderr is not logged
    :param enable_ocr: selects TIKA_MODE_OCR (True) or TIKA_MODE_PDF_ONLY
    :return: whatever _parse() returns for the Tika output
    :raises Exception: when _parse() fails; the message carries the first
        255 characters of the Tika output
    """
    # The mode is handed to the Tika process through an environment variable.
    # Note that this mutates the environment of the current process.
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    cmd = self.tika_start_command_list + [
        '-J', '-t', f'-e{encoding_name}', local_path
    ]

    def err(line):
        # logger defaults to None: skip logging rather than raise
        # AttributeError on the first stderr line.
        if logger is not None:
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    text = read_output(cmd,
                       stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout) or ''
    try:
        # 200 is passed as the status element of the pair _parse() expects.
        return _parse((200, text))
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception(
            'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
            text_sample) from ex
def parse_file_local_xhtml(self,
                           local_path: str,
                           original_file_name: str,
                           timeout: int = 60,
                           encoding_name: str = 'utf-8',
                           logger: ProcessLogger = None,
                           enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
    plus extra formatting information plus metadata.

    Two command lines are tried in order: self.tika_default_command_list and
    then self.tika_lexnlp_default_command_list. The second one is used only
    when the first one produced no text.

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings; when None nothing is logged
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    :raises Exception: when the Tika output cannot be parsed; the message
        carries the first 255 characters of that output
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    def log_info(message):
        # logger defaults to None, so every log call goes through this guard.
        if logger is not None:
            logger.info(message)

    def err(line):
        log_info(f'TIKA parsing {original_file_name}:\n{line}')

    command_lists = [
        self.tika_default_command_list,
        self.tika_lexnlp_default_command_list,
    ]
    last_index = len(command_lists) - 1

    for cmd_index, cmd_list in enumerate(command_lists):
        cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
        # Detect the final attempt by position. Comparing the extended
        # command with the base command list can never be true, which made
        # the method fall out of the loop and return None when the last
        # attempt produced no text.
        last_try = cmd_index == last_index

        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout) or ''
        try:
            output = self.xhtml_parser.parse_text(text)
            output_len = len(output.text) if output and output.text else 0
            log_info(
                f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
            )
            # Nothing extracted: fall back to the next command line, if any.
            if not output_len and not last_try:
                continue

            output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = {
                'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
            }
            return output
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
def parse_file_local_plain_text(self,
                                local_path: str,
                                original_file_name: str,
                                task: Any,
                                timeout: int = 60,
                                encoding_name: str = 'utf-8',
                                logger: ProcessLogger = None,
                                enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will use plain text "stripper" and transform the source document into plain text
    inside its (Java) process.

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object, passed through to read_output
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings; when None nothing is logged
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    :raises Exception: when the Tika output cannot be parsed; the message
        carries the first 255 characters of that output
    """
    # don't use at all TIKA_MODE_PDF_ONLY
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
    # Both settings reach the Tika process through environment variables;
    # this mutates the environment of the current process.
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
    os.environ[self.TIKA_PARSER_DETAIL] = ''

    # The no-OCR command line is used only for an explicit enable_ocr=False
    # and only when such a command line is configured.
    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    cmd = tika_default_command_list + [
        '-J', '-t', f'-e{encoding_name}', local_path
    ]

    def log_info(message):
        # logger defaults to None, so every log call goes through this guard.
        if logger is not None:
            logger.info(message)

    def err(line):
        log_info(f'TIKA parsing {original_file_name}:\n{line}')

    log_info(f'Tika (plain text) args: {", ".join(cmd)}')

    text = read_output(cmd,
                       stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout,
                       task=task) or ''
    try:
        # 200 is passed as the status element of the pair _parse() expects.
        ptr_val = _parse((200, text))
        return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception(
            'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
            text_sample) from ex
def pdf_has_images(file_path, task, logger=None, timeout=600):
    """
    Check whether PDF file has images by running ``pdfimages -list`` on it
    and handing the captured output to process_image_list.

    :param file_path: str
    :param task: celery task, passed through to read_output
    :param logger: ProcessLogger; when None, pdfimages stderr is not logged
    :param timeout: timeout sec
    :return: bool (the result of process_image_list)
    """
    def err(line):
        # logger defaults to None: skip logging rather than raise
        # AttributeError on the first stderr line.
        if logger is not None:
            logger.info(f'pdfimages parsing {file_path}:\n{line}')

    cmd = ['pdfimages', '-list', file_path]
    # A falsy result from read_output is normalized to an empty string
    # before it is analysed.
    out = read_output(cmd, stderr_callback=err,
                      timeout_sec=timeout, task=task) or ''
    return process_image_list(out)
def parse_file_local_xhtml(self,
                           local_path: str,
                           original_file_name: str,
                           task: Any,
                           timeout: int = 60,
                           encoding_name: str = 'utf-8',
                           logger: ProcessLogger = None,
                           enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
    plus extra formatting information plus metadata.

    Two command lines are tried in order. The first is the LexNLP one, or
    the no-OCR one when enable_ocr is False and it is configured. The second
    is self.tika_default_command_list, used only when the first attempt
    produced no text.

    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object, passed through to read_output
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings; when None nothing is logged
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    :raises ValueError: when a RAM limit is set and the command line has no
        'java' element to attach the -Xmx option to
    :raises Exception: when the Tika output cannot be parsed; the message
        carries the first 255 characters of that output
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    def log_info(message):
        # logger defaults to None, so every log call goes through this guard.
        if logger is not None:
            logger.info(message)

    def err(line):
        log_info(f'TIKA parsing {original_file_name}:\n{line}')

    # The no-OCR command line is used only for an explicit enable_ocr=False
    # and only when such a command line is configured.
    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list

    parse_commands = [
        tika_default_command_list,
        self.tika_default_command_list,
    ]
    last_index = len(parse_commands) - 1

    # Imported inside the method, as in the original code.
    from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
    ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val

    for cmd_index, cmd_list in enumerate(parse_commands):
        cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
        if ram_limit:
            # Insert the heap limit right after the 'java' executable.
            java_index = cmd.index('java')
            cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'] + cmd[java_index + 1:]
        log_info(f'Tika (XHTML) args: {", ".join(cmd)}')

        last_try = cmd_index == last_index
        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''
        try:
            output = self.xhtml_parser.parse_text(text)
            output_len = output.pure_text_length if output else 0
            log_info(
                f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
            )
            # Nothing extracted: fall back to the next command line, if any.
            if not output_len and not last_try:
                continue

            output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = {
                'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
            }
            return output
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex