def extract_pdf(path, languages=None):
    """Extract metadata and per-page content from a PDF file.

    The document is parsed and every page is laid out through a
    ``PDFPageInterpreter`` / ``PDFPageAggregator`` pair; each page layout is
    then handed to ``_convert_page`` together with ``languages``.

    NOTE(review): ``languages`` is only forwarded to ``_convert_page``;
    presumably it selects OCR languages for image content -- confirm against
    that helper.

    :param path: filesystem path of the PDF to read.
    :param languages: passed through unchanged to ``_convert_page``.
    :returns: a dict with a ``'pages'`` list (one entry per processed page)
        plus one entry per metadata field found in the document's last info
        dictionary, keyed by the lower-cased, stripped field name. If the
        document is flagged as not extractable, ``'pages'`` stays empty.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # try/finally so the device is closed on the early return and on
        # errors as well, not only after a fully successful run.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')
            result = {'pages': []}

            if doc.info:
                # Only the last info dictionary is consulted.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # A metadata field called "pages" must not clobber the
                    # list of extracted pages.
                    if k != 'pages':
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                result['pages'].append(_convert_page(layout, languages))
            return result
        finally:
            device.close()
def extract_pdf(path, languages=None):
    """Extract metadata and per-page text from a PDF file.

    Each page is first run through the pdfminer page interpreter and the
    resulting layout is converted with ``_convert_page``. When that fails,
    or yields a result shorter than 3 characters, the page is handed to
    ``_extract_image_page`` (the OCR fallback) instead.

    :param path: filesystem path of the PDF to read.
    :param languages: forwarded to ``_extract_image_page`` for pages that
        fall back to OCR.
    :returns: ``None`` if the file has no ``/Root`` object (not a valid
        PDF); otherwise a dict with a ``"pages"`` list (one entry per page)
        plus one entry per metadata field found in the document's last info
        dictionary, keyed by the lower-cased, stripped field name. If the
        document is flagged as not extractable, ``"pages"`` stays empty.
    :raises PDFSyntaxError: for syntax errors other than a missing root
        object.
    """
    with open(path, "rb") as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # try/finally so the device is closed on the early returns and on
        # errors as well, not only after a fully successful run.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            try:
                doc = PDFDocument(parser, "")
            except PDFSyntaxError as pse:
                # str(pse) rather than pse.message: exceptions have no
                # ``message`` attribute on Python 3, which would turn this
                # handler into an AttributeError.
                if "No /Root object!" in str(pse):
                    log.info("Invalid PDF file: %r", path)
                    return None
                raise

            result = {"pages": []}
            if doc.info:
                # Only the last info dictionary is consulted.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # A metadata field called "pages" must not clobber the
                    # list of extracted pages.
                    if k != "pages":
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for page_no, page in enumerate(PDFPage.create_pages(doc), 1):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Deliberately best-effort: a page that cannot be parsed
                    # is logged and retried through OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.debug("Defaulting to OCR: %r, pg. %s", path, page_no)
                    text = _extract_image_page(path, page_no, languages)
                result["pages"].append(text)
            return result
        finally:
            device.close()
def extract_pdf(path, languages=None):
    """Extract metadata and per-page text from a PDF file.

    Each page is first run through the pdfminer page interpreter and the
    resulting layout is converted with ``_convert_page``. When that fails,
    or yields a result shorter than 3 characters, the page is handed to
    ``_extract_image_page`` (the OCR fallback) instead.

    :param path: filesystem path of the PDF to read.
    :param languages: forwarded to ``_extract_image_page`` for pages that
        fall back to OCR.
    :returns: ``None`` if the file has no ``/Root`` object (not a valid
        PDF); otherwise a dict with a ``'pages'`` list (one entry per page)
        plus one entry per metadata field found in the document's last info
        dictionary, keyed by the lower-cased, stripped field name. If the
        document is flagged as not extractable, ``'pages'`` stays empty.
    :raises PDFSyntaxError: for syntax errors other than a missing root
        object.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # try/finally so the device is closed on the early returns and on
        # errors as well, not only after a fully successful run.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            try:
                doc = PDFDocument(parser, '')
            except PDFSyntaxError as pse:
                # str(pse) rather than pse.message: exceptions have no
                # ``message`` attribute on Python 3, which would turn this
                # handler into an AttributeError.
                if 'No /Root object!' in str(pse):
                    log.info("Invalid PDF file: %r", path)
                    return None
                raise

            result = {'pages': []}
            if doc.info:
                # Only the last info dictionary is consulted.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # A metadata field called "pages" must not clobber the
                    # list of extracted pages.
                    if k != 'pages':
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for page_no, page in enumerate(PDFPage.create_pages(doc), 1):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Deliberately best-effort: a page that cannot be parsed
                    # is logged and retried through OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.debug("Defaulting to OCR: %r, pg. %s", path, page_no)
                    text = _extract_image_page(path, page_no, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()