Example #1
0
def extract_pdf(path, languages=None):
    """
    Extract metadata and per-page text from a PDF file.

    Every page is first run through pdfminer. If that raises, or yields
    fewer than three characters, the page is handed to
    ``_extract_image_page`` for OCR instead.

    :param path: file system path of the PDF; it is opened in binary mode.
    :param languages: passed through unchanged to ``_extract_image_page``.
    :returns: a dict with a ``'pages'`` list holding one entry per page,
        plus one entry per usable item of the document info dictionary
        (keys lower-cased and stripped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # The device must be closed even when document parsing or OCR
        # raises, so everything that uses it lives inside try/finally.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            doc = PDFDocument(PDFParser(fh), '')

            result = {'pages': []}
            if doc.info:
                # Only the last info dictionary of the document is used.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # A metadata key named 'pages' would overwrite the page
                    # list; unresolved object references carry no value.
                    if k == 'pages' or v is None or '<PDFObjRef:' in v:
                        continue
                    result[k] = v

            for page_no, page in enumerate(PDFPage.create_pages(doc), 1):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: a page pdfminer cannot handle falls
                    # through to OCR below instead of aborting the file.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, page_no)
                    text = _extract_image_page(path, page_no, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()
Example #2
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page cannot be parsed, or produces fewer than three
    characters, that page is sent through OCR via
    ``_extract_image_page``.

    :param path: location of the PDF on disk (opened in binary mode).
    :param languages: forwarded as-is to ``_extract_image_page``.
    :returns: dict containing ``'pages'`` (a list with one item per page)
        and the usable entries of the document's last info dictionary,
        stored under their lower-cased, stripped keys.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import closing

    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        aggregator = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # closing() guarantees device.close() runs on every exit path,
        # including exceptions raised while parsing or during OCR.
        with closing(aggregator) as device:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if doc.info:
                for key, raw in doc.info[-1].items():
                    key = key.lower().strip()
                    value = string_value(raw)
                    # Skip 'pages' so metadata cannot clobber the page
                    # list, and skip values that are missing or are still
                    # unresolved PDF object references.
                    usable = value is not None and '<PDFObjRef:' not in value
                    if key != 'pages' and usable:
                        result[key] = value

            for index, page in enumerate(PDFPage.create_pages(doc)):
                page_no = index + 1
                text = None
                try:
                    interpreter.process_page(page)
                    text = _convert_page(device.get_result(), path)
                except Exception as ex:
                    # Deliberately broad: any pdfminer failure on a page
                    # just means the page goes to OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, page_no)
                    text = _extract_image_page(path, page_no, languages)
                result['pages'].append(text)
            return result
Example #3
0
def _convert_page(interpreter, page, device, page_no, path, languages):
    """
    Extract the text of one PDF page, adding OCR output when required.

    The page is processed with ``interpreter`` and its layout is read back
    from ``device``. The text of every ``LTTextBox``/``LTTextLine`` found
    by ``_find_objects`` is collected. OCR is requested when layout
    processing raises, or when a single ``LTImage`` covers more than 70%
    of the page area. It is only actually run when the ``OCR_PDF_PAGES``
    config value is truthy.

    :param interpreter: object whose ``process_page(page)`` is called.
    :param page: the page handed to ``interpreter``.
    :param device: object whose ``get_result()`` returns the page layout.
    :param page_no: page number, used in log messages and passed to
        ``_extract_image_page``.
    :param path: path of the PDF, used in log messages and passed to
        ``_extract_image_page``.
    :param languages: passed through to ``_extract_image_page``.
    :returns: the collected text fragments joined by newlines and
        stripped; an empty string when nothing was extracted.
    """
    text_content = []
    ocr_required = False
    try:
        interpreter.process_page(page)
        layout = device.get_result()

        for text_obj in _find_objects(layout._objs, (LTTextBox, LTTextLine)):
            text = text_obj.get_text()
            if text is None:
                continue
            text = text.strip()
            if text:
                text_content.append(text)

        # Generous try/except because pdfminer's image support is
        # unreliable. A zero-area page makes the division below raise,
        # which the handler turns into an OCR request.
        page_area = float(layout.width * layout.height)
        # Go for OCR if an image makes up more than 70% of the page;
        # any() stops scanning at the first such image.
        ocr_required = any(
            float(image_obj.width * image_obj.height) / page_area > 0.7
            for image_obj in _find_objects(layout._objs, LTImage)
        )
    except Exception as ex:
        log.exception(ex)
        ocr_required = True

    if ocr_required and get_config("OCR_PDF_PAGES"):
        log.info("Using OCR for %r, p.%s", path, page_no)
        ocr_text = _extract_image_page(path, page_no, languages)
        # Skip empty or None OCR output so the join below only ever sees
        # non-empty strings.
        if ocr_text:
            text_content.append(ocr_text)

    text = "\n".join(text_content)
    log.debug("Extracted %d characters of text from %r, p.%s", len(text), path, page_no)
    return text.strip()