Python _extract_image_page Examples

Programming Language: Python

Namespace/Package Name: aleph.ingest.tesseract

Method/Function: _extract_image_page

Examples at hotexamples.com: 3

Python _extract_image_page - 3 examples found. These are the top-rated real-world Python examples of aleph.ingest.tesseract._extract_image_page, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page yields no text (or fewer than three characters),
    that page is sent through OCR instead.

    :param path: file system path of the PDF; opened in binary mode.
    :param languages: passed through unchanged to the OCR helper.
    :returns: a dict with a ``pages`` list holding one entry per page,
        plus one key for each usable item of the last document info
        dictionary (keys are lower-cased and stripped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Close the device even if parsing, metadata handling or OCR raises;
        # previously it was only closed on the success path.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if len(doc.info):
                # Only the last info dictionary is used for metadata.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # 'pages' is reserved for the page list built below, and
                    # unresolved object references carry no useful text.
                    if k != 'pages' and v is not None and '<PDFObjRef:' not in v:
                        # NOTE(review): v has already been through
                        # string_value above; the second call looks
                        # redundant - confirm it is idempotent before
                        # removing it.
                        result[k] = string_value(v)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: a broken page falls through to OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    # Page numbers are 1-based for logging and for OCR.
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()

Example #2

Show file

File: pdf.py Project: nivertech/aleph

def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page yields no text (or fewer than three characters),
    that page is sent through OCR instead.

    :param path: file system path of the PDF; opened in binary mode.
    :param languages: passed through unchanged to the OCR helper.
    :returns: a dict with a ``pages`` list holding one entry per page,
        plus one key for each usable item of the last document info
        dictionary (keys are lower-cased and stripped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Close the device even if parsing, metadata handling or OCR raises;
        # previously it was only closed on the success path.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if len(doc.info):
                # Only the last info dictionary is used for metadata.
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # 'pages' is reserved for the page list built below, and
                    # unresolved object references carry no useful text.
                    if k != 'pages' and v is not None and '<PDFObjRef:' not in v:
                        # NOTE(review): v has already been through
                        # string_value above; the second call looks
                        # redundant - confirm it is idempotent before
                        # removing it.
                        result[k] = string_value(v)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: a broken page falls through to OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    # Page numbers are 1-based for logging and for OCR.
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()

Example #3

Show file

File: pdf.py Project: CodeForAfrica/aleph

def _convert_page(interpreter, page, device, page_no, path, languages):
    """
    Extract the text of a single PDF page, adding OCR output when needed.

    Text is gathered from the text box and text line objects of the page
    layout. OCR is requested when a single image covers more than 70% of
    the page area, or when processing the page raises an exception. It
    only actually runs if the ``OCR_PDF_PAGES`` config option is truthy.

    :param interpreter: object whose ``process_page`` is called with *page*.
    :param page: the page to process.
    :param device: object whose ``get_result()`` provides the page layout.
    :param page_no: page number, used in log messages and passed to OCR.
    :param path: path of the PDF, used in log messages and passed to OCR.
    :param languages: passed through unchanged to the OCR helper.
    :returns: the collected text fragments joined by newlines and
        stripped; an empty string if nothing was found.
    """
    text_content = []
    ocr_required = False
    try:
        interpreter.process_page(page)
        layout = device.get_result()

        for text_obj in _find_objects(layout._objs, (LTTextBox, LTTextLine)):
            text = text_obj.get_text()
            if text is None:
                continue
            text = text.strip()
            if text:
                text_content.append(text)

        # Generous try/except because pdfminer's image support is
        # unreliable; any failure here (including a zero-area page) falls
        # back to OCR via the handler below.
        page_area = float(layout.width * layout.height)
        for image_obj in _find_objects(layout._objs, LTImage):
            image_area = float(image_obj.width * image_obj.height)
            # Go for OCR if an image makes up more than 70% of the page.
            if image_area / page_area > 0.7:
                ocr_required = True
                # One large image is enough; no need to inspect the rest.
                break

    except Exception as ex:
        log.exception(ex)
        ocr_required = True

    if ocr_required and get_config("OCR_PDF_PAGES"):
        log.info("Using OCR for %r, p.%s", path, page_no)
        ocr_text = _extract_image_page(path, page_no, languages)
        # Skip a missing or empty OCR result: joining a None entry below
        # would raise TypeError and discard the text already collected.
        if ocr_text:
            text_content.append(ocr_text)

    text = "\n".join(text_content)
    log.debug("Extracted %d characters of text from %r, p.%s", len(text), path, page_no)
    return text.strip()