Esempio n. 1
0
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Pull the text out of a PDF, falling back to OCR when a callback is given.

    ``pdftotext`` is run on ``path`` (layout preserved, UTF-8 output). If it
    yields nothing but whitespace and ``callback`` is set, the PDF is assumed
    to be image-based and ``callback(path)`` is asked to OCR it. In every
    other case, output that lacks the letter "e" is treated as the mojibake
    that ca9 PDFs sometimes contain and is run through ``fix_mojibake``.

    :param doc: Object that gets ``extracted_by_ocr = True`` when OCR succeeds.
    :param path: Path of the PDF handed to ``pdftotext``.
    :param DEVNULL: Passed to ``Popen`` as the ``stderr`` target.
    :param callback: Optional callable taking ``path`` and returning a
        ``(success, content)`` pair.
    :return: ``(doc, content, err)``, where ``err`` is the stderr item
        reported by ``communicate()``.
    """
    command = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
    pdftotext = subprocess.Popen(
        command,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL
    )
    content, err = pdftotext.communicate()

    wants_ocr = content.strip() == '' and callback
    if not wants_ocr:
        if 'e' not in content:
            # No "e" anywhere: treat it as a corrupt PDF from ca9 and repair.
            content = fix_mojibake(unicode(content, 'utf-8', errors='ignore'))
        return doc, content, err

    # Probably an image PDF, so hand it to OCR. N.B.: do NOT run this as a
    # subtask unless you are very careful. In the worst case that causes a
    # total celery deadlock: this task spawns another task, no worker may be
    # free to consume it, and so this task never finishes.
    ocr_ok, content = callback(path)
    if ocr_ok:
        doc.extracted_by_ocr = True
    else:
        # OCR failed; whatever it returned is replaced by a fixed message.
        content = 'Unable to extract document content.'
    return doc, content, err
Esempio n. 2
0
def extract_from_pdf(
    path: str,
    opinion: Opinion,
    ocr_available: bool = False,
) -> ExtractProcessResult:
    """Extract text from a PDF.

    Start with pdftotext. If OCR is enabled and ``ocr_needed`` says so for
    this PDF and its pdftotext output, also run OCR and keep whichever text
    is longer. This pattern exists because PDFs can be image-based,
    text-based, or a mix of the two.

    If OCR is not enabled, text without the letter "e" is treated as a
    corrupt PDF from ca9 and passed through ``fix_mojibake``.

    ``opinion.extracted_by_ocr`` is set to True only when the returned
    content is the OCR text, so the flag describes what is actually returned.

    :param path: The path to the PDF
    :param opinion: The Opinion associated with the PDF
    :param ocr_available: Whether we should do OCR stuff
    :return: Tuple of the content itself and the decoded stderr output
        (None when the process reported no stderr)
    """
    process = make_pdftotext_process(path)
    content, err = process.communicate()
    content = content.decode()
    if err is not None:
        err = err.decode()

    if not ocr_available:
        if "e" not in content:
            # It's a corrupt PDF from ca9. Fix it.
            content = fix_mojibake(content)
        return content, err

    if not ocr_needed(path, content):
        return content, err

    success, ocr_content = extract_by_ocr(path)
    if not success:
        # NOTE(review): this also discards any text pdftotext did produce
        # when OCR fails (same as before) -- confirm that is intended.
        content = "Unable to extract document content."
    elif len(ocr_content) > len(content):
        # Take the longer of the two, and flag the opinion only when the
        # OCR text is the one being returned.
        content = ocr_content
        opinion.extracted_by_ocr = True

    return content, err
Esempio n. 3
0
def extract_from_pdf(path, opinion, do_ocr=False):
    """Return the text of a PDF together with pdftotext's stderr item.

    pdftotext runs first. When its decoded output is blank and ``do_ocr`` is
    set, the PDF is assumed to be image-based and ``extract_by_ocr`` is used
    instead; ``opinion.extracted_by_ocr`` is set to True if that succeeds.
    Otherwise, text without the letter "e" is treated as the mojibake that
    ca9 sometimes creates and is passed through ``fix_mojibake``.
    """
    raw_output, err = make_pdftotext_process(path).communicate()
    content = raw_output.decode()

    if do_ocr and content.strip() == "":
        ocr_ok, content = extract_by_ocr(path)
        if ocr_ok:
            opinion.extracted_by_ocr = True
        else:
            # OCR failed; its output is replaced by a fixed message.
            content = "Unable to extract document content."
        return content, err

    if "e" not in content:
        # No "e" anywhere: a corrupt PDF from ca9, so repair it.
        content = fix_mojibake(content)
    return content, err
Esempio n. 4
0
def extract_from_pdf(path, opinion, do_ocr=False):
    """Return the text of a PDF together with pdftotext's stderr item.

    pdftotext is tried first. If it yields only whitespace and ``do_ocr`` is
    set, the PDF is assumed to be image-based and ``extract_by_ocr`` takes
    over, with ``opinion.extracted_by_ocr`` set to True on success. In all
    other cases, output lacking the letter "e" is treated as the mojibake
    that ca9 sometimes creates and is repaired with ``fix_mojibake``.
    """
    pdftotext = make_pdftotext_process(path)
    content, err = pdftotext.communicate()

    should_ocr = content.strip() == '' and do_ocr
    if should_ocr:
        ocr_ok, content = extract_by_ocr(path)
        if ocr_ok:
            opinion.extracted_by_ocr = True
        else:
            # OCR failed; its output is replaced by a fixed message.
            content = 'Unable to extract document content.'
    elif 'e' not in content:
        # A corrupt PDF from ca9: decode, dropping undecodable bytes, and fix.
        decoded = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(decoded)

    return content, err