Esempio n. 1
0
def extract_recap_pdf(pk, skip_ocr=False):
    """Extract the text of a single RECAP PDF and store it on the document.

    The document's local file is first run through the process built by
    ``make_pdftotext_process``. If ``needs_ocr`` reports that the resulting
    text is insufficient, the file is handed to ``extract_by_ocr`` unless
    ``skip_ocr`` is set. The document's ``ocr_status`` records which of
    these paths was taken, and the (anonymized) text is saved in
    ``plain_text``.

    :param pk: Primary key of the ``RECAPDocument`` to process.
    :param skip_ocr: When true, never run OCR; a document that needs it is
        saved with empty text and marked ``OCR_NEEDED`` instead.
    :return: The ``filepath_local.path`` of the processed document.
    """
    doc = RECAPDocument.objects.get(pk=pk)
    path = doc.filepath_local.path
    # Only the first item of the communicate() result is used.
    content, _ = make_pdftotext_process(path).communicate()

    if not needs_ocr(content):
        doc.ocr_status = RECAPDocument.OCR_UNNECESSARY
    elif skip_ocr:
        # OCR is required but the caller asked us not to run it now.
        content = u""
        doc.ocr_status = RECAPDocument.OCR_NEEDED
    else:
        # Probably an image PDF. Send it to OCR.
        success, content = extract_by_ocr(path)
        if success:
            doc.ocr_status = RECAPDocument.OCR_COMPLETE
        else:
            content = u"Unable to extract document content."
            doc.ocr_status = RECAPDocument.OCR_FAILED

    doc.plain_text, _ = anonymize(content)
    doc.save()

    return path
Esempio n. 2
0
def extract_recap_pdf(
    pks: Union[int, List[int]],
    skip_ocr: bool = False,
    check_if_needed: bool = True,
) -> List[int]:
    """Extract the contents from one or more RECAP PDFs if necessary.

    Each document's stored file is copied into a temporary ``.pdf`` file,
    which is run through the process built by ``make_pdftotext_process``.
    If ``needs_ocr`` reports the decoded text is insufficient, the temporary
    file is handed to ``extract_by_ocr`` unless ``skip_ocr`` is set. The
    document's ``ocr_status`` records which path was taken and the
    anonymized text is saved in ``plain_text``.

    :param pks: A single primary key or an iterable of primary keys of
        ``RECAPDocument`` objects.
    :param skip_ocr: When true, never run OCR; a document that needs it is
        saved with empty text and marked ``OCR_NEEDED`` instead.
    :param check_if_needed: When true, documents whose ``needs_extraction``
        is falsy are skipped (but still reported as processed).
    :return: The primary keys that were handled, including skipped ones,
        in the order they were given.
    """
    id_list = pks if is_iter(pks) else [pks]

    handled = []
    for doc_id in id_list:
        rd = RECAPDocument.objects.get(pk=doc_id)
        # needs_extraction is only consulted when the check is enabled.
        extraction_wanted = not check_if_needed or rd.needs_extraction
        if not extraction_wanted:
            # Early abort: nothing to do for this item.
            handled.append(doc_id)
            continue

        tmp_options = {
            "prefix": "extract_file_",
            "suffix": ".pdf",
            # Unbuffered, so the bytes are on disk when the tools read it.
            "buffering": 0,
        }
        with NamedTemporaryFile(**tmp_options) as tmp:
            tmp.write(rd.filepath_local.read())
            raw_output, _ = make_pdftotext_process(tmp.name).communicate()
            content = raw_output.decode()

            if not needs_ocr(content):
                rd.ocr_status = RECAPDocument.OCR_UNNECESSARY
            elif skip_ocr:
                # OCR is required but the caller asked us not to run it.
                content = ""
                rd.ocr_status = RECAPDocument.OCR_NEEDED
            else:
                # Probably an image PDF. Send it to OCR.
                success, content = extract_by_ocr(tmp.name)
                if success:
                    rd.ocr_status = RECAPDocument.OCR_COMPLETE
                else:
                    content = "Unable to extract document content."
                    rd.ocr_status = RECAPDocument.OCR_FAILED

        rd.plain_text, _ = anonymize(content)
        # Indexing is deliberately left out here: doing it at this point
        # creates a race condition in celery.
        rd.save(index=False, do_extraction=False)
        handled.append(doc_id)

    return handled
Esempio n. 3
0
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from one or more RECAP PDFs if necessary.

    Each document's local file is run through the process built by
    ``make_pdftotext_process`` and the output is decoded to text. If
    ``needs_ocr`` reports the text is insufficient, the file is handed to
    ``extract_by_ocr`` unless ``skip_ocr`` is set. The document's
    ``ocr_status`` records which path was taken and the anonymized text is
    saved in ``plain_text``.

    :param pks: A single primary key or an iterable of primary keys of
        ``RECAPDocument`` objects.
    :param skip_ocr: When true, never run OCR; a document that needs it is
        saved with empty text and marked ``OCR_NEEDED`` instead.
    :param check_if_needed: When true, documents whose ``needs_extraction``
        is falsy are skipped (but still reported as processed).
    :return: The primary keys that were handled, including skipped ones,
        in the order they were given.
    """
    id_list = pks if is_iter(pks) else [pks]

    handled = []
    for doc_id in id_list:
        rd = RECAPDocument.objects.get(pk=doc_id)
        # needs_extraction is only consulted when the check is enabled.
        extraction_wanted = not check_if_needed or rd.needs_extraction
        if not extraction_wanted:
            # Early abort: nothing to do for this item.
            handled.append(doc_id)
            continue

        pdf_path = rd.filepath_local.path
        raw_output, _ = make_pdftotext_process(pdf_path).communicate()
        content = raw_output.decode()

        if not needs_ocr(content):
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY
        elif skip_ocr:
            # OCR is required but the caller asked us not to run it.
            content = ""
            rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            # Probably an image PDF. Send it to OCR.
            success, content = extract_by_ocr(pdf_path)
            if success:
                rd.ocr_status = RECAPDocument.OCR_COMPLETE
            else:
                content = "Unable to extract document content."
                rd.ocr_status = RECAPDocument.OCR_FAILED

        rd.plain_text, _ = anonymize(content)
        # Indexing is deliberately left out here: doing it at this point
        # creates a race condition in celery.
        rd.save(index=False, do_extraction=False)
        handled.append(doc_id)

    return handled
Esempio n. 4
0
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from one or more RECAP PDFs if necessary.

    Each document's local file is run through the process built by
    ``make_pdftotext_process``. If ``needs_ocr`` reports the output is
    insufficient, the file is handed to ``extract_by_ocr`` unless
    ``skip_ocr`` is set. The document's ``ocr_status`` records which path
    was taken and the anonymized text is saved in ``plain_text``.

    :param pks: A single primary key or an iterable of primary keys of
        ``RECAPDocument`` objects.
    :param skip_ocr: When true, never run OCR; a document that needs it is
        saved with empty text and marked ``OCR_NEEDED`` instead.
    :param check_if_needed: When true, documents whose ``needs_extraction``
        is falsy are skipped (but still reported as processed).
    :return: The primary keys that were handled, including skipped ones,
        in the order they were given.
    """
    id_list = pks if is_iter(pks) else [pks]

    handled = []
    for doc_id in id_list:
        rd = RECAPDocument.objects.get(pk=doc_id)
        # needs_extraction is only consulted when the check is enabled.
        extraction_wanted = not check_if_needed or rd.needs_extraction
        if not extraction_wanted:
            # Early abort: nothing to do for this item.
            handled.append(doc_id)
            continue

        pdf_path = rd.filepath_local.path
        # Only the first item of the communicate() result is used.
        content, _ = make_pdftotext_process(pdf_path).communicate()

        if not needs_ocr(content):
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY
        elif skip_ocr:
            # OCR is required but the caller asked us not to run it.
            content = u''
            rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            # Probably an image PDF. Send it to OCR.
            success, content = extract_by_ocr(pdf_path)
            if success:
                rd.ocr_status = RECAPDocument.OCR_COMPLETE
            else:
                content = u'Unable to extract document content.'
                rd.ocr_status = RECAPDocument.OCR_FAILED

        rd.plain_text, _ = anonymize(content)
        # Indexing is deliberately left out here: doing it at this point
        # creates a race condition in celery.
        rd.save(index=False, do_extraction=False)
        handled.append(doc_id)

    return handled