예제 #1
0
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Pull the text out of a PDF, falling back to OCR when it has none.

    pdftotext is run on ``path`` first. When it yields only whitespace and a
    ``callback`` was supplied, the PDF is assumed to be image-based and the
    callback is dispatched (via ``subtask(callback).delay(path)``) to OCR it;
    this call blocks until that result is available. Otherwise, text that
    contains no letter "e" at all is treated as the mojibake that ca9
    sometimes creates and is passed through ``fix_mojibake``.

    :param doc: the document object; ``extracted_by_ocr`` is set to True on
        it when the OCR fallback succeeds.
    :param path: location of the PDF handed to pdftotext and to the callback.
    :param DEVNULL: where pdftotext's stderr is sent.
    :param callback: optional OCR task; when falsy the OCR fallback is skipped.
    :return: a ``(doc, content, err)`` tuple, where ``err`` is whatever
        ``communicate()`` reported for stderr.
    """
    command = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
    pdftotext = subprocess.Popen(
        command,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL
    )
    content, err = pdftotext.communicate()

    if content.strip() == '' and callback:
        # Nothing but whitespace came back: probably an image PDF, so hand
        # it to the OCR callback and wait for the outcome.
        success, content = subtask(callback).delay(path).get()
        if success:
            doc.extracted_by_ocr = True
        else:
            content = 'Unable to extract document content.'
        return doc, content, err

    if 'e' not in content:
        # Text without a single "e" is taken to be a corrupt PDF from ca9.
        # Blank output with no callback also lands here.
        content = fix_mojibake(unicode(content, 'utf-8', errors='ignore'))

    return doc, content, err
예제 #2
0
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Extract the text of the PDF at ``path``.

    Strategy:

    1. Run pdftotext (layout mode, UTF-8 output) and capture its stdout.
    2. If that output is blank and an OCR ``callback`` was given, assume an
       image-based PDF: dispatch the callback with ``path`` and wait for its
       ``(success, content)`` result. On success ``doc.extracted_by_ocr`` is
       flagged; on failure the content becomes a fixed error message.
    3. Otherwise, if the text does not contain the letter "e", assume it is
       the mojibake that ca9 sometimes creates and run it through
       ``fix_mojibake``.

    Returns the tuple ``(doc, content, err)``; ``err`` is the stderr value
    returned by ``communicate()``.
    """
    extractor = subprocess.Popen(
        ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL)
    content, err = extractor.communicate()

    is_blank = content.strip() == ''
    if is_blank and callback:
        # Probably an image PDF, so let the OCR task have a go at it.
        ocr_result = subtask(callback).delay(path)
        ocr_ok, content = ocr_result.get()
        if not ocr_ok:
            content = 'Unable to extract document content.'
        else:
            doc.extracted_by_ocr = True
    elif 'e' not in content:
        # No "e" anywhere: treat it as a corrupt PDF from ca9 and repair it.
        decoded = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(decoded)

    return doc, content, err
def cleaner(simulate=False, verbose=True):
    """Repair documents whose text is mojibake produced by pdffactory 3.51.

    Every hit returned by the search query is loaded as a Document, its
    ``plain_text`` is replaced with the output of ``fix_mojibake``, and the
    document is saved.

    :param simulate: when True, documents are corrected in memory only and
        never saved.
    :param verbose: when True, print the URL of each document processed.
    """
    # Look up the affected cases through the search index (Solr).
    query = {'q': u'ÚÑÎ', 'caller': 'mojibake'}
    for hit in conn.raw_query(**query):
        doc = Document.objects.get(pk=hit['id'])
        if verbose:
            print("https://www.courtlistener.com" + doc.get_absolute_url())

        # Swap the garbled text for the repaired version.
        doc.plain_text = fix_mojibake(doc.plain_text)

        if simulate:
            # Dry run: leave the stored document untouched.
            continue
        doc.save()
예제 #4
0
def cleaner(simulate=False, verbose=True):
    """Fix the text of cases garbled by pdffactory 3.51.

    Runs a search for the telltale mojibake characters, then for each
    matching document rewrites ``plain_text`` using ``fix_mojibake``.

    simulate -- if true, skip saving the corrected documents.
    verbose  -- if true, print each document's URL as it is handled.
    """
    # Ask the search index (Solr) for every case containing the mojibake.
    matches = conn.raw_query(q=u"ÚÑÎ", caller="mojibake")
    for match in matches:
        doc = Document.objects.get(pk=match["id"])
        if verbose:
            url = "https://www.courtlistener.com" + doc.get_absolute_url()
            print(url)

        # Replace the stored text with its corrected form.
        garbled = doc.plain_text
        doc.plain_text = fix_mojibake(garbled)

        # Persist the change unless this is only a dry run.
        if not simulate:
            doc.save()