def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Get the text of a PDF via pdftotext, with an optional OCR fallback.

    pdftotext is run against ``path``. If its output is blank and a
    ``callback`` was given, ``callback(path)`` is called and must return a
    ``(success, content)`` pair: on success ``doc`` is flagged as extracted
    by OCR, otherwise the content becomes a fixed failure message. If the
    OCR fallback is not taken and the text has no letter "e" in it, the text
    is assumed to be the mojibake that ca9 sometimes creates and is run
    through ``fix_mojibake``.

    :param doc: Object that gets ``extracted_by_ocr = True`` when OCR works.
    :param path: Path to the PDF; given to pdftotext and to ``callback``.
    :param DEVNULL: Value used as ``stderr`` for the pdftotext subprocess.
    :param callback: Optional OCR fallback, called as ``callback(path)``.
    :return: ``(doc, content, err)``, where ``err`` is the second item
        returned by ``communicate()``.
    """
    pdftotext_cmd = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
    proc = subprocess.Popen(
        pdftotext_cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL,
    )
    content, err = proc.communicate()

    if content.strip() == '' and callback:
        # Blank output: probably an image PDF, so hand it to OCR.
        # N.B.: Do NOT turn this into a subtask unless you are very careful.
        # In the worst case that causes a total celery deadlock: this task
        # would create another task, but there might be no worker free to
        # consume it, so this task would never finish.
        success, content = callback(path)
        if not success:
            # This branch replaces the original
            # ``elif content == '' or not success``, which always held here.
            content = 'Unable to extract document content.'
        else:
            doc.extracted_by_ocr = True
        return doc, content, err

    if 'e' not in content:
        # No "e" anywhere: treat it as a corrupt PDF from ca9 and repair it.
        decoded = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(decoded)

    return doc, content, err
def extract_from_pdf(
    path: str,
    opinion: Opinion,
    ocr_available: bool = False,
) -> ExtractProcessResult:
    """Extract the text of a PDF, using OCR when it is available and needed.

    pdftotext always runs first and its stdout/stderr are decoded to text.
    What happens next depends on ``ocr_available``:

    * OCR not available: if the text contains no letter "e" it is treated
      as a corrupt PDF from ca9 and passed through ``fix_mojibake``. Note
      that this repair is only applied on this path.
    * OCR available: ``ocr_needed(path, content)`` decides whether to run
      ``extract_by_ocr``. The original notes say this is meant to cover
      empty output and PDFs that contain images (image-only or mixed
      PDFs). When OCR succeeds the opinion is flagged and the longer of
      the two texts is kept; when it fails the content is replaced by a
      fixed failure message.

    :param path: The path to the PDF
    :param opinion: The Opinion associated with the PDF; its
        ``extracted_by_ocr`` attribute is set to True when OCR succeeds
    :param ocr_available: Whether OCR may be attempted
    :return: Tuple of the content itself and any errors we received
    """
    stdout, stderr = make_pdftotext_process(path).communicate()
    content = stdout.decode()
    err = stderr.decode() if stderr is not None else None

    if not ocr_available:
        if "e" not in content:
            # No "e" anywhere: it's a corrupt PDF from ca9. Fix it.
            content = fix_mojibake(content)
        return content, err

    if not ocr_needed(path, content):
        return content, err

    success, ocr_content = extract_by_ocr(path)
    if not success:
        # This branch replaces the original
        # ``elif content == "" or not success``, which always held here,
        # so any pdftotext output is replaced as well.
        content = "Unable to extract document content."
        return content, err

    opinion.extracted_by_ocr = True
    # Compare lengths and keep whichever extraction produced more text.
    if len(ocr_content) > len(content):
        content = ocr_content
    return content, err
def extract_from_pdf(path, opinion, do_ocr=False):
    """Extract the text of a PDF with pdftotext, optionally falling back to OCR.

    The pdftotext output is decoded to text. If it is blank and ``do_ocr``
    is set, ``extract_by_ocr(path)`` is tried on the assumption that this is
    an image-based PDF: on success the opinion is flagged as extracted by
    OCR, otherwise the content becomes a fixed failure message. If the OCR
    fallback is not taken and the text has no letter "e" in it, the text is
    assumed to be the mojibake that ca9 sometimes creates and is passed
    through ``fix_mojibake``.

    :param path: Path to the PDF.
    :param opinion: Object that gets ``extracted_by_ocr = True`` when OCR
        succeeds.
    :param do_ocr: Whether OCR may be attempted when pdftotext yields
        nothing.
    :return: ``(content, err)``, where ``err`` is the second item returned
        by ``communicate()``, passed through unchanged.
    """
    raw_output, err = make_pdftotext_process(path).communicate()
    content = raw_output.decode()

    if do_ocr and content.strip() == "":
        # Nothing but whitespace came back; try OCR instead.
        success, content = extract_by_ocr(path)
        if not success:
            # This branch replaces the original
            # ``elif content == "" or not success``, which always held here.
            content = "Unable to extract document content."
        else:
            opinion.extracted_by_ocr = True
        return content, err

    if "e" not in content:
        # No "e" anywhere: it's a corrupt PDF from ca9. Fix it.
        content = fix_mojibake(content)

    return content, err
def extract_from_pdf(path, opinion, do_ocr=False):
    """Extract the text of a PDF with pdftotext, optionally falling back to OCR.

    pdftotext runs first. If its output is blank and ``do_ocr`` is set,
    ``extract_by_ocr(path)`` is tried on the assumption that this is an
    image-based PDF: on success the opinion is flagged as extracted by OCR,
    otherwise the content becomes a fixed failure message. If the OCR
    fallback is not taken and the output has no letter "e" in it, the output
    is assumed to be the mojibake that ca9 sometimes creates; it is decoded
    as UTF-8 (ignoring undecodable bytes) and passed through
    ``fix_mojibake``.

    :param path: Path to the PDF.
    :param opinion: Object that gets ``extracted_by_ocr = True`` when OCR
        succeeds.
    :param do_ocr: Whether OCR may be attempted when pdftotext yields
        nothing.
    :return: ``(content, err)``, where ``err`` is the second item returned
        by ``communicate()``.
    """
    pdftotext = make_pdftotext_process(path)
    content, err = pdftotext.communicate()

    output_is_blank = content.strip() == ''
    if output_is_blank and do_ocr:
        # Nothing but whitespace came back; try OCR instead.
        success, content = extract_by_ocr(path)
        if not success:
            # This branch replaces the original
            # ``elif content == '' or not success``, which always held here.
            content = 'Unable to extract document content.'
        else:
            opinion.extracted_by_ocr = True
    elif 'e' not in content:
        # No "e" anywhere: it's a corrupt PDF from ca9. Fix it.
        as_unicode = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(as_unicode)

    return content, err