def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Pull the text out of a PDF, falling back to OCR when necessary.

    pdftotext is run first. When it produces nothing but whitespace and a
    callback task was supplied, the file is assumed to be an image-based PDF
    and is handed to that task for OCR. Otherwise, text that contains no
    letter "e" at all is taken to be the mojibake that ca9 sometimes creates
    and is run through fix_mojibake.

    :param doc: document object; gets extracted_by_ocr = True when OCR
        succeeds.
    :param path: location of the PDF, passed to pdftotext and to the OCR
        task.
    :param DEVNULL: handle given to Popen as stderr.
    :param callback: optional task handed to subtask(); its result must
        unpack into a (success, content) pair.
    :return: a (doc, content, err) tuple, where err is the second value
        returned by process.communicate().
    """
    command = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
    process = subprocess.Popen(
        command,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL,
    )
    content, err = process.communicate()

    text_is_blank = content.strip() == ''
    if text_is_blank and callback:
        # No usable text came back: probably an image PDF, so try OCR.
        ocr_job = subtask(callback).delay(path)
        ocr_ok, content = ocr_job.get()
        if not ocr_ok:
            # OCR reported failure; swap its output for a placeholder.
            content = 'Unable to extract document content.'
        else:
            doc.extracted_by_ocr = True
    elif 'e' not in content:
        # Text without a single "e" is treated as a corrupt ca9 PDF.
        decoded = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(decoded)

    return doc, content, err
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Pull the text out of a PDF, falling back to OCR when necessary.

    pdftotext is run first. When it produces nothing but whitespace and a
    callback task was supplied, the file is assumed to be an image-based PDF
    and is handed to that task for OCR. Otherwise, text that contains no
    letter "e" at all is taken to be the mojibake that ca9 sometimes creates
    and is run through fix_mojibake.

    :param doc: document object; gets extracted_by_ocr = True when OCR
        succeeds.
    :param path: location of the PDF, passed to pdftotext and to the OCR
        task.
    :param DEVNULL: handle given to Popen as stderr.
    :param callback: optional task handed to subtask(); its result must
        unpack into a (success, content) pair.
    :return: a (doc, content, err) tuple, where err is the second value
        returned by process.communicate().
    """
    command = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
    process = subprocess.Popen(
        command,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL,
    )
    content, err = process.communicate()

    text_is_blank = content.strip() == ''
    if text_is_blank and callback:
        # No usable text came back: probably an image PDF, so try OCR.
        ocr_job = subtask(callback).delay(path)
        ocr_ok, content = ocr_job.get()
        if not ocr_ok:
            # OCR reported failure; swap its output for a placeholder.
            content = 'Unable to extract document content.'
        else:
            doc.extracted_by_ocr = True
    elif 'e' not in content:
        # Text without a single "e" is treated as a corrupt ca9 PDF.
        decoded = unicode(content, 'utf-8', errors='ignore')
        content = fix_mojibake(decoded)

    return doc, content, err
def cleaner(simulate=False, verbose=True):
    """Repair documents whose text was garbled by pdffactory 3.51.

    Searches Solr for the mojibake marker, loads every matching Document,
    and replaces its plain_text with the output of fix_mojibake.

    :param simulate: when true, the corrected documents are not saved.
    :param verbose: when true, the URL of each matching document is printed.
    """
    query = {'q': u'ÚÑÎ', 'caller': 'mojibake'}
    for hit in conn.raw_query(**query):
        doc = Document.objects.get(pk=hit['id'])
        if verbose:
            print("https://www.courtlistener.com" + doc.get_absolute_url())

        # Swap the garbled text for its repaired form.
        doc.plain_text = fix_mojibake(doc.plain_text)

        if simulate:
            # Dry run: leave the stored document untouched.
            continue
        doc.save()
def cleaner(simulate=False, verbose=True):
    """Repair documents whose text was garbled by pdffactory 3.51.

    Searches Solr for the mojibake marker, loads every matching Document,
    and replaces its plain_text with the output of fix_mojibake.

    :param simulate: when true, the corrected documents are not saved.
    :param verbose: when true, the URL of each matching document is printed.
    """
    query = {"q": u"ÚÑÎ", "caller": "mojibake"}
    for hit in conn.raw_query(**query):
        doc = Document.objects.get(pk=hit["id"])
        if verbose:
            print("https://www.courtlistener.com" + doc.get_absolute_url())

        # Swap the garbled text for its repaired form.
        doc.plain_text = fix_mojibake(doc.plain_text)

        if simulate:
            # Dry run: leave the stored document untouched.
            continue
        doc.save()