def do_document_ocr(document):
    """
    Extract the text of every page of ``document`` and store it on the page.

    For PDF documents the text is first extracted with pdftotext; when that
    produces an empty file for a page, or the document is not a PDF, the page
    is run through ``convert_document_for_ocr`` and tesseract instead.  The
    resulting text is passed through ``ocr_cleanup`` and saved as the page's
    ``content``; ``page_label`` records which of the two methods produced it.

    Exceptions raised by any of the helpers are not caught here: they
    propagate to the caller once the temporary files of the page being
    processed have been removed.
    """
    for page_index, document_page in enumerate(document.documentpage_set.all()):
        desc, filepath = tempfile.mkstemp()
        imagefile = None
        ocr_output = None
        try:
            if document.file_mimetype == u"application/pdf":
                pdf_filename = os.extsep.join([filepath, u"pdf"])
                try:
                    document.save_to_file(pdf_filename)
                    # NOTE(review): pdftotext is addressed by the page's own
                    # page_number while the tesseract path and the save below
                    # use page_index; these agree only if the page set is
                    # iterated in page_number order -- confirm.
                    run_pdftotext(pdf_filename, filepath, document_page.page_number)
                finally:
                    # Remove the temporary PDF copy even when extraction fails.
                    cleanup(pdf_filename)

                if os.stat(filepath).st_size != 0:
                    ocr_output = filepath
                    source = _(u"Text extracted from PDF")

            if ocr_output is None:
                # Not a PDF, or the PDF page had no text: run tesseract on
                # the page.
                imagefile = convert_document_for_ocr(document, page=page_index)
                # Record the output path before running tesseract so that a
                # partially written file is still removed if it fails.
                ocr_output = os.extsep.join([filepath, u"txt"])
                run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
                source = _(u"Text from OCR")

            # The context manager closes the file even if reading fails.
            with codecs.open(ocr_output, "r", "utf-8") as f:
                text = f.read().strip()

            document_page = document.documentpage_set.get(page_number=page_index + 1)
            document_page.content = ocr_cleanup(text)
            document_page.page_label = source
            document_page.save()
        finally:
            os.close(desc)
            # cleanup() is presumed to tolerate a missing file: the previous
            # implementation already called it twice on the same path when
            # the text came straight from the PDF.
            cleanup(filepath)
            if ocr_output and ocr_output != filepath:
                cleanup(ocr_output)
            if imagefile:
                cleanup(imagefile)
def do_document_ocr(document): for page_index, document_page in enumerate(document.documentpage_set.all()): imagefile = convert_document_for_ocr(document, page=page_index) desc, filepath = tempfile.mkstemp() try: run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) ocr_output = os.extsep.join([filepath, 'txt']) f = codecs.open(ocr_output, 'r', 'utf-8') document_page = document.documentpage_set.get(page_number=page_index+1) document_page.content = f.read().strip() document_page.page_label = _(u'Text from OCR') document_page.save() f.close() cleanup(ocr_output) except TesseractError, e: cleanup(filepath) cleanup(imagefile) raise TesseractError(e) finally: