def document_analysis(docid):
    """Inspect a document and dispatch the matching page-extraction task.

    Determines the file format and page count of the document and, for
    PDFs, the number of embedded images and whether the file carries text.
    The results are stored on the document, after which one of two
    follow-up tasks is queued:

    * ``pages_from_images`` when there is no text and the image count
      equals the page count (the simple "one scanned image per page" case);
    * ``pages_from_rasterize`` in every other case (text only, mixed
      text/images, or anything undetermined).

    Args:
        docid: Identifier handed to ``util.is_valid_doc`` to load the
            document, and forwarded unchanged to the follow-up task.
    """
    # TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)
    doc.file_format = util.determine_format(doc)

    ### Counting pages and repairing damaged documents ###
    num_pages = util.count_pages(doc)
    # TODO: Repair damaged PDFs when the page count cannot be determined
    # (apparently signalled by num_pages == -1), e.g. by running
    #     pdftk <doc file> output <doc file>
    # and counting again. That in-place command doesn't quite work: a copy
    # must be made first, or the object's file field updated. If the count
    # is still undetectable afterwards there's not much more we can do
    # except report that the document cannot be processed.

    if doc.file_format == 'pdf':
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        num_imgs = num_pages  # For TIFFs num_pages might be > 1
        has_text = False

    # Persist the analysis results BEFORE dispatching. The follow-up tasks
    # reload the document by id and may correct num_pages themselves; saving
    # after the dispatch could overwrite that correction with the stale
    # value computed here, or let the task start from an unsaved document.
    doc.num_pages = num_pages
    doc.save()

    # Decide what to do. The explicit `== False` is kept on purpose so that
    # a non-boolean result from detect_text (e.g. None) still falls through
    # to rasterization, exactly as before.
    if has_text == False and num_imgs == num_pages:  # noqa: E712
        # Simple case: every page is a single image.
        pages_from_images.delay(docid)
    else:
        # Text only (nothing to OCR), mixed image/text, or an undetermined
        # layout: for now, always rasterize the pages and continue from the
        # resulting page images.
        pages_from_rasterize.delay(docid)
def pages_from_rasterize(docid):
    """Rasterize a PDF into page files and queue each page for binarization.

    Loads the document via ``util.is_valid_doc``, rasterizes it with
    ``util.rasterize_pdf`` and, if the stored page count differs from the
    number of files produced, updates and saves the document. One
    ``DocumentPage`` is then created and saved per produced file and handed
    to ``binarize_page``.

    Args:
        docid: Identifier handed to ``util.is_valid_doc``.
    """
    document = util.is_valid_doc(docid)
    # Parenthesised so the statement reads the same under Python 2 and 3.
    print("Rasterizing pages...")
    rasterized = util.rasterize_pdf(document)

    produced = len(rasterized)
    if document.num_pages != produced:
        # The page-count stage couldn't determine the count; trust the
        # number of files that were actually produced.
        document.num_pages = produced
        document.save()

    # Each entry is indexed as (files prefix, output extension).
    for index, entry in enumerate(rasterized):
        page = DocumentPage(
            document=document,
            files_prefix=entry[0],
            stage_output_extension=entry[1],
            page_number=index,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()
        # Docs already guaranteed converted, move to binarization.
        binarize_page.delay(page)
def pages_from_images(docid):
    """Split an image-only document into page files and queue conversion.

    Loads the document via ``util.is_valid_doc``, splits it with
    ``util.split_to_files`` and, if the stored page count differs from the
    number of files produced, updates and saves the document. One
    ``DocumentPage`` is then created and saved per produced file and handed
    to ``convert_page``.

    Args:
        docid: Identifier handed to ``util.is_valid_doc``.
    """
    document = util.is_valid_doc(docid)
    # Parenthesised so the statement reads the same under Python 2 and 3.
    print("Constructing pages from images...")
    # TODO: Consider splitting to multi-page TIFF so tesseract can learn
    split_files = util.split_to_files(document)

    produced = len(split_files)
    if document.num_pages != produced:
        # The page-count stage couldn't determine the count; trust the
        # number of files that were actually produced.
        document.num_pages = produced
        document.save()

    # Build a DocumentPage for every file returned by the split step and
    # launch the conversion task for it. Each entry is indexed as
    # (files prefix, output extension).
    for index, entry in enumerate(split_files):
        page = DocumentPage(
            document=document,
            files_prefix=entry[0],
            stage_output_extension=entry[1],
            page_number=index,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()
        convert_page.delay(page)