def pages_from_rasterize(docid):
    """Rasterize a PDF document and queue every resulting page for binarization.

    The document is obtained from ``util.is_valid_doc(docid)`` and rasterized
    with ``util.rasterize_pdf``. When the stored ``num_pages`` disagrees with
    the number of files produced, the stored count is corrected and the
    document saved. One ``DocumentPage`` (status ``'w'``) is then saved per
    file and handed to ``binarize_page.delay``.
    """
    document = util.is_valid_doc(docid)
    print("Rasterizing pages...")
    rasterized = util.rasterize_pdf(document)

    page_total = len(rasterized)
    if document.num_pages != page_total:
        # The page count stage couldn't determine the count; use the number
        # of files actually produced.
        document.num_pages = page_total
        document.save()

    for number, entry in enumerate(rasterized):
        page = DocumentPage(
            document=document,
            files_prefix=entry[0],
            stage_output_extension=entry[1],
            page_number=number,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()
        # Docs already guaranteed converted, move to binarization.
        binarize_page.delay(page)
def pages_from_images(docid):
    """Build pages from an image document and queue each one for conversion.

    The document is obtained from ``util.is_valid_doc(docid)`` and split into
    per-page files with ``util.split_to_files``. When the stored ``num_pages``
    disagrees with the number of files produced, the stored count is corrected
    and the document saved. One ``DocumentPage`` (status ``'w'``) is then
    saved per file and handed to ``convert_page.delay``.
    """
    document = util.is_valid_doc(docid)
    print("Constructing pages from images...")
    # TODO: Consider splitting to multi-page TIFF so tesseract can learn
    split_files = util.split_to_files(document)

    page_total = len(split_files)
    if document.num_pages != page_total:
        # The page count stage couldn't determine the count; use the number
        # of files actually produced.
        document.num_pages = page_total
        document.save()

    # Create a DocumentPage for each file returned by the split function,
    # then launch the conversion (and follow-on) tasks for that page.
    for number, entry in enumerate(split_files):
        page = DocumentPage(
            document=document,
            files_prefix=entry[0],
            stage_output_extension=entry[1],
            page_number=number,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()
        convert_page.delay(page)