def document_analysis(docid):
    """Inspect a document and queue the matching page-extraction task.

    Looks the document up through ``util.is_valid_doc``, records its file
    format and page count on the object, saves it, and then queues either
    ``pages_from_images`` or ``pages_from_rasterize`` for ``docid``.

    Args:
        docid: Identifier handed to ``util.is_valid_doc`` and forwarded
            unchanged to the queued task.
    """
    # TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)
    doc.file_format = util.determine_format(doc)

    # --- Counting pages and repairing damaged documents ---
    num_pages = util.count_pages(doc)
    # TODO: Repair damaged PDFs and count again. The earlier (disabled)
    # attempt ran `pdftk <file> output <file>` when num_pages == -1 for a
    # PDF, but writing onto the input does not quite work: the repair needs
    # to go to a copy first, or the object's file field must be updated.
    # TODO: If the page count is still undetectable after a repair, report
    # an error: the image cannot be processed.

    if doc.file_format == 'pdf':
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        num_imgs = num_pages  # For TIFFs num_pages might be > 1
        has_text = False

    # Persist the analysis results BEFORE queuing any task. Workers
    # presumably reload the document by docid; saving afterwards would let
    # a fast worker read a record without file_format/num_pages, and this
    # late save from a stale in-memory object could clobber its writes.
    doc.num_pages = num_pages
    doc.save()

    # Decide what to do.
    # `== False` (not `not has_text`) is deliberate: an unknown/None result
    # from detect_text must fall through to rasterization.
    if has_text == False and num_imgs == num_pages:  # noqa: E712
        # Simple case: no text layer and exactly one image per page.
        pages_from_images.delay(docid)
    else:
        # Text only (nothing to OCR), mixed image/text, and every other
        # combination are all handled the same way for now: rasterize the
        # pages, then OCR the page images.
        pages_from_rasterize.delay(docid)
def count_pages():
    """Return ``util.count_pages`` applied to the number of Comment entities."""
    # NOTE(review): a later definition of count_pages in this file (the one
    # counting Post entities) rebinds this name if both live in the same
    # module -- confirm which one callers are meant to get.
    total_comments = db.Query(Comment).count()
    return util.count_pages(total_comments)
def count_pages_by_tag(t):
    """Return ``util.count_pages`` for the TagPostR entities whose tag is ``t``.

    Args:
        t: Value matched against the ``tag`` property with the ``'tag ='``
            filter.
    """
    tagged_query = db.Query(tag.TagPostR).filter('tag =', t)
    tagged_total = tagged_query.count()
    return util.count_pages(tagged_total)
def count_pages():
    """Return ``util.count_pages`` applied to the number of Post entities."""
    # NOTE(review): an earlier definition of count_pages in this file (the
    # one counting Comment entities) is replaced by this one if both live in
    # the same module -- confirm that is intended.
    total_posts = db.Query(Post).count()
    return util.count_pages(total_posts)