def process_image(content, language, noOCR=False, despeckle=False):
    """Run OCR on a single image and return the recognised text.

    The image data is piped to ``convert``, which writes ``out.png`` into a
    fresh temporary directory; ``tesseract`` is then run on that file and
    its ``out.txt`` output is read back as UTF-8.

    :param content: image data fed to ``convert`` on stdin.
    :param language: value passed to tesseract's ``-l`` option.
    :param noOCR: when true, log an error and return ``None`` without
        running any external tool.
    :param despeckle: when true, call ``convert`` with
        ``convert_options_despeckle`` instead of ``convert_options``
        (the same selection ``process_pdf`` makes).
    :returns: the stripped text, or ``None`` when OCR is disabled or the
        recognised text is empty.
    """
    if noOCR:
        logging.error("OCR disabled, no text available")
        return None

    # Imported lazily so the noOCR path works without these available.
    import shutil
    from sh import tesseract, convert

    co = convert_options_despeckle if despeckle else convert_options
    tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
    try:
        png_path = tmpFolder + "/out.png"
        logging.debug("Converting image in tmpfolder %s", tmpFolder)
        convert(co, "-", png_path, _in=content, _in_bufsize=10000)
        logging.debug("Running tesseract with language %s on file %s",
                      language, png_path)
        # tesseract appends ".txt" to the output base name itself.
        tesseract(png_path, tmpFolder + "/out", "-l", language)
        with open(tmpFolder + "/out.txt", "r") as f:
            text = unicode(f.read(), "utf-8")
    finally:
        # Always drop the scratch directory; ignore cleanup failures so
        # they cannot mask an error raised by convert/tesseract.
        shutil.rmtree(tmpFolder, ignore_errors=True)

    logging.debug("Found %d chars for this page", len(text))
    text = text.strip()
    return text or None
def crack(self, dilateiter=4, erodeiter=4, threshold=200, size=(155, 55),
          whitelist_chars=string.ascii_lowercase):
    """Preprocess ``self.image`` and return the text tesseract reads from it.

    The image is enlarged to six times its width and height, dilated,
    eroded and thresholded with ``cv.CV_THRESH_BINARY``, then resized to
    ``size``. The result is saved to a temporary ``.jpg`` file and handed
    to ``tesseract`` restricted to ``whitelist_chars``.

    :param dilateiter: value passed to ``dilateImage``.
    :param erodeiter: value passed to ``erodeImage``.
    :param threshold: value passed to ``thresholdImage``.
    :param size: target size for the final ``resizeImage`` call.
    :param whitelist_chars: the characters to recognize
        (``tessedit_char_whitelist``).
    :returns: ``str`` of the contents of tesseract's ``.txt`` output.
    """
    import os
    from tempfile import NamedTemporaryFile

    enlarged = resizeImage(self.image,
                           (self.image.width * 6, self.image.height * 6))
    dilateImage(enlarged, dilateiter)
    erodeImage(enlarged, erodeiter)
    thresholdImage(enlarged, threshold, cv.CV_THRESH_BINARY)
    prepared = resizeImage(enlarged, size)

    # Call the tesseract engine. The context managers guarantee both
    # temporary files are closed (and thereby deleted) on every exit path.
    with NamedTemporaryFile(suffix='.jpg') as temp_img_file, \
            NamedTemporaryFile() as temp_solution_file:
        # tesseract appends ".txt" to the output base name it is given.
        solution_path = temp_solution_file.name + '.txt'
        cv.SaveImage(temp_img_file.name, prepared)
        try:
            tesseract(temp_img_file.name, temp_solution_file.name,
                      '-c', 'tessedit_char_whitelist=' + whitelist_chars)
            return str(cat(solution_path))
        finally:
            # Remove tesseract's output even if reading it failed; skip
            # silently when tesseract never produced the file.
            if os.path.exists(solution_path):
                os.unlink(solution_path)
def process_pdf(content, language, noOCR=False, noPDFText=False, despeckle=False):
    """Extract text from a PDF via ``pdftotext`` and/or per-page OCR.

    :param content: PDF data piped to the external tools on stdin.
    :param language: value passed to tesseract's ``-l`` option.
    :param noOCR: when true, skip the OCR stage and use only the
        ``pdftotext`` result.
    :param noPDFText: when true, skip ``pdftotext`` and start from an
        empty string.
    :param despeckle: when true, render pages with
        ``convert_options_despeckle`` instead of ``convert_options``.
    :returns: the extracted text, or ``None`` when it is empty after
        stripping. With OCR enabled the result is the stripped
        ``pdftotext`` output, three newlines, then the stripped OCR text.
    """
    if noPDFText:
        logging.debug("pdftotext disabled")
        pdfText = ""
    else:
        from sh import pdftotext
        logging.debug("Extracting pdf contents using pdftotext")
        pdfText = unicode(pdftotext('-', '-', _in=content, _in_bufsize=10000))
        logging.debug("Extracted %d chars from the text", len(pdfText))

    if noOCR:
        logging.debug("OCR disabled, returning only pdf text")
    else:
        from sh import identify, tesseract, convert
        logging.debug("Starting OCR Operation")
        logging.debug("Extracing page numbers")
        identified = str(identify("-format", "%p ", "pdf:-",
                                  _in=content, _in_bufsize=10000))
        # split() without an argument tolerates empty output and repeated
        # whitespace, where split(' ') would hand '' to int().
        pageNos = [int(token) for token in identified.split()]
        logging.debug("Found pages: %s", pageNos)

        # The option set does not change between pages.
        co = convert_options_despeckle if despeckle else convert_options
        pages = []
        for pageNo in pageNos:
            logging.debug("Processing page %d", pageNo)
            tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
            try:
                png_path = tmpFolder + "/out.png"
                logging.debug("Converting page to image in tmpfolder %s with options %s",
                              tmpFolder, co)
                convert(co, "pdf:-[%d]" % (pageNo), png_path,
                        _in=content, _in_bufsize=10000)
                logging.debug("Running tesseract with language %s on file %s",
                              language, png_path)
                # tesseract appends ".txt" to the output base name itself.
                tesseract(png_path, tmpFolder + "/out", "-l", language)
                with open(tmpFolder + "/out.txt", "r") as f:
                    pageContent = unicode(f.read(), "utf-8")
            finally:
                # Remove the scratch directory even when a tool fails;
                # ignore cleanup errors so they cannot mask that failure.
                shutil.rmtree(tmpFolder, ignore_errors=True)
            logging.debug("Found %d chars for this page", len(pageContent))
            pages.append(pageContent)

        pdfText = pdfText.strip() + "\n\n\n" + u"\n".join(pages).strip()

    if not pdfText.strip():
        logging.error("No text could be recognized")
        return None
    return pdfText
def create_tiff_and_txt(pnms, language):
    """Clean, convert and OCR each PNM file; return the produced file names.

    Each input (processed in sorted order) goes through three steps:
    ``unpaper`` writes a copy whose file name is prefixed with ``_``,
    ``convert`` turns that into a ``.tiff``, and ``tesseract`` is run on
    the TIFF with the extension-less path as its output base. An
    intermediate file is removed once the next stage's output exists.

    :param pnms: iterable of PNM file paths.
    :param language: value for tesseract's ``-l`` option; a falsy value
        selects ``DEFAULT_LANGUAGE``.
    :returns: ``(tiffs, texts)`` -- two parallel lists holding the TIFF
        path and the expected ``.txt`` path for every input.
    """
    language = language or DEFAULT_LANGUAGE
    tiffs = []
    texts = []
    for pnm in sorted(pnms):
        # Prefix the file name only, so inputs given with a directory
        # component stay inside that directory.
        directory, filename = os.path.split(pnm)
        unpapered = os.path.join(directory, '_' + filename)
        unpaper(pnm, unpapered)
        if os.path.exists(unpapered):
            os.remove(pnm)

        # Swap the real extension rather than replacing every '.pnm'
        # substring; the TIFF name then always differs from the input
        # unless the input already ends in '.tiff'.
        txtfile = os.path.splitext(unpapered)[0]
        tiff = txtfile + '.tiff'
        convert(unpapered, tiff)
        # Never delete the file we just produced.
        if tiff != unpapered and os.path.exists(tiff):
            os.remove(unpapered)

        tiffs.append(tiff)
        texts.append(txtfile + '.txt')
        tesseract(tiff, txtfile, '-l', language)
    return tiffs, texts
def crack(self, dilateiter=4, erodeiter=4, threshold=200, size=(155, 55),
          whitelist_chars=string.ascii_lowercase):
    """Preprocess ``self.image`` and return the text tesseract reads from it.

    Steps: resize to six times the original width and height, dilate,
    erode, threshold with ``cv.CV_THRESH_BINARY``, resize to ``size``,
    save as a temporary ``.jpg`` and run ``tesseract`` on it with the
    character whitelist set to ``whitelist_chars``.

    :param dilateiter: value passed to ``dilateImage``.
    :param erodeiter: value passed to ``erodeImage``.
    :param threshold: value passed to ``thresholdImage``.
    :param size: target size for the final ``resizeImage`` call.
    :param whitelist_chars: the characters to recognize
        (``tessedit_char_whitelist``).
    :returns: ``str`` of the contents of tesseract's ``.txt`` output.
    """
    import os
    from tempfile import NamedTemporaryFile

    scaled_up = (self.image.width * 6, self.image.height * 6)
    working = resizeImage(self.image, scaled_up)
    dilateImage(working, dilateiter)
    erodeImage(working, erodeiter)
    thresholdImage(working, threshold, cv.CV_THRESH_BINARY)
    working = resizeImage(working, size)

    # Call the tesseract engine. Using the temp files as context managers
    # closes (and thereby deletes) them on every exit path.
    with NamedTemporaryFile(suffix='.jpg') as temp_img_file, \
            NamedTemporaryFile() as temp_solution_file:
        # tesseract appends ".txt" to the output base name it is given.
        result_path = temp_solution_file.name + '.txt'
        cv.SaveImage(temp_img_file.name, working)
        try:
            tesseract(temp_img_file.name, temp_solution_file.name,
                      '-c', 'tessedit_char_whitelist=' + whitelist_chars)
            return str(cat(result_path))
        finally:
            # Clean up tesseract's output even if reading it failed; do
            # nothing when tesseract never wrote the file.
            if os.path.exists(result_path):
                os.unlink(result_path)