def ocr2PDF(ocrFile, outputPath='static/test.pdf', zoomRatio=10):
    """Render the words of an OCR result file into a PDF.

    Every word returned by ``readOCR`` is drawn at its left edge and at the
    bottom edge of its line, with all coordinates divided by ``zoomRatio``.
    The Helvetica font size is chosen per word so that the rendered string
    is roughly as wide as the word's box.

    Args:
        ocrFile: Path handed to ``readOCR``.
        outputPath: Where the PDF is written. Defaults to the previously
            hard-coded ``'static/test.pdf'``.
        zoomRatio: Divisor applied to every OCR coordinate. Defaults to the
            previously hard-coded ``10``.
    """
    baseSize = 10.0
    fontName = 'Helvetica'
    # Float divisor: plain ``/`` would truncate under Python 2 if the box
    # coordinates are ints (presumably pixels -- TODO confirm), while the
    # font-size computation below was already done in floating point.
    ratio = float(zoomRatio)

    page, lines, _ = readOCR(ocrFile)
    pdf = canvas.Canvas(
        outputPath,
        bottomup=0,
        pagesize=(page.right / ratio, page.bottom / ratio),
    )
    for line in lines:
        for word in line.words:
            textWidth = stringWidth(word.text, fontName, baseSize)
            # Debug trace kept from the original; the call form prints the
            # same thing under Python 2 and Python 3.
            print(word.text)
            if textWidth == 0:
                # Nothing measurable to draw, and it would divide by zero.
                continue
            boxWidth = (word.right - word.left) / ratio
            fontSize = round(baseSize * boxWidth / textWidth)
            pdf.setFont(fontName, fontSize)
            pdf.drawString(word.left / ratio, line.bottom / ratio, word.text)
    pdf.save()
def refine_process():
    """Re-run tesseract on every detected text line and return the text.

    Line boxes are read from ``outputFile + ".html"``. Each box is cropped
    out of the image stored at ``workFile``, pasted onto a white greyscale
    canvas with a wide margin, saved under ``static/tmp/`` and passed to the
    ``tesseract`` command line with the ``hocr`` config. The per-line result
    is read back with ``readOCR``.

    Returns:
        The recognised words, each followed by a space, with a newline
        after every line found in the per-line OCR results.
    """
    lang = "eng"
    scale = 1  # resize factor applied to each crop; 1 keeps the size
    margin = 500  # total white padding added to each dimension
    image = Image.open(workFile)
    _, lines, _ = readOCR(outputFile + ".html")

    parts = []
    for n, line in enumerate(lines):
        tmpXML = "static/tmp/lineImage" + str(n)
        tmpImage = tmpXML + ".png"

        width = line.right - line.left
        height = line.bottom - line.top
        lineImage = image.crop(
            (line.left, line.top, line.right, line.bottom)
        ).convert("RGB")
        lineImage = lineImage.resize(
            (int(width * scale), int(height * scale)), Image.ANTIALIAS
        )
        # NOTE(review): the original also called
        # lineImage.filter(SMOOTH).filter(BLUR) and
        # ImageEnhance.Contrast(lineImage) but threw the results away.
        # Image.filter returns a new image, so those calls had no effect
        # and are dropped here. Re-introduce them with assignment only if
        # the smoothing was actually wanted.

        boxImage = Image.new(
            "L",
            (lineImage.size[0] + margin, lineImage.size[1] + margin),
            "white",
        )
        # Floor division keeps the offset an int on Python 3 as well.
        boxImage.paste(lineImage, (margin // 2, margin // 2))
        boxImage.save(tmpImage)

        command = ["tesseract", tmpImage, tmpXML, "-l", lang, "hocr"]
        proc = sp.Popen(command, stderr=sp.PIPE)
        # communicate() drains stderr while waiting; a bare wait() can
        # deadlock once the child fills the pipe buffer.
        proc.communicate()

        _, lineLines, _ = readOCR(tmpXML + ".html")
        # NOTE(review): one newline per recognised line, mirroring
        # process_image -- confirm against the intended nesting.
        for ocrLine in lineLines:
            parts.extend(word.text + " " for word in ocrLine.words)
            parts.append("\n")
    return "".join(parts)
def process_image(image, lang="eng"):
    """Run the OCR pipeline on ``image`` and return the recognised text.

    The unmodified image is saved to ``inputFile``. The result of
    ``pre_process(image)`` is converted to RGB and saved to ``workFile``.
    ``crude_process`` is then invoked, and the OCR result is read from
    ``outputFile + ".html"`` (presumably written by ``crude_process`` --
    TODO confirm).

    Args:
        image: Image object supporting ``save``; also handed to
            ``pre_process``.
        lang: Language code forwarded to ``crude_process``.

    Returns:
        The recognised words, each followed by a space, with a newline
        after every line.
    """
    image.save(inputFile)
    image = pre_process(image).convert("RGB")
    # NOTE(review): the original called image.filter(ImageFilter.SMOOTH)
    # here without using the returned image, so it had no effect and is
    # dropped. Assign the result if smoothing is actually desired.
    image.save(workFile)
    crude_process(lang=lang)

    _, lines, _ = readOCR(outputFile + ".html")
    parts = []
    for line in lines:
        parts.extend(word.text + " " for word in line.words)
        parts.append("\n")
    return "".join(parts)