Example #1
0
def ocr2PDF(ocrFile, outputPath='static/test.pdf', zoomRatio=10):
    """Render the words of an OCR result file into a PDF.

    The OCR file is parsed with ``readOCR``; every word is drawn at its
    recognised position, with a font size chosen so that the rendered text
    spans roughly the same width as the word's bounding box.

    Args:
        ocrFile: Path of the OCR output, passed straight to ``readOCR``.
        outputPath: Where the PDF is written. Defaults to the location the
            function always used before it became configurable.
        zoomRatio: Divisor applied to every OCR coordinate to obtain PDF
            coordinates.

    Returns:
        None. The PDF is written to ``outputPath``.
    """
    fontName = 'Helvetica'
    # Size at which each word is measured before scaling it to its box.
    referenceSize = 10

    p, lines, words = readOCR(ocrFile)

    # bottomup=0 is passed so that y coordinates can be used as they come
    # from the OCR data (presumably top-down pixel rows -- confirm against
    # readOCR).
    c = canvas.Canvas(outputPath, bottomup=0,
                      pagesize=(p.right / zoomRatio, p.bottom / zoomRatio))

    for l in lines:
        # Every word of a line is drawn at the line's bottom edge.
        baseline = l.bottom / zoomRatio

        for w in l.words:
            textWidth = stringWidth(w.text, fontName, referenceSize)
            # Echo each word as it is processed; the parenthesised form
            # prints the same thing on Python 2 and also parses on Python 3.
            print(w.text)
            if textWidth == 0:
                # Nothing measurable to draw; also avoids dividing by zero.
                continue

            # Scale the reference size by (target box width / measured width).
            fontSize = round(float(referenceSize) * (w.right - w.left) / zoomRatio / textWidth)
            c.setFont(fontName, fontSize)
            c.drawString(w.left / zoomRatio, baseline, w.text)

    c.save()
Example #2
0
def refine_process(lang="eng"):
    """Re-run tesseract on every detected line separately and return the text.

    The line boxes come from the existing OCR result at
    ``outputFile + ".html"``. Each line is cropped out of ``workFile``,
    centred on a white canvas, written to ``static/tmp/lineImage<n>.png`` and
    passed to the ``tesseract`` command line tool in hOCR mode. The resulting
    ``static/tmp/lineImage<n>.html`` is parsed with ``readOCR``.

    Args:
        lang: Language code handed to tesseract via ``-l``.

    Returns:
        The recognised text: every word followed by a single space, every
        recognised line terminated by a newline.
    """
    image = Image.open(workFile)

    pages, lines, words = readOCR(outputFile + ".html")

    scale = 1
    # Total white padding added to each dimension; half goes on each side.
    margin = 500

    parts = []
    for n, line in enumerate(lines):
        tmpXML = "static/tmp/lineImage" + str(n)
        tmpImage = tmpXML + ".png"

        lineImage = image.crop((line.left, line.top, line.right, line.bottom)).convert("RGB")
        width = line.right - line.left
        height = line.bottom - line.top
        lineImage = lineImage.resize((int(width * scale), int(height * scale)), Image.ANTIALIAS)

        # NOTE(review): a SMOOTH+BLUR filter chain and an ImageEnhance.Contrast
        # object used to be created here, but their results were discarded
        # (Image.filter returns a new image; the enhancer was never applied),
        # so they never influenced the OCR input. Assign the results if that
        # preprocessing is actually wanted.

        boxImage = Image.new(
            "L",
            (lineImage.size[0] + margin, lineImage.size[1] + margin),
            "white",
        )
        # Floor division keeps the offset an int on both Python 2 and 3.
        boxImage.paste(lineImage, (margin // 2, margin // 2))
        boxImage.save(tmpImage)

        command = ["tesseract", tmpImage, tmpXML, "-l", lang, "hocr"]
        proc = sp.Popen(command, stderr=sp.PIPE)
        # communicate() drains the stderr pipe while waiting; a bare wait()
        # can block forever once the child fills the pipe buffer.
        proc.communicate()

        linePage, lineLines, lineWords = readOCR(tmpXML + ".html")
        for ocrLine in lineLines:
            parts.extend(word.text + " " for word in ocrLine.words)
            parts.append("\n")

    return "".join(parts)
Example #3
0
def process_image(image, lang="eng"):
    """Run the OCR pipeline on *image* and return the recognised text.

    The untouched image is saved to ``inputFile``; the result of
    ``pre_process`` (converted to RGB) is saved to ``workFile``.
    ``crude_process`` is then invoked and the OCR result is read from
    ``outputFile + ".html"``.

    Args:
        image: Image object providing ``save``; it is also handed to
            ``pre_process``.
        lang: Language code forwarded to ``crude_process``.

    Returns:
        The recognised text: every word followed by a single space, every
        line terminated by a newline.
    """
    image.save(inputFile)

    image = pre_process(image).convert("RGB")
    # NOTE(review): a discarded ``image.filter(ImageFilter.SMOOTH)`` call used
    # to sit here. Image.filter returns a new image, so it never affected the
    # file saved below. Assign the result if smoothing is actually wanted.
    image.save(workFile)

    crude_process(lang=lang)

    pages, lines, words = readOCR(outputFile + ".html")

    return "".join(
        "".join(word.text + " " for word in line.words) + "\n"
        for line in lines
    )