# print datetime.now() - start_time
import io
import os
import sys

from PIL import Image as PI
from pyocr import builders
from pyocr import pyocr
from wand.image import Image

# TESSERACT_CMD = os.environ["TESSDATA_PREFIX"] + os.sep + 'tesseract.exe' if os.name == 'nt' else 'tesseract'

# Script: rasterize every page of "test_pf.pdf" at 300 dpi, encode each page
# as JPEG, and run OCR over each page with the first OCR tool pyocr finds.
# The recognized text of each page is collected in `final_text`.

available_tools = pyocr.get_available_tools()
if not available_tools:
    raise RuntimeError("pyocr found no OCR tool on this machine")
tool = available_tools[0]
print(tool)

# `lang` stays the full list of languages the tool reports (it is printed
# for reference); a single language string is what gets passed to the OCR call.
lang = tool.get_available_languages()
print(lang)
if not lang:
    raise RuntimeError("the selected OCR tool reports no available languages")
# NOTE(review): the first reported language is used; pick another entry of
# `lang` here if the document is written in a different language.
ocr_lang = lang[0]

req_image = []   # one JPEG blob per PDF page
final_text = []  # OCR output, one entry per page

# `filename=` takes a path; `file=` is reserved for an open file object.
with Image(filename="test_pf.pdf", resolution=300) as image_pdf:
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            # Wrap each page in its own Image so it can be encoded on its
            # own, and release it as soon as the blob is taken.
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

text_builder = builders.TextBuilder()
for img in req_image:
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=ocr_lang,
        builder=text_builder,
    )
    final_text.append(txt)
def _jpeg_from_single_page(img, target_width):
    """Scale (and crop if too tall) a one-page document; return JPEG bytes."""
    # Work out the final width and height of the image.
    target_width, target_height = _get_one_info(
        target_width, img.width, img.height)
    # Scale; per the wand documentation this is faster than resize.
    img.sample(target_width, target_height)
    # Crop if the final height exceeds the Baidu OCR maximum height.
    if target_height > bai_du_ocr_max:
        img.crop(0, 0, target_width, bai_du_ocr_max)
    # NOTE: binarizing the image locally before upload was tried here; overall
    # it was slower than sending the JPEG straight to Baidu, so it is not done.
    return img.make_blob('jpg')


def _jpeg_from_stitched_pages(img, target_width, image_page_num):
    """Stitch the pages of a multi-page document vertically; return JPEG bytes."""
    import io  # local import: BytesIO holds the binary JPEG data on Python 2 and 3

    # Work out the final width/height and how many pages get stitched.
    target_width, target_height, page_num = _get_more_info(
        target_width, img.width, img.height, image_page_num)
    # Background canvas to paste onto (repeated tests showed 'L' is faster
    # than 'RGB').
    paste_image = PImage.new('L', (target_width, target_height))
    # Height given to each page; identical for every page, so computed once.
    one_img_height = int(target_height / page_num) if page_num else 0

    for i in range(page_num):
        # The wand page image is closed as soon as its JPEG blob is taken.
        with Image(image=img.sequence[i]) as page:
            page.sample(target_width, one_img_height)
            page_blob = page.make_blob('jpg')
        # Convert the wand image into a PIL image and paste it onto the canvas.
        pasted_image = PImage.open(io.BytesIO(page_blob))
        paste_image.paste(pasted_image, (0, one_img_height * i))

    # Crop if the final height exceeds the Baidu OCR maximum height.
    if target_height > bai_du_ocr_max:
        paste_image = paste_image.crop(
            (0, 0, target_width, bai_du_ocr_max))

    # Encode in memory. PIL's format name is 'JPEG', not 'JPG'.
    buf = io.BytesIO()
    paste_image.save(buf, 'JPEG')
    # For debugging, paste_image.show() / paste_image.save(<path>) can be
    # called here to inspect the stitched result.
    return buf.getvalue()


def convert(file_name, target_width=1500):
    """Convert a document (e.g. a PDF) into a single JPEG for OCR upload.

    A one-page document is scaled to the target size. A multi-page document
    has its pages scaled and stacked vertically into one grayscale image.
    In both cases the result is cropped to ``bai_du_ocr_max`` pixels of
    height when it would otherwise be taller.

    Args:
        file_name: Path of the document to open with wand.
        target_width: Requested output width in pixels; the final size is
            decided by ``_get_one_info`` / ``_get_more_info``.

    Returns:
        The JPEG data as a byte string, or ``False`` if anything failed.
        Errors are printed rather than raised, so callers must check for
        ``False``.
    """
    try:
        with Image(filename=file_name) as img:
            image_page_num = len(img.sequence)
            if image_page_num == 1:
                # The document contains exactly one image.
                result = _jpeg_from_single_page(img, target_width)
            else:
                # The document contains more than one image.
                result = _jpeg_from_stitched_pages(
                    img, target_width, image_page_num)
    except Exception as e:
        # Deliberately broad: this function's contract is "bytes or False".
        print(e)
        result = False
    return result