예제 #1
0
# print datetime.now() - start_time

from wand.image import Image
from PIL import Image as PI
import sys
import os
from pyocr import pyocr
from pyocr import builders
import io
#TESSERACT_CMD = os.environ["TESSDATA_PREFIX"] + os.sep + 'tesseract.exe' if os.name == 'nt' else 'tesseract'

# Pick the first OCR backend pyocr can find; exit with a clear message
# instead of an IndexError when none is installed.
available_tools = pyocr.get_available_tools()
if not available_tools:
    sys.exit("No OCR tool found")
tool = available_tools[0]
print(tool)

# `lang` is the full list of languages the backend reports.
# image_to_string() takes ONE language string, so a single entry is chosen
# here: English when present, otherwise the first one reported.
lang = tool.get_available_languages()
print(lang)
if not lang:
    sys.exit("No OCR language data found")
ocr_lang = "eng" if "eng" in lang else lang[0]

req_image = []   # JPEG-encoded bytes, one entry per PDF page
final_text = []  # recognized text, one entry per PDF page

# wand's `file=` keyword expects an open file object; a path must be passed
# as `filename=`. The `with` blocks release the ImageMagick resources as
# soon as every page has been turned into an in-memory JPEG blob.
with Image(filename="test_pf.pdf", resolution=300) as image_pdf:
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

# Run OCR page by page; PIL reads each JPEG blob straight from memory.
for img in req_image:
    txt = tool.image_to_string(PI.open(io.BytesIO(img)),
                               lang=ocr_lang,
                               builder=builders.TextBuilder())
    final_text.append(txt)
예제 #2
0
def convert(file_name, target_width=1500):
    """Render a document into a single JPEG blob suitable for OCR upload.

    The file is opened with wand. A single-page document is scaled and, if
    needed, cropped directly. A multi-page document has its pages scaled and
    stacked vertically onto one grayscale canvas, which is then cropped and
    JPEG-encoded.

    Args:
        file_name: Path handed to wand's ``Image(filename=...)``.
        target_width: Requested output width. The final width and height
            are whatever ``_get_one_info`` / ``_get_more_info`` return for
            it (those helpers are defined elsewhere in the file).

    Returns:
        The JPEG-encoded image as a byte string, or ``False`` if anything
        went wrong (the exception is printed, not raised).
    """
    # Local import so the block does not depend on the Python-2-only
    # ``StringIO`` module; io.BytesIO works for byte blobs on 2.6+ and 3.x.
    import io

    # Defined up front so the function never hits an unbound `result`.
    result = False
    try:
        with Image(filename=file_name) as img:
            image_page_num = len(img.sequence)
            if image_page_num == 1:
                # Only one page/image in the document.
                # Work out the final width and height.
                target_width, target_height = _get_one_info(
                    target_width, img.width, img.height)
                # Scale; the original author notes the docs describe
                # sample() as faster than resize().
                img.sample(target_width, target_height)

                # Crop when the result is taller than the Baidu OCR limit.
                if target_height > bai_du_ocr_max:
                    img.crop(0, 0, target_width, bai_du_ocr_max)

                # NOTE: local binarization was tried here previously and
                # dropped; overall it was no faster than sending the image
                # to Baidu as-is.
                result = img.make_blob('jpg')
            else:
                # More than one page: get the final size and the number of
                # pages to stitch together.
                target_width, target_height, page_num = _get_more_info(
                    target_width, img.width, img.height, image_page_num)
                # Background canvas to paste pages onto. Mode 'L' was
                # measured by the original author to be faster than 'RGB'.
                paste_image = PImage.new('L', (target_width, target_height))
                # Height of one page slot; identical for every page, so it
                # is computed once. A page_num of 0 raises here and is
                # reported through the except branch below.
                one_img_height = int(target_height / page_num)

                for i in range(page_num):
                    # Close each per-page wand image as soon as its JPEG
                    # bytes have been extracted.
                    with Image(image=img.sequence[i]) as image:
                        image.sample(target_width, one_img_height)
                        page_blob = image.make_blob('jpg')
                    # Convert the wand page into a PIL image and paste it
                    # into its slot on the canvas.
                    pasted_image = PImage.open(io.BytesIO(page_blob))
                    paste_image.paste(pasted_image, (0, one_img_height * i))

                # Crop and encode once, after all pages are in place,
                # instead of re-encoding the whole canvas for every page.
                if target_height > bai_du_ocr_max:
                    paste_image = paste_image.crop(
                        (0, 0, target_width, bai_du_ocr_max))
                buf = io.BytesIO()
                # PIL's format name is 'JPEG', not 'JPG'.
                paste_image.save(buf, 'JPEG')
                result = buf.getvalue()
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # False rather than propagating the exception to the caller.
        print(e)
        result = False
    return result