def pdf_page_to_png(
    src_pdf,
    pagenum=0,
    resolution=72,
):
    """
    Return the specified PDF page as a PNG-format wand.image.Image.

    The requested page is copied into a one-page in-memory PDF, which is
    then rasterized by wand at the given resolution.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take (passed to ``src_pdf.getPage``).
    :param int resolution: Resolution for resulting png in DPI.
    :returns: A new wand.image.Image whose format is PNG.
    """
    # Build a temporary single-page PDF holding only the requested page.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    # Serialize it to memory and rewind so wand reads from the start.
    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # The stream must be passed as ``file=``; the first positional parameter
    # of wand's Image is ``image`` (an existing Image), not a file object.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        # convert() returns a *new* image and leaves the original untouched,
        # so its result is what must be returned. The intermediate PDF image
        # is released when the ``with`` block exits.
        return pdf_img.convert("png")
import io
import os
import sys

from PIL import Image as PI
from pyocr import builders
from pyocr import pyocr
from wand.image import Image

# Windows hint kept from the original script (disabled):
# TESSERACT_CMD = os.environ["TESSDATA_PREFIX"] + os.sep + 'tesseract.exe' if os.name == 'nt' else 'tesseract'

# Use the first OCR backend pyocr can find; exit with a readable message
# instead of an IndexError when none is installed.
available_tools = pyocr.get_available_tools()
if not available_tools:
    sys.exit("No OCR tool found")
tool = available_tools[0]
print(tool)

# ``lang`` is the full list of languages the backend reports.
lang = tool.get_available_languages()
print(lang)
if not lang:
    sys.exit("No OCR language available")
# image_to_string() takes a single language string, not the whole list.
# NOTE(review): the first reported language is used; pick a specific one
# (e.g. "eng") if the first entry is not the desired language.
ocr_lang = lang[0]

req_image = []   # one JPEG blob per PDF page
final_text = []  # recognized text per page, in page order

# ``filename=`` takes a path; ``file=`` would require an open file object.
with Image(filename="test_pf.pdf", resolution=300) as image_pdf:
    # convert() returns a new image, so it is closed separately.
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            # Wrap each page frame in its own Image to serialize it alone.
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

for img in req_image:
    # ``builders`` is imported directly above; use it rather than relying on
    # a ``builders`` attribute of the ``pyocr.pyocr`` submodule.
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=ocr_lang,
        builder=builders.TextBuilder(),
    )
    final_text.append(txt)