Beispiel #1
0
def pdf_page_to_png(
    src_pdf,
    pagenum=0,
    resolution=72,
):
    """
    Return the specified PDF page as a PNG ``wand.image.Image``.

    The requested page is copied into a single-page PDF held in memory,
    which is then rasterized by wand at the requested resolution.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take (passed straight to
        ``src_pdf.getPage``).
    :param int resolution: Resolution for resulting png in DPI.
    :returns: A new image whose format is PNG. The caller owns it and
        should close it when done.
    """
    # Build a one-page PDF so that only the requested page is rasterized.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)  # rewind so the reader starts at the PDF header

    # The stream has to be passed as ``file=``: the first positional
    # parameter of wand's Image is ``image``, which only accepts another
    # wand Image and rejects a byte stream with a TypeError.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        # convert() leaves the source untouched and returns a *new* image
        # in the requested format, so its result is what must be returned.
        # The intermediate PDF image is released when the block exits.
        return pdf_img.convert("png")
Beispiel #2
0
# print datetime.now() - start_time

from wand.image import Image
from PIL import Image as PI
import sys
import os
from pyocr import pyocr
from pyocr import builders
import io
#TESSERACT_CMD = os.environ["TESSDATA_PREFIX"] + os.sep + 'tesseract.exe' if os.name == 'nt' else 'tesseract'

# Pick the first OCR backend pyocr can find on this machine.
available_tools = pyocr.get_available_tools()
if not available_tools:
    # Fail with a readable message instead of an IndexError.
    sys.exit("No OCR tool found")
tool = available_tools[0]
print(tool)

# image_to_string() takes a single language code, so keep the full list
# only for display and pass the first entry on.
available_langs = tool.get_available_languages()
print(available_langs)
lang = available_langs[0]

req_image = []   # one JPEG blob per PDF page
final_text = []  # recognized text, one entry per page

# ``filename=`` is the parameter for a path; ``file=`` expects an open,
# readable file object. The ``with`` blocks release the wand resources
# once the page blobs have been extracted.
with Image(filename="test_pf.pdf", resolution=300) as image_pdf:
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

for img in req_image:
    # Use the ``builders`` module imported at the top of the file rather
    # than relying on it being reachable as an attribute of ``pyocr``.
    txt = tool.image_to_string(PI.open(io.BytesIO(img)),
                               lang=lang,
                               builder=builders.TextBuilder())
    final_text.append(txt)