def select_image_layer(infiles, output_file, log, context):
    """Produce the visible image layer for one output page.

    Two inputs are picked out of ``infiles`` by suffix: the
    ``.ocr.oriented.pdf`` page and the ``.image`` raster. When the options
    request lossless reconstruction, ``output_file`` is linked to the oriented
    PDF page; otherwise the raster is converted into a one-page PDF written to
    ``output_file``.

    Raises ``StopIteration`` if either expected input is missing.
    """
    options = context.get_options()
    page_pdf = next(
        path for path in infiles if path.endswith('.ocr.oriented.pdf')
    )
    raster_path = next(path for path in infiles if path.endswith('.image'))

    if options.lossless_reconstruction:
        log.debug(
            f"{page_number(page_pdf):4d}: page eligible for lossless reconstruction"
        )
        # Still points to multipage
        re_symlink(page_pdf, output_file, log)
    else:
        # Pages are rasterized at a square DPI, since most image processing
        # tools cannot handle rectangular DPI, and the square value is what
        # accurately describes the image. Resampling back to a non-square DPI
        # here would track the input more closely, but the hocr renderer does
        # not understand non-square DPI (the sandwich renderer would be fine).
        square_dpi = get_page_square_dpi(
            get_pageinfo(raster_path, context), options
        )
        fixed_layout = img2pdf.get_fixed_dpi_layout_fun(
            (square_dpi, square_dpi)
        )

        # This creates a single page PDF
        with open(raster_path, 'rb') as src:
            with open(output_file, 'wb') as dst:
                log.debug(f'{page_number(page_pdf):4d}: convert')
                img2pdf.convert(
                    src,
                    with_pdfrw=False,
                    layout_fun=fixed_layout,
                    outputstream=dst,
                )
                log.debug(f'{page_number(page_pdf):4d}: convert done')
def triage_image_file(input_file, output_file, log, options):
    """Check that a non-PDF input is a usable image and convert it to PDF.

    The image is opened, its DPI metadata and ICC/colour mode are checked,
    and it is then converted with img2pdf into ``output_file``.

    Args:
        input_file: path of the candidate image.
        output_file: path the converted PDF is written to.
        log: logger used for progress and error messages.
        options: settings object; ``options.image_dpi`` overrides or supplies
            the image resolution.

    Raises:
        UnsupportedImageFormatError: the file cannot be opened as an image,
            is a CMYK image without an ICC profile, or img2pdf cannot open it.
        DpiError: the image has no DPI metadata, or a DPI at or below
            (96, 96), and ``options.image_dpi`` was not given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)
        # Recover the original filename: a symlink is reported by its target,
        # a regular file as '<stdin>' (presumably a copy of piped input --
        # verify against the caller), anything else as an empty string.
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        raise UnsupportedImageFormatError() from e

    # The context manager closes the image on every exit path, including the
    # rejections below that raise before the end of the block.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi.")
            raise DpiError()

        if 'iccprofile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                # Fatal condition: report it at error level like the other
                # rejections so it is not hidden at default verbosity.
                log.error('Input CMYK image has no ICC profile, not usable')
                raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
def select_image_layer(infiles, output_file, log, context):
    """Produce the visible image layer for one output page.

    The ``.ocr.oriented.pdf`` page and the ``.image`` raster are located in
    ``infiles`` by suffix. With lossless reconstruction enabled in the
    options, ``output_file`` is linked to the oriented PDF page. Otherwise
    the raster is converted to a PDF at a fixed square DPI and written to
    ``output_file``.

    Raises ``StopIteration`` if either expected input is missing.
    """
    options = context.get_options()
    page_pdf = next(
        name for name in infiles if name.endswith('.ocr.oriented.pdf')
    )
    raster_path = next(name for name in infiles if name.endswith('.image'))

    if options.lossless_reconstruction:
        message = "{:4d}: page eligible for lossless reconstruction".format(
            page_number(page_pdf))
        log.debug(message)
        re_symlink(page_pdf, output_file, log)
        return

    # Pages are rasterized at a square DPI, since most image processing tools
    # cannot handle rectangular DPI, and the square value is what accurately
    # describes the image. Resampling back to a non-square DPI here would
    # track the input more closely, but the hocr renderer does not understand
    # non-square DPI (the tess4 renderer would be fine).
    square_dpi = get_page_square_dpi(
        get_pageinfo(raster_path, context), options)
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))

    with open(raster_path, 'rb') as src:
        with open(output_file, 'wb') as dst:
            log.debug('{:4d}: convert'.format(page_number(page_pdf)))
            img2pdf.convert(
                src,
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=dst)
            log.debug('{:4d}: convert done'.format(page_number(page_pdf)))
def select_image_layer(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Produce the visible image layer for one output page.

    The ``.ocr.oriented.pdf`` page and the ``.image`` raster are located in
    ``infiles`` by suffix. When the ``lossless_reconstruction`` flag (a name
    from the enclosing scope) is set, ``output_file`` is linked to the
    oriented PDF page. Otherwise the raster is converted to a PDF at the
    page's DPI and written to ``output_file``.

    Raises ``StopIteration`` if either expected input is missing.
    """
    page_pdf = next(
        name for name in infiles if name.endswith('.ocr.oriented.pdf'))
    raster_path = next(name for name in infiles if name.endswith('.image'))

    if lossless_reconstruction:
        message = "{:4d}: page eligible for lossless reconstruction".format(
            page_number(page_pdf))
        log.debug(message)
        re_symlink(page_pdf, output_file)
        return

    page_dpi = get_page_dpi(get_pageinfo(raster_path, pdfinfo, pdfinfo_lock))
    # img2pdf is handed plain floats for the horizontal and vertical DPI.
    dpi_pair = (float(page_dpi[0]), float(page_dpi[1]))
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun(dpi_pair)

    with open(raster_path, 'rb') as src:
        with open(output_file, 'wb') as dst:
            # The whole raster is read into memory and passed as bytes.
            img2pdf.convert(
                src.read(),
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=dst)
def triage_image_file(input_file, output_file, log):
    """Check that a non-PDF input is a usable image and convert it to PDF.

    The image is opened, its DPI metadata and ICC/colour mode are checked,
    and it is then converted with img2pdf into ``output_file``. Settings are
    read from ``options``, a name taken from the enclosing scope;
    ``options.image_dpi`` overrides or supplies the image resolution.

    Args:
        input_file: path of the candidate image.
        output_file: path the converted PDF is written to.
        log: logger used for progress and error messages.

    Raises:
        SystemExit: with ``ExitCode.input_file`` when the file cannot be
            opened as an image, has missing or implausible DPI with no
            ``options.image_dpi``, is CMYK without an ICC profile, or cannot
            be opened by img2pdf.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)
        # Recover the original filename: a symlink is reported by its target,
        # a regular file as '<stdin>' (presumably a copy of piped input --
        # verify against the caller), anything else as an empty string.
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        sys.exit(ExitCode.input_file)

    # The context manager closes the image on every exit path; SystemExit
    # raised by the rejections below still passes through it.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                sys.exit(ExitCode.input_file)
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error("Input file is an image, but has no resolution (DPI) "
                      "in its metadata.  Estimate the resolution at which "
                      "image was scanned and specify it using --image-dpi.")
            sys.exit(ExitCode.input_file)

        if 'iccprofile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                # Fatal condition: report it at error level like the other
                # rejections so it is not hidden at default verbosity.
                log.error('Input CMYK image has no ICC profile, not usable')
                sys.exit(ExitCode.input_file)

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(input_file,
                            layout_fun=layout_fun,
                            with_pdfrw=False,
                            outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
def triage_image_file(input_file, output_file, options, log):
    """Check that a non-PDF input is a usable image and convert it to PDF.

    The image is opened and rejected if its DPI is missing or at or below
    (96, 96) without ``options.image_dpi``, if it has an alpha channel, or if
    it is CMYK without an ICC profile. An accepted image is converted with
    img2pdf into ``output_file``.

    Raises:
        UnsupportedImageFormatError: the file cannot be opened as an image,
            has an alpha channel, is CMYK without an ICC profile, or cannot
            be opened by img2pdf.
        DpiError: the DPI is missing or not credible and no
            ``options.image_dpi`` was given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        # Recover the original filename
        message = str(e).replace(input_file, options.input_file)
        log.error(message)
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")

        # Resolution: either the metadata is credible or the user supplied
        # --image-dpi.
        if 'dpi' not in im.info:
            if not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.error(
                    "Input file is an image, but has no resolution (DPI) "
                    "in its metadata.  Estimate the resolution at which "
                    "image was scanned and specify it using --image-dpi."
                )
                raise DpiError()
        elif im.info['dpi'] <= (96, 96) and not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.info("Image resolution: (%d, %d)" % im.info['dpi'])
            log.error(
                "Input file is an image, but the resolution (DPI) is "
                "not credible.  Estimate the resolution at which the "
                "image was scanned and specify it using --image-dpi."
            )
            raise DpiError()

        # Transparency is rejected outright.
        if im.mode in ('RGBA', 'LA'):
            log.error(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )
            raise UnsupportedImageFormatError()

        # Colour profile: RGB is tolerated without one, CMYK is not.
        lacks_icc = 'iccprofile' not in im.info
        if lacks_icc and im.mode == 'RGB':
            log.info("Input image has no ICC profile, assuming sRGB")
        elif lacks_icc and im.mode == 'CMYK':
            log.error("Input CMYK image has no ICC profile, not usable")
            raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        chosen_layout = (
            img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi)
            )
            if options.image_dpi
            else img2pdf.default_layout_fun
        )
        with open(output_file, 'wb') as pdf_out:
            img2pdf.convert(
                input_file,
                layout_fun=chosen_layout,
                with_pdfrw=False,
                outputstream=pdf_out,
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
def triage_image_file(input_file, output_file, log):
    """Check that a non-PDF input is a usable image and convert it to PDF.

    The image is opened, its DPI metadata and ICC/colour mode are checked,
    and it is then converted with img2pdf into ``output_file``. Settings are
    read from ``options``, a name taken from the enclosing scope;
    ``options.image_dpi`` overrides or supplies the image resolution.

    Args:
        input_file: path of the candidate image.
        output_file: path the converted PDF is written to.
        log: logger used for progress and error messages.

    Raises:
        SystemExit: with ``ExitCode.input_file`` when the file cannot be
            opened as an image, has missing or implausible DPI with no
            ``options.image_dpi``, is CMYK without an ICC profile, or cannot
            be opened by img2pdf.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)

    # The context manager closes the image on every exit path; SystemExit
    # raised by the rejections below still passes through it.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                sys.exit(ExitCode.input_file)
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi.")
            sys.exit(ExitCode.input_file)

        if 'iccprofile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                # Fatal condition: report it at error level like the other
                # rejections so it is not hidden at default verbosity.
                log.error('Input CMYK image has no ICC profile, not usable')
                sys.exit(ExitCode.input_file)

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
def create_pdf_page_from_image(image, page_context):
    """Convert the raster file ``image`` into a PDF and return its path.

    The destination is the page context's ``'visible.pdf'`` path, and the
    layout uses the page's square DPI for both axes.
    """
    # Pages are rasterized at a square DPI, since most image processing tools
    # cannot handle rectangular DPI, and the square value is what accurately
    # describes the image. Resampling back to a non-square DPI here would
    # track the input more closely, but the hocr renderer does not understand
    # non-square DPI (the sandwich renderer would be fine).
    visible_pdf = page_context.get_path('visible.pdf')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))

    # This creates a single page PDF
    with open(image, 'rb') as src:
        with open(visible_pdf, 'wb') as dst:
            page_context.log.debug('convert')
            img2pdf.convert(
                src,
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=dst,
            )
            page_context.log.debug('convert done')

    return visible_pdf
def make_rotate_test(prefix, image_angle, page_angle):
    """Build a rotated test PDF from ``typewriter.png`` and return its path.

    The resource image is transposed with ``Image.ROTATE_<n>``, where
    ``n = -image_angle % 360`` (skipped when ``image_angle`` is 0), and
    converted to a PDF at a fixed 200x200 DPI. The first page's ``Rotate``
    entry is then set to ``page_angle`` and the file is saved under
    ``outdir`` as ``<prefix>_<image_angle>_<page_angle>.pdf``.
    """
    picture = Image.open(fspath(resources / 'typewriter.png'))
    if image_angle != 0:
        ccw_angle = (-image_angle) % 360
        transpose_op = getattr(Image, 'ROTATE_{}'.format(ccw_angle))
        picture = picture.transpose(transpose_op)

    # Re-encode the (possibly rotated) image as PNG bytes in memory.
    png_buffer = BytesIO()
    picture.save(png_buffer, format='PNG')
    png_bytes = png_buffer.getvalue()

    pdf_buffer = BytesIO()
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((200, 200))
    img2pdf.convert(
        png_bytes, layout_fun=fixed_layout, outputstream=pdf_buffer)
    pdf_buffer.seek(0)

    document = pikepdf.open(pdf_buffer)
    document.pages[0].Rotate = page_angle
    filename = '{}_{}_{}.pdf'.format(prefix, image_angle, page_angle)
    target = outdir / filename
    document.save(target)
    return target
def make_rotate_test(prefix, image_angle, page_angle):
    """Build a rotated test PDF from ``typewriter.png`` and return its path.

    The resource image is transposed with ``Image.ROTATE_<n>``, where
    ``n = -image_angle % 360`` (skipped when ``image_angle`` is 0), and
    converted to a PDF at a fixed 200x200 DPI. The first page's ``Rotate``
    entry is then set to ``page_angle`` and the file is saved under
    ``outdir`` as ``<prefix>_<image_angle>_<page_angle>.pdf``.
    """
    picture = Image.open(fspath(resources / 'typewriter.png'))
    if image_angle != 0:
        ccw_angle = (-image_angle) % 360
        transpose_op = getattr(Image, f'ROTATE_{ccw_angle}')
        picture = picture.transpose(transpose_op)

    # Re-encode the (possibly rotated) image as PNG bytes in memory.
    png_buffer = BytesIO()
    picture.save(png_buffer, format='PNG')
    png_bytes = png_buffer.getvalue()

    pdf_buffer = BytesIO()
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((200, 200))
    img2pdf.convert(
        png_bytes,
        layout_fun=fixed_layout,
        outputstream=pdf_buffer,
    )
    pdf_buffer.seek(0)

    document = pikepdf.open(pdf_buffer)
    document.pages[0].Rotate = page_angle
    target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
    document.save(target)
    return target
def select_image_layer(infiles, output_file, log, pdfinfo, pdfinfo_lock):
    """Produce the visible image layer for one output page.

    The ``.ocr.oriented.pdf`` page and the ``.image`` raster are located in
    ``infiles`` by suffix. When the ``lossless_reconstruction`` flag (a name
    from the enclosing scope) is set, ``output_file`` is linked to the
    oriented PDF page. Otherwise the raster is converted to a PDF at the
    page's DPI and written to ``output_file``.

    Raises ``StopIteration`` if either expected input is missing.
    """
    page_pdf = next(
        path for path in infiles if path.endswith('.ocr.oriented.pdf'))
    raster_path = next(path for path in infiles if path.endswith('.image'))

    if lossless_reconstruction:
        message = "{:4d}: page eligible for lossless reconstruction".format(
            page_number(page_pdf))
        log.debug(message)
        re_symlink(page_pdf, output_file)
        return

    info = get_pageinfo(raster_path, pdfinfo, pdfinfo_lock)
    measured = get_page_dpi(info)
    # img2pdf is handed plain floats for the horizontal and vertical DPI.
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun(
        (float(measured[0]), float(measured[1])))

    with open(raster_path, 'rb') as src:
        with open(output_file, 'wb') as dst:
            # The whole raster is read into memory and passed as bytes.
            payload = src.read()
            img2pdf.convert(
                payload,
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=dst)
elif len( response.content ) < 1024: # 图书馆如果页码不存在,也会响应 HTTP 200,并返回一个 142 Bytes 的 content print(pic_name + ' 超出页码范围') pic_path_list.remove(pic_full_path) PAGE_TYPE[type_i][2] = i - 1 # 设置页码数,合并 pdf 时用 break else: with open(pic_full_path, 'wb') as f: f.write(response.content) # 写入图片 print(pic_name + ' 下载成功') # 将图片合成为 pdf print('开始生成 PDF……') with open(PDF_PATH, "wb+") as f: # w+ 直接覆盖 layout_fun = img2pdf.get_fixed_dpi_layout_fun( (300, 300)) # 固定 DPI,避免封面、正文 DPI 不同导致尺寸不同 f.write(img2pdf.convert(pic_path_list, layout_fun=layout_fun)) # 为 PDF 添加简单的书签 # from PyPDF2 import PdfFileReader, PdfFileWriter # def _get_parent_bookmark(current_indent, history_indent, bookmarks): # '''The parent of A is the nearest bookmark whose indent is smaller than A's # ''' # assert len(history_indent) == len(bookmarks) # if current_indent == 0: # return None # for i in range(len(history_indent) - 1, -1, -1): # # len(history_indent) - 1 ===> 0 # if history_indent[i] < current_indent: # return bookmarks[i] # return None