# USAGE: ./add_on_page.py $in_filepath $out_filepath
# Inspired by https://github.com/pmaupin/pdfrw/blob/master/examples/watermark.py
import sys

from fpdf import FPDF
from pdfrw import PageMerge, PdfReader, PdfWriter

IN_FILEPATH = sys.argv[1]
OUT_FILEPATH = sys.argv[2]
ON_PAGE_INDEX = 1
# When True the new content is painted first, i.e. placed underneath the page.
UNDERNEATH = False


def new_content():
    """Build a one-page overlay PDF and return it as a pdfrw page object."""
    overlay = FPDF()
    overlay.add_page()
    overlay.set_font("helvetica", size=36)
    overlay.text(50, 50, "Hello!")
    return PdfReader(fdata=bytes(overlay.output())).pages[0]


writer = PdfWriter(trailer=PdfReader(IN_FILEPATH))
merger = PageMerge(writer.pagearray[ON_PAGE_INDEX])
merger.add(new_content(), prepend=UNDERNEATH).render()
writer.write(OUT_FILEPATH)
import glob

from pdfrw import PdfReader, PdfWriter

# Split every PDF in the current directory into three single-page files
# (pages 1, 2 and 3), each written under out/.
for source in glob.glob("*.pdf"):
    source_pages = PdfReader(source).pages
    for start, stop in [(1, 2), (2, 3), (3, 4)]:
        writer = PdfWriter(f'out/{source}_{start}.pdf')
        for page_no in range(start, stop):
            # Page numbers are 1-based; the pages list is 0-based.
            writer.addpage(source_pages[page_no - 1])
        writer.write()
# https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
import os

from pdfrw import PdfReader, PdfWriter

# Title written into the /Info dictionary of every PDF.
title = 'Product Sheet'

# Rewrite every file found in ./files with its Info.Title set, emitting the
# result under ./out with the same file name.
#
# Fixed: the original concatenated paths with hard-coded backslashes
# ('files\\' + file), which only works on Windows; os.path.join builds a
# correct path on every platform (and still produces '\\' on Windows).
files = os.listdir('files')
for file in files:
    trailer = PdfReader(os.path.join('files', file))
    trailer.Info.Title = title
    PdfWriter(os.path.join('out', file), trailer=trailer).write()
def write_async(self, outfile, process_semaphore, progress_cb=None):
    """Assemble all pages into a tagged PDF and write it to *outfile*.

    Generator-based asyncio coroutine (pre-3.5 ``yield from`` style).
    Pages are rendered concurrently via ``asyncio.gather``; the finished
    document is then post-processed with qpdf in a temporary directory.

    :param outfile: destination path of the final PDF
    :param process_semaphore: semaphore forwarded to subprocess helpers
    :param progress_cb: optional callable; receives a float in [0, 1]
    """
    pdf_writer = PdfWriter(version="1.5")

    # Transparency group placed on every page (shared, indirect object).
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency

    # Font resource mapping /F1 -> the single document font.
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()

    # Create one (still empty) page dict per logical page up front so that
    # internal links can reference target pages before they are rendered.
    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes an internal copy of the pages;
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)

    # Embed an sRGB ICC profile and use it as the default RGB colorspace.
    srgb_colorspace = PdfDict()
    srgb_colorspace.indirect = True
    srgb_colorspace.N = 3  # Number of components (red, green, blue)
    with open(SRGB_ICC_FILENAME, "rb") as f:
        srgb_colorspace_stream = f.read()
    srgb_colorspace.Filter = [PdfName.FlateDecode]
    srgb_colorspace.stream = zlib.compress(
        srgb_colorspace_stream, 9).decode("latin-1")
    srgb_colorspace.Length1 = len(srgb_colorspace_stream)
    default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace])
    default_rgb_colorspace.indirect = True

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # A foreground with an explicit color needs no mask image.
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem),
                get_pdf_background(psem),
                asyncio.gather(
                    *[fg.pdf_image(psem) for fg in page.foreground]),
                asyncio.gather(
                    *[get_pdf_mask(fg, psem) for fg in page.foreground])))

        pdf_page.MediaBox = PdfArray(
            [0, 0, PdfNumber(page.width), PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        pdf_colorspace = PdfDict()
        pdf_colorspace.DefaultRGB = default_rgb_colorspace
        pdf_resources.ColorSpace = pdf_colorspace
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        im_index = 0

        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" +
                           "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        current_color = None

        # Solid page background color, if any (drawn as a unit rect).
        if page.color != self._factory.WHITE:
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            graphics += ("0 0 1 1 re " +
                         "f\n")
        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            # Only emit a color operator when the color actually changes.
            if (foreground.color is not None and
                    current_color != foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics + graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None

        # Invisible text layer (Tr 3 = no visible rendering) for search/copy.
        before_text = ("BT\n" +
                       "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(),
                    PdfString().from_bytes(
                        t.text.encode("utf-16-be"),
                        bytes_encoding="hex"))
            if t.external_link is not None or t.internal_link is not None:
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [
                    PdfNumber(t.x), PdfNumber(t.y),
                    PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)
                ]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page_index, (x, y)) — resolved against
                    # the shared pdf_pages list created above.
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page, PdfName.XYZ,
                        PdfNumber(target_x), PdfNumber(target_y), 0
                    ]
        text = text.rstrip(" \n")
        if text:
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text + text + after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                pdf_contents.Filter = [PdfName.FlateDecode]
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"), 9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            progress_cb(finished_pages / len(self._pages))

    finished_pages = 0
    yield from asyncio.gather(*[
        make_page(page, pdf_page, process_semaphore)
        for page, pdf_page in zip(self._pages, pdf_pages)
    ])

    trailer = pdf_writer.trailer
    # Random but stable-within-file document ID pair.
    document_id = PdfString().from_bytes(os.urandom(16))
    trailer.ID = [document_id, document_id]
    # Tagged-PDF markers required for PDF/A-2a conformance.
    mark_info = PdfDict()
    mark_info.Marked = PdfBool(True)
    trailer.Root.MarkInfo = mark_info
    struct_tree_root = PdfDict()
    struct_tree_root.Type = PdfName.StructTreeRoot
    trailer.Root.StructTreeRoot = struct_tree_root
    # XMP metadata stream declaring PDF/A-2 conformance level A.
    metadata = PdfDict()
    metadata.indirect = True
    metadata.Type = PdfName.Metadata
    metadata.Subtype = PdfName.XML
    xmp = XMPMeta()
    xmp.set_property(XMP_NS_PDFA_ID, "part", "2")
    xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A")
    metadata_stream = xmp.serialize_to_str().encode("utf-8")
    metadata.Filter = [PdfName.FlateDecode]
    metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1")
    metadata.Length1 = len(metadata_stream)
    trailer.Root.Metadata = metadata

    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        # Normalize the output with qpdf without re-encoding streams.
        cmd = [
            QPDF_CMD, "--stream-data=preserve", "--object-streams=preserve",
            "--normalize-content=n", "--newline-before-endstream"
        ]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([
            path.abspath(path.join(temp_dir, "temp.pdf")),
            path.abspath(outfile)
        ])
        yield from run_command_async(cmd, process_semaphore)
'''
usage: print_two.py my.pdf

Creates print_two.my.pdf

This is only useful when you can cut down sheets of paper to make two
small documents.  Works for double-sided only right now.
'''
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

# Shared page counter.  The original hid this state in a mutable default
# argument (count=[0]) — a classic Python pitfall; a module-level list
# makes the shared state explicit while preserving the exact behavior.
_page_count = [0]


def fixpage(page, count=None):
    """Return a new page containing *page* twice, side by side.

    Odd and even source pages get opposite rotations so the output prints
    correctly double-sided.  *count* may be a single-element list used as
    the page counter; it defaults to the shared module-level counter.
    """
    if count is None:
        count = _page_count
    count[0] += 1
    oddpage = (count[0] & 1)
    result = PageMerge()
    for rotation in (180 + 180 * oddpage, 180 * oddpage):
        result.add(page, rotate=rotation)
    # Place the second copy to the right of the first.
    result[1].x = result[0].w
    return result.render()


inpfn, = sys.argv[1:]
outfn = 'print_two.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
PdfWriter(outfn).addpages(fixpage(x) for x in pages).write()
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=0, scale=1):
    """Crop *margin* points off each side of *page* and scale the result."""
    x1, y1, x2, y2 = PageMerge().add(page).xobj_box
    view = (margin, margin,
            x2 - x1 - 2 * margin,
            y2 - y1 - 2 * margin)
    merged = PageMerge().add(page, viewrect=view)
    merged[0].scale(scale)
    return merged.render()


inpfn = 'F:page-number.pdf'
outfn = 'F:poster.' + os.path.basename(inpfn)
source = PdfReader(inpfn)
out = PdfWriter(outfn)
out.addpage(adjust(source.pages[0]))
# Carry the source document's Info dictionary over to the output.
out.trailer.Info = IndirectPdfDict(source.Info or {})
out.write()
def get(self, format: str, path: str):
    """Handle the GET method call.

    Converts one or more notebooks (dot-ipynb-separated in *path*) to PDF
    with nbconvert, merges the PDFs with pdfrw and streams the result.
    Only format == 'pdf' is supported; anything else is a 500.
    """
    if format != 'pdf':
        self.log.exception('format must be pdf')
        raise web.HTTPError(500, 'format must be pdf')

    # Configure the exporter with the thermohw template/filters.
    self.config.PDFExporter.preprocessors = [
        thermohw.ExtractAttachmentsPreprocessor]
    self.config.PDFExporter.template_file = os.path.join(
        thermohw_dir, 'homework.tpl')
    self.config.PDFExporter.filters = {
        'convert_div': thermohw.convert_div,
        'convert_raw_html': thermohw.convert_raw_html}
    self.config.PDFExporter.latex_count = 1
    exporter = PDFExporter(config=self.config, log=self.log)
    exporter.writer.build_directory = '.'

    pdfs = []
    # *path* may name several notebooks joined by their '.ipynb' suffix;
    # split and re-append the suffix per entry.
    path = path.strip('/').strip()
    paths = path.split('.ipynb')
    for path in paths:
        if not path:
            continue
        path += '.ipynb'
        # If the notebook relates to a real file (default contents manager),
        # give its path to nbconvert.
        ext_resources_dir: Union[str, None]
        basename: str
        os_path: str
        if hasattr(self.contents_manager, '_get_os_path'):
            os_path = self.contents_manager._get_os_path(path)
            ext_resources_dir, basename = os.path.split(os_path)
        else:
            ext_resources_dir = None
        model: Dict[str, str] = self.contents_manager.get(path=path)
        name: str = model['name']
        if model['type'] != 'notebook':
            # not a notebook, redirect to files
            return FilesRedirectHandler.redirect_to_files(self, path)
        nb = model['content']
        self.set_header('Last-Modified', model['last_modified'])

        # create resources dictionary
        mod_date: str = model['last_modified'].strftime(text.date_format)
        nb_title: str = os.path.splitext(name)[0]
        config_dir: str = self.application.settings['config_dir']
        resource_dict: Dict[str, str] = {
            "metadata": {
                "name": nb_title,
                "modified_date": mod_date
            },
            "config_dir": config_dir,
        }
        if ext_resources_dir:
            resource_dict['metadata']['path'] = ext_resources_dir

        output: bytes
        try:
            output, _ = exporter.from_notebook_node(
                nb, resources=resource_dict
            )
        except Exception as e:
            self.log.exception("nbconvert failed: %s", e)
            raise web.HTTPError(500, "nbconvert failed: %s" % e)
        pdfs.append(io.BytesIO(output))

    # Merge the individual notebook PDFs into a single document in memory.
    writer = PdfWriter()
    for pdf in pdfs:
        writer.addpages(PdfReader(pdf).pages)
    bio = io.BytesIO()
    writer.write(bio)
    bio.seek(0)
    output = bio.read()
    bio.close()

    # Force download if requested
    if self.get_argument('download', 'false').lower() == 'true':
        filename = 'final_output.pdf'
        self.set_header('Content-Disposition',
                        'attachment; filename="{}"'.format(filename))
    # MIME type
    if exporter.output_mimetype:
        self.set_header('Content-Type',
                        '{}; charset=utf-8'.format(exporter.output_mimetype))
    self.set_header('Cache-Control',
                    'no-store, no-cache, must-revalidate, max-age=0')
    self.finish(output)
def go(inpfn, outfn):
    """Read *inpfn*, pack its pages via get4, and write to *outfn*."""
    remaining = PdfReader(inpfn).pages
    writer = PdfWriter()
    # NOTE(review): the loop relies on get4 consuming entries from the
    # `remaining` list on each call so the loop terminates — confirm
    # against get4's definition elsewhere in this file.
    while remaining:
        writer.addpage(get4(remaining))
    writer.write(outfn)
if args.path:
    path = args.path
    if args.verbose:
        print("Searching {} for PDF files.\n".format(path))

    # Collect the full path of every PDF directly under `path`.
    pdf_paths = []
    for candidate in glob(path + "/*.pdf"):
        if args.verbose:
            print("Found {}".format(candidate))
        pdf_paths.append(candidate)

    # Merge in 'natural' filename order (file2 before file10).
    combined = PdfWriter()
    for pdf in natsorted(pdf_paths):
        reader = PdfReader(pdf)
        if args.verbose:
            print("Adding {} pages from {} to the combined file.".format(
                reader.numPages, pdf))
        combined.addpages(reader.pages)
    combined.write(combinedFile)

    if args.verbose:
        merged = PdfReader(combinedFile)
        print("\nCombined file created at {} with a total of {} pages.".format(
            combinedFile, merged.numPages))
def __init__(self, *args):
    """Reverse the pages of args[0]; write to args[1] or a derived name."""
    if not args:
        raise Exception('Need at least a file to slice.')
    # With a single argument the destination is the source name with any
    # '[...]' slice specifier stripped off.
    if len(args) == 1:
        dest = args[0].split('[')[0]
    else:
        dest = args[1]
    source_pages = get_document_pages(args[0])
    PdfWriter().addpages(reversed(source_pages)).write(dest)
# Remove the first two pages (the title sheet) from a PDF.
from pdfrw import PdfReader, PdfWriter

input_file = "example.pdf"
output_file = "example-updated.pdf"

# Reader and writer objects.
reader_input = PdfReader(input_file)
writer_output = PdfWriter()

# Walk the pages, keeping everything past index 1 (i.e. skip pages 1-2).
for index, page in enumerate(reader_input.pages):
    if index > 1:
        writer_output.addpage(page)
        print("adding page %i" % (index + 1))

# Flush the modified document to disk.
writer_output.write(output_file)
--- gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=stamp-test.pdf -dBATCH stamp.pdf prior-post-pro-vel-cropped.pdf pdfnup stamp-test.pdf --nup 2x1 --landscape --outfile stamp-test-side.pdf pdfjam --keepinfo --landscape --trim "100mm 0mm 0mm 0mm" --clip true stamp-test-side.pdf -o test-side-cropped.pdf pdfjam --keepinfo --landscape --trim "0mm 0mm 100mm 0mm" --clip true stamp-test-side.pdf -o stamp-side-cropped.pdf from pdfrw import PdfReader, PdfWriter, PageMerge ipdf = PdfReader('test-side-cropped.pdf') wpdf = PdfReader('stamp-side-cropped.pdf') PageMerge(ipdf.pages[0]).add(wpdf.pages[0]).render() PdfWriter().write('newfile.pdf', ipdf) pdfcrop --margins '-120 5 -140 5' newfile.pdf newfile-cropped.pdf --- # coding: utf-8 from PyPDF2 import PdfFileWriter, PdfFileReader output = PdfFileWriter() ipdf = PdfFileReader(open('prior-post-pro-vel-cropped.pdf', 'rb')) wpdf = PdfFileReader(open('prior-posterior-stamp.pdf', 'rb')) watermark = wpdf.getPage(0) page = ipdf.getPage(0) page.mergePage(watermark)
def merge_attachment(self):
    """Merge shipping-label attachments of the selected pickings into one
    PDF, crop it per carrier, and return an act_window on the result.

    NOTE(review): the nested get4/get4_fedex helpers close over the loop
    variable `pick` from the first loop below, so they see its value from
    the *last* iteration — confirm this is intended.
    """
    filename = 'Print Shipping Labels.pdf'
    picking_obj = self.env['stock.picking']
    picking = picking_obj.browse(self._context.get('active_ids'))
    lst = []
    writer = PdfWriter()
    # Collect the relevant ir.attachment records for every picking.
    for pick in picking:
        pick.shipping_label_print_bool = True
        ship_name = 'Shipping labels' "%s" % pick.name
        if not pick.ship_label_bool:
            attachments = self.env['ir.attachment'].search(
                [('res_id', '=', pick.id)])
        else:
            attachments = self.env['ir.attachment'].search(
                [('res_id', '=', pick.id), ('name', '=', ship_name)])
        for att in attachments:
            lst.append(att)

    # writer = PdfFileWriter()
    # inpfn, = sys.argv[1:]
    # outfn = '4up.' + os.path.basename(inpfn)
    # pages = PdfReader(inpfn).pages

    def get4(srcpages):
        # Scale pages down and (for non-label pickings) tile them 2x2.
        if not pick.ship_label_bool:
            scale = 0.35
            srcpages = PageMerge() + srcpages
            x_increment, y_increment = (scale * i
                                        for i in srcpages.xobj_box[2:])
            for i, page in enumerate(srcpages):
                page.scale(scale)
                page.x = x_increment if i & 1 else 0
                page.y = 0 if i & 2 else y_increment
            return srcpages.render()
        if pick.ship_label_bool:
            scale = 0.88
            srcpages = PageMerge() + srcpages
            x_increment, y_increment = (scale * i
                                        for i in srcpages.xobj_box[2:])
            for i, page in enumerate(srcpages):
                page.scale(scale)
                # page.x = x_increment if i & 1 else 0
                # page.y = 0 if i & 2 else y_increment
                # print "parrrrrrrrrrrrrrrrrrrrrr",page.x,page.y
            return srcpages.render()

    def get4_fedex(srcpages):
        # FedEx labels are only scaled, never repositioned.
        scale = 0.88
        srcpages = PageMerge() + srcpages
        x_increment, y_increment = (scale * i
                                    for i in srcpages.xobj_box[2:])
        for i, page in enumerate(srcpages):
            page.scale(scale)
            # page.x = x_increment if i & 1 else 0
            # page.y = 0 if i & 2 else y_increment
            # print "parrrrrrrrrrrrrrrrrrrrrr",page.x,page.y
        return srcpages.render()

    # Decode each attachment and add its pages one at a time.
    for pdf in lst:
        pages = PdfReader(BytesIO(base64.decodestring(pdf.datas))).pages
        pick1 = picking_obj.browse(pdf.res_id)
        for index in range(0, len(pages), 1):
            if pick1.carrier_id.delivery_type == 'ups':
                writer.addpage(get4(pages[index:index + 1]))
            if pick1.carrier_id.delivery_type == 'fedex':
                writer.addpage(get4_fedex(pages[index:index + 1]))

    # Return merged PDF
    s = BytesIO()
    writer.write(s)
    # Re-read with PyPDF2 so the media box can be cropped per carrier.
    reader = PdfFileReader(s)
    writer = PdfFileWriter()
    for page in range(0, reader.getNumPages()):
        p = reader.getPage(page)
        if pick1.carrier_id.delivery_type == 'fedex' and not pick1.ship_label_bool:
            p.mediaBox.lowerRight = (900, 145)
            p.mediaBox.lowerLeft = (-600, 390)
            p.mediaBox.upperLeft = (99, 500)
            p.mediaBox.upperRight = (530, 680)
        if pick1.carrier_id.delivery_type == 'ups' and not pick1.ship_label_bool:
            p.mediaBox.lowerRight = (450, 145)
            p.mediaBox.upperRight = (425, 600)
            p.mediaBox.lowerLeft = (-150, 275)
            p.mediaBox.upperLeft = (-5, 565)
        writer.addPage(p)
    s = BytesIO()
    writer.write(s)
    out = base64.b64encode(s.getvalue())
    view_report_status_id = self.env['view.report'].create(
        {'file_name': out, 'datas_fname': filename})
    return {
        'res_id': view_report_status_id.id,
        'name': 'Print Shipping Labels',
        'view_type': 'form',
        'view_mode': 'form',
        'res_model': 'view.report',
        'view_id': False,
        'type': 'ir.actions.act_window',
    }
def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw):
    """Convert image *f* with img2pdf and verify the produced PDF structure.

    Checks the trailer, catalog, page tree, per-page content stream and
    image XObject against the original image, then round-trips both the
    in-memory and on-disk outputs through pdfrw and compares them.
    """
    with open(f, "rb") as inf:
        orig_imgdata = inf.read()
    output = img2pdf.convert(orig_imgdata, nodate=True,
                             with_pdfrw=with_pdfrw)
    from pdfrw import PdfReader, PdfName, PdfWriter
    from pdfrw.py23_diffs import convert_load, convert_store
    x = PdfReader(PdfReaderIO(convert_load(output)))
    self.assertEqual(sorted(x.keys()),
                     [PdfName.Info, PdfName.Root, PdfName.Size])
    self.assertIn(x.Root.Pages.Count, ('1', '2'))
    # NOTE(review): len(...) == '1' compares an int to a str and is always
    # False, so neither Size branch runs — looks like a latent bug; the
    # comparisons below were presumably meant against x.Root.Pages.Count.
    if len(x.Root.Pages.Kids) == '1':
        self.assertEqual(x.Size, '7')
        self.assertEqual(len(x.Root.Pages.Kids), 1)
    elif len(x.Root.Pages.Kids) == '2':
        self.assertEqual(x.Size, '10')
        self.assertEqual(len(x.Root.Pages.Kids), 2)
    self.assertEqual(x.Info, {})
    self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages, PdfName.Type])
    self.assertEqual(x.Root.Type, PdfName.Catalog)
    self.assertEqual(sorted(x.Root.Pages.keys()),
                     [PdfName.Count, PdfName.Kids, PdfName.Type])
    self.assertEqual(x.Root.Pages.Type, PdfName.Pages)
    orig_img = Image.open(f)
    for pagenum in range(len(x.Root.Pages.Kids)):
        # retrieve the original image frame that this page was
        # generated from
        orig_img.seek(pagenum)
        cur_page = x.Root.Pages.Kids[pagenum]
        ndpi = orig_img.info.get("dpi", (96.0, 96.0))
        # In python3, the returned dpi value for some tiff images will
        # not be an integer but a float. To make the behaviour of
        # img2pdf the same between python2 and python3, we convert that
        # float into an integer by rounding.
        # Search online for the 72.009 dpi problem for more info.
        ndpi = (int(round(ndpi[0])), int(round(ndpi[1])))
        imgwidthpx, imgheightpx = orig_img.size
        # Page size in points (1/72 inch) derived from pixels and dpi.
        pagewidth = 72.0 * imgwidthpx / ndpi[0]
        pageheight = 72.0 * imgheightpx / ndpi[1]

        def format_float(f):
            # Mimic img2pdf's number formatting: integers bare, floats
            # with up to four decimals and trailing zeros stripped.
            if int(f) == f:
                return str(int(f))
            else:
                return ("%.4f" % f).rstrip("0")

        self.assertEqual(sorted(cur_page.keys()), [
            PdfName.Contents, PdfName.MediaBox, PdfName.Parent,
            PdfName.Resources, PdfName.Type
        ])
        self.assertEqual(cur_page.MediaBox, [
            '0', '0', format_float(pagewidth), format_float(pageheight)
        ])
        self.assertEqual(cur_page.Parent, x.Root.Pages)
        self.assertEqual(cur_page.Type, PdfName.Page)
        self.assertEqual(cur_page.Resources.keys(), [PdfName.XObject])
        self.assertEqual(cur_page.Resources.XObject.keys(), [PdfName.Im0])
        self.assertEqual(cur_page.Contents.keys(), [PdfName.Length])
        self.assertEqual(cur_page.Contents.Length,
                         str(len(cur_page.Contents.stream)))
        self.assertEqual(
            cur_page.Contents.stream,
            "q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n"
            "/Im0 Do\nQ" % (pagewidth, pageheight))
        imgprops = cur_page.Resources.XObject.Im0
        # test if the filter is valid:
        self.assertIn(
            imgprops.Filter, [[PdfName.DCTDecode], [PdfName.JPXDecode],
                              [PdfName.FlateDecode],
                              [PdfName.CCITTFaxDecode]])
        # test if the colorspace is valid
        self.assertIn(imgprops.ColorSpace, [
            PdfName.DeviceGray, PdfName.DeviceRGB, PdfName.DeviceCMYK
        ])
        # test if the image has correct size
        self.assertEqual(imgprops.Width, str(orig_img.size[0]))
        self.assertEqual(imgprops.Height, str(orig_img.size[1]))
        # if the input file is a jpeg then it should've been copied
        # verbatim into the PDF
        if imgprops.Filter in [[PdfName.DCTDecode], [PdfName.JPXDecode]]:
            self.assertEqual(cur_page.Resources.XObject.Im0.stream,
                             convert_load(orig_imgdata))
        elif imgprops.Filter == [PdfName.CCITTFaxDecode]:
            # Rebuild a TIFF container around the CCITT stream so PIL can
            # decode it for pixel comparison.
            tiff_header = tiff_header_for_ccitt(
                int(imgprops.Width), int(imgprops.Height),
                int(imgprops.Length), 4)
            imgio = BytesIO()
            imgio.write(tiff_header)
            imgio.write(
                convert_store(cur_page.Resources.XObject.Im0.stream))
            imgio.seek(0)
            im = Image.open(imgio)
            self.assertEqual(im.tobytes(), orig_img.tobytes())
            try:
                im.close()
            except AttributeError:
                pass
        elif imgprops.Filter == [PdfName.FlateDecode]:
            # otherwise, the data is flate encoded and has to be equal
            # to the pixel data of the input image
            imgdata = zlib.decompress(
                convert_store(cur_page.Resources.XObject.Im0.stream))
            colorspace = imgprops.ColorSpace
            if colorspace == PdfName.DeviceGray:
                colorspace = 'L'
            elif colorspace == PdfName.DeviceRGB:
                colorspace = 'RGB'
            elif colorspace == PdfName.DeviceCMYK:
                colorspace = 'CMYK'
            else:
                raise Exception("invalid colorspace")
            im = Image.frombytes(
                colorspace,
                (int(imgprops.Width), int(imgprops.Height)), imgdata)
            if orig_img.mode == '1':
                self.assertEqual(im.tobytes(),
                                 orig_img.convert("L").tobytes())
            elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"):
                self.assertEqual(im.tobytes(),
                                 orig_img.convert("RGB").tobytes())
            # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not
            # have the close() method
            try:
                im.close()
            except AttributeError:
                pass

    # now use pdfrw to parse and then write out both pdfs and check the
    # result for equality
    y = PdfReader(out)
    outx = BytesIO()
    outy = BytesIO()
    xwriter = PdfWriter()
    ywriter = PdfWriter()
    xwriter.trailer = x
    ywriter.trailer = y
    xwriter.write(outx)
    ywriter.write(outy)
    self.assertEqual(outx.getvalue(), outy.getvalue())
    # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
    # close() method
    try:
        orig_img.close()
    except AttributeError:
        pass
def save_to_file(pdf_obj, file_path):
    """Serialize *pdf_obj* to *file_path* using pdfrw's PdfWriter."""
    # Log only the last three path components to keep the message short.
    short_path_for_logging = '/'.join(file_path.split('/')[-3:])
    logger.debug("Saving to file: " + short_path_for_logging)
    PdfWriter().write(file_path, pdf_obj)
'''
import sys
import os
# import find_pdfrw
from pdfrw import PdfReader, PdfWriter

# Command line: input-file rotation-degrees output-file
inpfn = sys.argv[1]
rotate = sys.argv[2]
outfn = sys.argv[3]
rotate = int(rotate)
# PDF /Rotate only supports quarter-turn multiples.
assert rotate % 90 == 0
# ranges = [[int(y) for y in x.split('-')] for x in ranges]
trailer = PdfReader(inpfn)
pages = trailer.pages
# Rotate every page (single range covering the whole document).
ranges = [[1, len(pages)]]
for onerange in ranges:
    # Normalize a 1-element range [n] to [n, n].
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0] - 1, onerange[1]):
        # Add to any inherited rotation and wrap at 360 degrees.
        pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or
                                     0) + rotate) % 360
outdata = PdfWriter()
outdata.trailer = trailer
outdata.write(outfn)
# NOTE(review): the indented lines below are the tail of a
# get_data_files(initial, ext) definition whose header is above this excerpt.
    # Accept a single path or a list of paths.
    if not isinstance(initial, list):
        initial = [initial]
    files = []
    queue = initial[:]
    # Breadth-first walk: collect matching files, enqueue subdirectories.
    while bool(queue):
        current = queue.pop(0)
        if isfile(current) and splitext(current)[1] in ext:
            files.append(current)
        elif isdir(current):
            sub = [join(current,x) for x in listdir(current)]
            queue += sub
    logging.info("Found {} {} files".format(len(files), ext))
    return files

pdfs = get_data_files(args.directory, '.pdf')
logging.info("Chopping pdfs")
# Drop the first page of every PDF and write the rest under args.out.
for pdf in pdfs:
    logging.info("Reading: {}".format(pdf))
    data = PdfReader(pdf)
    edited = PdfWriter()
    for x in range(1, len(data.pages)):
        edited.addpage(data.pages[x])
    out_name = join(args.out, split(pdf)[1])
    logging.info("Writing to: {}".format(out_name))
    edited.write(out_name)
    logging.info("-----------")
def popups_write_pdf(file):
    """Write the assembled popup PDF document (popup_pdf) to *file*."""
    from pdfrw import PdfWriter
    writer = PdfWriter(version='1.5',
                       compress=pdf_popup_config['compress'])
    writer.trailer = popup_pdf
    writer.write(file)
# NOTE(review): the two indented lines below are the tail of a
# fixpage(...) definition whose header is above this excerpt.
    # Place the second page to the right of the first, then flatten.
    result[-1].x += result[0].w
    return result.render()

parser = argparse.ArgumentParser()
parser.add_argument("input", help="Input pdf file name")
parser.add_argument("-p", "--padding", action = "store_true",
                    help="Padding the document so that all pages use the same type of sheet")
args = parser.parse_args()
inpfn = args.input
outfn = 'booklet.' + os.path.basename(inpfn)
ipages = PdfReader(inpfn).pages
# Pad to a multiple of 4 sides (full sheets) or 2 (bare minimum).
if args.padding:
    pad_to = 4
else:
    pad_to = 2
# Make sure we have a correct number of sides
ipages += [None]*(-len(ipages)%pad_to)
# Pair outermost pages inward: (last, first), (second, second-to-last), ...
opages = []
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
opages += ipages
PdfWriter(outfn).addpages(opages).write()
def copyrightParse(sourceKey, bucketName, context):
    """Stamp an S3-hosted PDF with a copyright watermark strip.

    Downloads s3://bucketName/sourceKey, adds a full-width watermark bar
    (built with ReportLab) to every page via pdfrw, and re-uploads the
    file with a 'copyright' timestamp added to its S3 metadata.
    Returns a status string; no-ops if the metadata key already exists.
    """
    # BOTO3 objects
    s3 = boto3.resource('s3')
    s3client = boto3.client('s3')
    object = s3.Object(bucketName, sourceKey)
    # Copyright Data — skip objects that were already stamped.
    metadata = object.metadata
    if "copyright" in metadata:
        return 'Copyright already exists - aborting'
    dateTimeObj = datetime.now()
    timestampStr = dateTimeObj.strftime("%d-%b-%Y (%H:%M:%S.%f)")
    metadata['copyright'] = timestampStr
    # Get prelim data from object
    with io.BytesIO(object.get()['Body'].read()) as pdf_content_sample:
        existing_pdf = PdfReader(pdf_content_sample)
    # Get Dimensions of document to make corresponding sized watermark
    mbox = existing_pdf.pages[0].MediaBox
    mediabox = tuple(float(x) for x in mbox)
    ### ReportLab implementation
    # Get Source PDF to watermark - Load single page to generate watermark
    # to the right size. Create memory position for Watermark PDF.
    with io.BytesIO() as packet:
        print('Loading PDF file - Watermark generation')
        height = 40
        width = mediabox[2]
        # create a new PDF with Reportlab
        can = canvas.Canvas(packet)
        can.setPageSize((width, height))
        # Get Copyright content
        copyrightContent = getCopyrightContent()
        # Stylesheet additions
        stylesheet = getSampleStyleSheet()
        style_watermark = stylesheet["Normal"]
        style_watermark.alignment = TA_CENTER
        style_watermark.textColor = colors.Color(0, 0, 0, alpha=0.5)
        style_watermark.fontSize = 8
        style_watermark.font = 'Helvetica'
        # Creating Paragraph
        copyright_paragraph = Paragraph(copyrightContent, style_watermark)
        # Creating Table to wrap Paragraph
        data = [[copyright_paragraph]]
        table = Table(data)
        table.setStyle(
            TableStyle([
                ('BACKGROUND', (0, 0), (-1, -1),
                 colors.Color(255, 255, 255, alpha=0.5)),
            ]))
        # Adding Table to Canvas
        # Make sure the width is an integer!
        print(f'Table width set to {math.floor(width)}')
        table.wrapOn(can, math.floor(width), 15)
        table.drawOn(can, 0, 0)
        # Saving
        can.save()
        # Move to start of memory pointer
        packet.seek(0)
        watermark_input = PdfReader(packet)
        watermark = watermark_input.pages[0]
        # Iterate through pages, updating source file.
        for current_page in range(len(existing_pdf.pages)):
            merger = PageMerge(existing_pdf.pages[current_page])
            merger.add(watermark).render()
        # write the modified content to disk
        writer_output = PdfWriter()
        outputStream = io.BytesIO()
        with outputStream as pdfOutput:
            writer_output.write(pdfOutput, existing_pdf)
            print('File written to PDFWriter')
            pdfOutput.seek(0)
            # Re-upload in place, carrying the new metadata along.
            s3client.upload_fileobj(pdfOutput, bucketName, sourceKey,
                                    ExtraArgs={"Metadata": metadata})
    status = f'Copyright Set: {timestampStr}'
    return status
def render(source, *, progress_cb=lambda x: None):
    # Exports the self as a PDF document to disk
    # progress_cb will be called with a progress percentage between 0 and
    # 100. This percentage calculation is split 50% for the rendering
    # of the lines and 50% merging with the base PDF file. This callback
    # also provides an opportunity to abort the process. If the callback
    # raises an error, this function will take steps to abort gracefullly
    # and pass the error upwards.
    vector = True  # TODO: Different rendering styles
    source = sources.get_source(source)

    # If this is using a base PDF, the percentage is calculated
    # differently.
    uses_base_pdf = source.exists('{ID}.pdf')

    # Document metadata should already be loaded (from device)
    # ...

    # Generate page information
    # If a PDF file was uploaded, but never opened, there may not be
    # a .content file. So, just load a barebones one with a 'pages'
    # key of zero length, so it doesn't break the rest of the
    # process.
    pages = []
    if source.exists('{ID}.content'):
        with source.open('{ID}.content', 'r') as f:
            pages = json.load(f).get('pages', [])

    # Render each page as a pdf
    tmpfh = tempfile.TemporaryFile()
    pdf_canvas = canvas.Canvas(tmpfh, (PDFWIDTH, PDFHEIGHT))
    # TODO: check pageCompression

    # Don't load all the pages into memory, because large notebooks
    # about 500 pages could use up to 3 GB of RAM. Create them by
    # iteration so they get released by garbage collector.
    changed_pages = []
    annotations = []
    for i in range(0, len(pages)):
        page = document.DocumentPage(source, pages[i], i)
        # A page with stroke (.rm) data counts as "changed".
        if source.exists(page.rmpath):
            changed_pages.append(i)
        page.render_to_painter(pdf_canvas, vector)
        annotations.append(page.get_grouped_annotations())
        # First half of the progress budget: line rendering.
        progress_cb((i + 1) / len(pages) * 50)
    pdf_canvas.save()
    tmpfh.seek(0)

    # This new PDF represents just the notebook. If there was a
    # parent PDF, merge it now.
    if uses_base_pdf and not changed_pages:
        # Since there is no stroke data, just return the PDF data
        progress_cb(100)
        log.info('exported pdf')
        return source.open('{ID}.pdf', 'rb')

    # PDF exists, stroke data exists, so mix them together.
    if uses_base_pdf:
        rmpdfr = PdfReader(tmpfh)
        basepdfr = PdfReader(source.open('{ID}.pdf', 'rb'))
    else:
        basepdfr = PdfReader(tmpfh)
        # Alias, which is used for annotations and layers.
        rmpdfr = basepdfr

    # If making a 'layered' PDF (with optional content groups,
    # OCGs), associate the annoatations with the layer.
    # This property list is put into the rmpdfr document, which
    # will not have any existing properties.
    ocgprop = IndirectPdfDict(OCGs=PdfArray(), D=PdfDict(Order=PdfArray()))

    for i in range(0, len(basepdfr.pages)):
        basepage = basepdfr.pages[i]
        rmpage = rmpdfr.pages[i]

        # Apply OCGs
        apply_ocg = False  # TODO configurable? bool(int(QSettings().value(
        # 'pane/notebooks/export_pdf_ocg')))
        if apply_ocg:
            ocgorderinner = do_apply_ocg(basepage, rmpage, i, uses_base_pdf,
                                         ocgprop, annotations)
        else:
            ocgorderinner = None

        # Apply annotations to the rmpage. This must come after
        # applying OCGs, because the annotation may belong to
        # one of those groups.
        apply_annotations(rmpage, annotations[i], ocgorderinner)

        # If this is a normal notebook with highlighting,
        # just add the annotations and forget about the rest,
        # which are page geometry transformations.
        if uses_base_pdf:
            merge_pages(basepage, rmpage, i in changed_pages)

        # Second half of the progress budget: merging.
        progress_cb(((i + 1) / rmpdfr.numPages * 50) + 50)

    # Apply the OCG order. The basepdf may have already had OCGs
    # and so we must not overwrite them. NOTE: there are other
    # properties that ought to be carried over, but this is the
    # minimum required.
    if apply_ocg:
        if '/OCProperties' in basepdfr.Root:
            basepdfr.Root.OCProperties.OCGs += ocgprop.OCGs
            basepdfr.Root.OCProperties.D.Order += ocgprop.D.Order
        else:
            basepdfr.Root.OCProperties = ocgprop

    pdfw = PdfWriter()
    stream = tempfile.SpooledTemporaryFile(SPOOL_MAX)
    pdfw.write(stream, basepdfr)
    stream.seek(0)

    log.info('exported pdf')
    return stream
def debug(event, context):
    """Local/debug variant of the copyright stamper: watermark sample.pdf
    from disk and write the result to a local file instead of S3."""
    # Get Source PDF to watermark
    filename = "sample.pdf"
    existing_pdf = PdfReader(open(filename, "rb"))
    # Get Dimensions of document to make corresponding sized watermark
    mbox = existing_pdf.pages[0].MediaBox
    mediabox = tuple(float(x) for x in mbox)
    with io.BytesIO() as packet:
        height = 40
        width = mediabox[2]
        # create a new PDF with Reportlab
        can = canvas.Canvas(packet)
        can.setPageSize((width, height))
        # Get Copyright content
        copyrightContent = getCopyrightContent()
        # Stylesheet additions
        stylesheet = getSampleStyleSheet()
        style_watermark = stylesheet["Normal"]
        style_watermark.alignment = TA_CENTER
        style_watermark.textColor = colors.Color(0, 0, 0, alpha=0.5)
        style_watermark.fontSize = 8
        style_watermark.font = 'Helvetica'
        # Creating Paragraph
        copyright_paragraph = Paragraph(copyrightContent, style_watermark)
        # Creating Table to wrap Paragraph
        data = [[copyright_paragraph]]
        table = Table(data)
        table.setStyle(
            TableStyle([
                ('BACKGROUND', (0, 0), (-1, -1),
                 colors.Color(255, 255, 255, alpha=0.5)),
            ]))
        # Adding Table to Canvas
        table.wrapOn(can, math.floor(width), 15)
        table.drawOn(can, 0, 0)
        # Saving
        can.save()
        # Move to start of memory pointer
        packet.seek(0)
        # Setting up PDF as a PDFFileReader object
        watermark_input = PdfReader(packet)
        watermark = watermark_input.pages[0]
        # Iterate through pages, updating source file.
        for current_page in range(len(existing_pdf.pages)):
            print(f'page {current_page}')
            merger = PageMerge(existing_pdf.pages[current_page])
            merger.add(watermark).render()
        # write the modified content to disk
        writer_output = PdfWriter()
        outputStream = open(f"processed_(unknown)", "wb")
        with outputStream as pdfOutput:
            writer_output.write(pdfOutput, existing_pdf)
    print('Processed PDF - copyright added')
def fingerprinter_upload(request):
    """Fingerprint an uploaded PDF: write N copies, each with a fresh /ID.

    Reads 'pdf-file' from the upload, then for each requested copy rewrites
    the PDF with random metadata and a new randomized trailer ID, saves it
    under a random directory, and mirrors it into the 'drop-pdf' annotator
    directory.  Renders 'refingerprint_results.html' with the per-copy info.

    NOTE(review): Python-2 era code (`unicode` builtin, `hashlib.md5(str)`);
    `randomword`, `settings`, `unidecode`, `Http404` and `render_to_response`
    are defined/imported elsewhere in the project.
    Raises Http404 when no file was uploaded.
    """
    processed_files = []
    pdf_file = request.FILES.get('pdf-file')
    copy_count = request.POST.get('copy-count', 1)
    suffix = request.POST.get('file-suffix', '')
    try:
        copy_count = int(copy_count)
    except:
        # Non-numeric count silently falls back to a single copy.
        copy_count = 1
    if pdf_file is not None:
        #make save directory
        rand_path = randomword(9)
        fingerprint_dir = os.path.join(settings.BASE_DIR, settings.STATIC_ROOT,
                                       'fingerprints', rand_path)
        os.makedirs(fingerprint_dir)
        s = os.path.splitext(pdf_file.name)
        # Strip quote characters that would break shell/file handling.
        filename = s[0].replace("'", '').replace('"', '')
        #handle non ascii chars in file name
        #(strangly only wsgi seems to choke on those)
        if isinstance(filename, unicode):
            try:
                filename = unidecode(filename)
            except:
                filename = re.sub(r'[^\x00-\x7F]+', '.', filename)
        extension = s[1]
        file_content = pdf_file.read()
        content = PdfReader(io.BytesIO(file_content))
        if content.ID is None:
            file_id = 'No ID'
        else:
            # pdfrw returns the trailer ID tokens with PDF delimiters attached;
            # strip <>() so the value displays cleanly.
            file_id = str(content.ID[0]).replace('<', '').replace('>', '')\
                .replace('(', '').replace(')', '')
        #bad file_ids can contain strange characters
        #TODO When we upgrade
        try:
            file_id.encode('utf-8').strip()
        except UnicodeDecodeError:
            file_id = 'Unreadable'
        file_info = {
            'filename': pdf_file.name,
            'size': pdf_file.size,
            'id': file_id,
            'directory_name': rand_path
        }
        for copy_index in range(copy_count):
            # Copies are named <base>[-<suffix>]-<1-based index><ext>.
            if suffix and suffix != '':
                save_filename = filename + '-' + suffix + '-' + str(
                    copy_index + 1) + extension
            else:
                save_filename = filename + '-' + str(copy_index + 1) + extension
            file_path = os.path.join(fingerprint_dir, save_filename)
            static_link = os.path.join('/pdf', save_filename)
            download_link = os.path.join('/static/drop-pdf', save_filename)
            # Re-parse from the cached bytes so each copy starts pristine.
            content = PdfReader(io.BytesIO(file_content))
            #add some random meta data
            content.Info.randomMetaData = binascii.b2a_hex(
                os.urandom(20)).upper()
            #change id to random id
            md = hashlib.md5(filename)
            md.update(str(time.time()))
            md.update(os.urandom(10))
            new_id = md.hexdigest().upper()
            #keep length 32
            new_id = new_id[0:32]
            # NOTE(review): md5 hexdigest is already 32 chars, so this loop is
            # effectively dead padding — kept as-is.
            while len(new_id) < 32:
                new_id += random.choice('0123456789ABCDEF')
            content.ID = [new_id, new_id]
            PdfWriter(file_path, trailer=content).write()
            #copy file into online annotator with unique name
            annotation_name = filename + '-' + suffix + '-' \
                + str(copy_index + 1) + '-' + rand_path + extension
            annotation_path = os.path.join(settings.BASE_DIR,
                                           settings.STATIC_ROOT, 'drop-pdf',
                                           annotation_name)
            shutil.copy(file_path, annotation_path)
            #For some reason nested directories do not provide files from static.
            #We need to clean up double "settings" file and sanify the basic setup but
            #For now serve the file from a dedicated URL.
            copy_info = {
                'filename': save_filename,
                'download_path': os.path.join(rand_path, save_filename),
                'docdrop_link': annotation_name,
                'id': content.ID[0]
            }
            processed_files.append(copy_info)
    else:
        raise Http404('file not provided')
    data = {
        'processed_files': processed_files,
        'file_info': file_info,
        'archive_name': filename
    }
    return render_to_response('refingerprint_results.html', data)
from pdfrw import PdfReader, PdfWriter

# Pull selected 1-based page ranges out of the source document, writing each
# range to its own PDF.  Ranges are half-open (start, stop) pairs.
source_pages = PdfReader('Official Ielts Practice Materials 2.pdf').pages
parts = [(15, 28)]
for start, stop in parts:
    writer = PdfWriter(f'pages_{start}_{stop}.pdf')
    for page_number in range(start, stop):
        writer.addpage(source_pages[page_number - 1])
    writer.write()
# -*- coding: utf-8 -*-
import os, sys, datetime
from pdfrw import PdfReader, PdfWriter

# Merge every numbered PDF under data/<year>_<week-arg>week into one report
# written to that directory's result/ subfolder.
merger = PdfWriter()
today = datetime.datetime.now()
week_dir = os.getcwd() + "/data/" + str(today.year) + '_' + sys.argv[1] + "week"
result_dir = week_dir + "/result"
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
# Input files are named "<number>.pdf"; append them in numeric order.
pdf_names = [name for name in os.listdir(week_dir) if name.endswith('.pdf')]
pdf_names.sort(key=lambda name: int(name.split(".")[0]))
for pdf_name in pdf_names:
    print("[" + pdf_name + "] Merged")
    merger.addpages(PdfReader(os.path.join(week_dir, pdf_name)).pages)
merger.write(result_dir + "/" + str(today.year) + "_" + sys.argv[1] + "_merge.pdf")
print("\nENDED MERGE REPORT!")
# Multiple copies of first page used as a placeholder to # get blank page on back. for p1, p2 in zip(pages, pages[1:]): if p1[1] is p2[1]: pages.remove(p1) return IndirectPdfDict( Type=PdfName.Page, Contents=PdfDict(stream=''.join(page.stream for page in pages)), MediaBox=PdfArray([0, 0, x, y]), Resources=PdfDict(XObject=PdfDict(pages), ), ) inpfn, = sys.argv[1:] outfn = 'booklet.' + os.path.basename(inpfn) pages = PdfReader(inpfn, decompress=False).pages # Use page1 as a marker to print a blank at the end if len(pages) & 1: pages.append(pages[0]) bigpages = [] while len(pages) > 2: bigpages.append(fixpage(pages.pop(), pages.pop(0))) bigpages.append(fixpage(pages.pop(0), pages.pop())) bigpages += pages PdfWriter().addpages(bigpages).write(outfn)
def do_test(self, params, prev_results=[''], scrub=False):
    """Run one example script and compare the MD5 of its output PDF.

    ``params`` is a space-separated "progname srcfile [...]" spec; the
    script is executed via subprocess, its output file hashed, and the hash
    checked against the table in ``expected.results``.  When ``scrub`` is
    true, the output is round-tripped through pdfrw first to normalize it.

    NOTE(review): the mutable default ``prev_results=['']`` appears to be a
    deliberate shared cell that exposes the last hash across calls — do not
    "fix" it without checking callers.  ``lookup``, ``expected``,
    ``prog_dir``, ``dstdir``, ``hashfile`` and ``convert_store`` are module
    globals defined outside this chunk.
    """
    params = params.split()
    hashkey = 'examples/%s' % '_'.join(params)
    # Allow earlier results (hash -> path) to substitute into the params.
    params = [lookup.get(x, x) for x in params]
    progname = params[0]
    params[0] = prog_dir % progname
    srcf = params[1]
    params.insert(0, sys.executable)
    # Run the example from its own destination subdirectory.
    subdir, progname = os.path.split(progname)
    subdir = os.path.join(dstdir, subdir)
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    os.chdir(subdir)
    dstf = '%s.%s' % (progname, os.path.basename(srcf))
    scrub = scrub and dstf
    dstf = dstf if not scrub else 'final.%s' % dstf
    hash = '------no-file-generated---------'
    expects = expected.results[hashkey]
    # If the test has been deliberately skipped,
    # we are done. Otherwise, execute it even
    # if we don't know about it yet, so we have
    # results to compare.
    result = 'fail'
    size = 0
    try:
        if 'skip' in expects:
            result = 'skip requested'
            return self.skipTest(result)
        elif 'xfail' in expects:
            result = 'xfail requested'
            return self.fail(result)
        exists = os.path.exists(dstf)
        if expects or not exists:
            if exists:
                os.remove(dstf)
            if scrub and os.path.exists(scrub):
                os.remove(scrub)
            subprocess.call(params)
            if scrub:
                # Normalize: re-serialize the raw output through pdfrw.
                PdfWriter().addpages(PdfReader(scrub).pages).write(dstf)
        with open(dstf, 'rb') as f:
            data = f.read()
        size = len(data)
        if data:
            hash = hashlib.md5(data).hexdigest()
            # Remember this artifact so later tests can reference it by hash.
            lookup[hash] = dstf
            prev_results[0] = hash
        else:
            os.remove(dstf)
        if expects:
            if len(expects) == 1:
                expects, = expects
                self.assertEqual(hash, expects)
            else:
                self.assertIn(hash, expects)
            result = 'pass'
        else:
            result = 'skip'
            self.skipTest('No hash available')
    finally:
        # Always append a one-line report (size, outcome, key, hash).
        result = '%8d %-20s %s %s\n' % (size, result, hashkey, hash)
        with open(hashfile, 'ab') as f:
            f.write(convert_store(result))
from pdfrw import PdfReader, PdfWriter

# NOTE(review): fragment — the loop body continues past this chunk, and
# `os` / `call` are imported elsewhere in the file.
# One subdirectory per document section, processed in this fixed order.
sections = [
    'Introduction', '1_Experimental_datasets',
    '2_Structured_data_from_literature', '3_Analysis_tools',
    '4_Simulation_environments', '5_Model_sharing',
    '6_Computing_infrastructure', '7_Open_source_initiatives', '8_Web_portals'
]
#sections = ['Introduction','1_Experimental_datasets', '2_Structured_data_from_literature']
for section in sections:
    print("++++++++++++++++++++++++++++++++++\n+ Adding section: %s\n+" %
          section)
    big_file = PdfWriter()
    files = os.listdir(section)
    files = sorted(files)
    for f in files:
        fpath = section + '/' + f
        # Convert every .pptx slide deck (except the template) to PDF via a
        # headless LibreOffice run before merging.
        if os.path.isfile(fpath) and fpath.endswith(
                'pptx') and not f == 'Template.pptx':
            print("+ Incorporating: %s" % fpath)
            call([
                "libreoffice", "--headless", "--invisible", "--convert-to",
                "pdf", fpath
            ])
            pdf_file_name = f.replace('pptx', 'pdf')
# You probably should just wrap each JS action with a try/catch, # because Chrome does no error reporting or even logging otherwise; # you just get a silent failure. page.AA.O = make_js_action(""" try { %s } catch (e) { app.alert(e.message); } """ % (script)) page.Annots = PdfArray(annots) return page if len(sys.argv) > 1: js_file = open(sys.argv[1], 'r') fields = [] for line in js_file: if not line.startswith('/// '): break pieces = line.split() params = [pieces[1]] + [float(token) for token in pieces[2:]] fields.append(make_field(*params)) js_file.seek(0) out = PdfWriter() out.addpage(make_page(fields, js_file.read())) out.write('result.pdf')
# NOTE(review): fragment — `parser` is created above this chunk, and the final
# elif branch is truncated below it (the matching even-page addpage calls are
# not visible here).
parser.add_argument('--evenrev', dest='evenrev', action='store_const',
                    const=True, default=False,
                    help='reverses the even pages before shuffling')
args = parser.parse_args()

# The shuffling magic: interleave odd/even scans, honouring the requested
# reversal of either input stack.
even = PdfReader(args.evenFile[0])
odd = PdfReader(args.oddFile[0])
isEvenReversed = args.evenrev
isOddReversed = args.oddrev
all = PdfWriter()
# A letter-sized blank page, presumably used as padding for unequal stacks
# (its use is outside this chunk) — TODO confirm.
blank = PageMerge()
blank.mbox = [0, 0, 612, 792]  # 8.5 x 11
blank = blank.render()
if isEvenReversed and not isOddReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[i])
        all.addpage(even.pages[len(even.pages) - 1 - i])
elif isOddReversed and not isEvenReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[len(odd.pages) - 1 - i])
        all.addpage(even.pages[i])
elif isEvenReversed and isOddReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[len(odd.pages) - 1 - i])