def mailmerge(ifile, ofile, fn_updatetmpl, content_list):
    """Stamp per-recipient content onto copies of a one-page PDF template.

    The first page of ``ifile`` is the template.  Its content stream and
    ``/Resources`` dictionary are registered once in the output writer and
    referenced from every generated page.

    Args:
        ifile: Path of the template PDF; only page 0 is read.
        ofile: Path of the PDF to write.
        fn_updatetmpl: Optional callable invoked with the template page before
            any page is generated; skipped when falsy.
        content_list: Iterable of page contents.  Each element is either a
            ``StreamObject`` (used as is) or raw stream data that is wrapped
            in a new stream object.
    """
    with open(ifile, "rb") as template_file:
        reader = PdfFileReader(template_file)
        writer = PdfFileWriter()

        # Fetch the template page and let the caller tweak it first.
        template_page = reader.getPage(0)
        if fn_updatetmpl:
            fn_updatetmpl(template_page)

        # Register the shared pieces once; every page points at these refs.
        shared_contents = writer._addObject(template_page.getContents())
        shared_resources = writer._addObject(template_page['/Resources'])

        # Emit one page per content element.
        for cdata in content_list:
            if not isinstance(cdata, StreamObject):
                stream = StreamObject.initializeFromDictionary({
                    '/Length': len(cdata),
                    '__streamdata__': cdata,
                })
            else:
                stream = cdata

            merged_page = DictionaryObject(template_page.items())
            merged_page[NameObject('/Contents')] = ArrayObject(
                [shared_contents, stream])
            merged_page[NameObject('/Resources')] = shared_resources
            writer.addPage(merged_page)

        # Write the result while the template file is still open.
        with open(ofile, "wb") as merged_file:
            writer.write(merged_file)
def add_outlines(toc, filename, output):
    """Copy a PDF and attach an outline (bookmark) tree built from ``toc``.

    Args:
        toc: Sequence of dicts with the keys ``"title"``, ``"page"`` and
            ``"node"``.  ``build_outlines_btree`` is called on it first;
            presumably that is what populates ``"node"`` (TODO confirm).
            Each node is queried through its ``real_parent``/``prev``/
            ``next``/``first``/``last`` methods, and the ``index`` of a
            returned node selects the indirect reference to link to.
        filename: Path of the source PDF.
        output: Path of the PDF to write.

    When ``toc`` is empty the pages are copied and no outline is added.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()

    # The input stays open until the output has been written, preserving the
    # original ordering; ``with`` guarantees both handles are closed even if
    # a step in between raises.
    with open(filename, 'rb') as input_file:
        pdf_in = PdfFileReader(input_file)
        for p in pdf_in.pages:
            pdf_out.addPage(p)

        toc_num = len(toc)
        if toc_num:
            # Pre-compute the object ids the outline root (slot 0) and the
            # items (slots 1..n) will receive once they are appended below,
            # so that items can reference each other before they exist.
            idoix = len(pdf_out._objects) + 1
            idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out)
                       for x in range(toc_num + 1)]

            ol = PDF.DictionaryObject()
            ol.update({
                PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
                PDF.NameObject("/First"): idorefs[1],
                PDF.NameObject("/Last"): idorefs[-1],
                PDF.NameObject("/Count"): PDF.NumberObject(toc_num),
            })

            # Node accessor name -> outline dictionary key.
            opt_keys = {"real_parent": "/Parent", "prev": "/Prev",
                        "next": "/Next", "first": "/First", "last": "/Last"}

            olitems = []
            for t in toc:
                title = t["title"]
                # Only byte strings need decoding; text passes through as is.
                if isinstance(title, bytes):
                    title = title.decode("utf-8")
                oli = PDF.DictionaryObject()
                oli.update({
                    PDF.NameObject("/Title"): PDF.TextStringObject(title),
                    PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"]),
                })
                for k, v in opt_keys.items():
                    n = getattr(t["node"], k)()
                    if n is not None:
                        oli.update({PDF.NameObject(v): idorefs[n.index]})
                olitems.append(oli)

            # Append in exactly the order the ids above were predicted for.
            pdf_out._addObject(ol)
            for i in olitems:
                pdf_out._addObject(i)
            pdf_out._root_object.update({
                PDF.NameObject("/Outlines"): idorefs[0],
            })

        with open(output, "wb") as output_file:
            pdf_out.write(output_file)
def add_outlines(toc, filename, output):
    """Write a copy of ``filename`` to ``output`` with outlines from ``toc``.

    Args:
        toc: Sequence of dicts carrying ``"title"``, ``"page"`` and
            ``"node"``.  ``build_outlines_btree(toc)`` runs first; each
            ``"node"`` must offer ``real_parent()``, ``prev()``, ``next()``,
            ``first()`` and ``last()``, returning either ``None`` or a node
            whose ``index`` picks the reference to link to.
        filename: Source PDF path.
        output: Destination PDF path.

    An empty ``toc`` produces a plain copy of the pages.  Previously it
    raised ``IndexError`` because the outline root referenced a first item
    that did not exist.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()

    # Context managers close both files even when building the outline fails;
    # the source stays open until the writer has finished, as before.
    with open(filename, 'rb') as source_file:
        pdf_in = PdfFileReader(source_file)
        for p in pdf_in.pages:
            pdf_out.addPage(p)

        toc_num = len(toc)
        # Guard: with no entries there is nothing for /First and /Last to
        # point at, so skip outline construction entirely.
        if toc_num > 0:
            # Ids are predicted up front: slot 0 is the outline root and
            # slots 1..n are the items, matching the append order below.
            idoix = len(pdf_out._objects) + 1
            idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out)
                       for x in range(toc_num + 1)]

            ol = PDF.DictionaryObject()
            ol.update({
                PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
                PDF.NameObject("/First"): idorefs[1],
                PDF.NameObject("/Last"): idorefs[-1],
                PDF.NameObject("/Count"): PDF.NumberObject(toc_num),
            })

            # Maps the node accessor to the outline key it fills in.
            opt_keys = {"real_parent": "/Parent", "prev": "/Prev",
                        "next": "/Next", "first": "/First", "last": "/Last"}

            olitems = []
            for t in toc:
                title = t["title"]
                # Decode byte strings only; already-decoded text is kept.
                if isinstance(title, bytes):
                    title = title.decode("utf-8")
                oli = PDF.DictionaryObject()
                oli.update({
                    PDF.NameObject("/Title"): PDF.TextStringObject(title),
                    PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"]),
                })
                for k, v in opt_keys.items():
                    n = getattr(t["node"], k)()
                    if n is not None:
                        oli.update({PDF.NameObject(v): idorefs[n.index]})
                olitems.append(oli)

            # Root first, then items, so the predicted ids line up.
            pdf_out._addObject(ol)
            for i in olitems:
                pdf_out._addObject(i)
            pdf_out._root_object.update({
                PDF.NameObject("/Outlines"): idorefs[0],
            })

        with open(output, "wb") as target_file:
            pdf_out.write(target_file)
class Document():
    """One-page PDF built from an image, onto which overlays are merged.

    The image is resized to a width of 1240 (height scaled to keep the
    aspect ratio), saved under the ``/temp/`` variant of its path and drawn
    onto the only page of ``self.pdf``.  Lines, rectangles, images and
    highlight annotations are then added to that page.
    """

    def __init__(self, source):
        """Load ``source`` and create the base page.

        Args:
            source: Image path; ``'redactor/'`` is prepended when the path
                does not already contain it.

        Raises:
            Exception: If the image cannot be opened.
        """
        if 'redactor/' not in source:
            source = 'redactor/' + source
        try:
            im = Image.open(source)
        except Exception:
            raise Exception('Failed to open image source ' + source)
        # The resized copy is written to the '/temp/' sibling of '/in/'.
        source = source.replace('/in/', '/temp/')
        self.page_size = 1240, int(1240.0 * im.size[1] / im.size[0])
        # NOTE(review): Image.ANTIALIAS is deprecated in newer Pillow
        # releases -- confirm the pinned version or move to Image.LANCZOS.
        im = im.resize(self.page_size, Image.ANTIALIAS)
        im.save(source)
        document, canvas = self._new_canvas()
        canvas.setFillColorRGB(1, 1, 1)
        canvas.drawImage(source, 0, 0, mask=(1, 1, 1, 1, 1, 1))
        canvas.save()
        self.pdf = PdfFileWriter()
        self.pdf.addPage(
            PdfFileReader(BytesIO(document.getvalue())).getPage(0))

    def _new_canvas(self):
        """Return ``(buffer, canvas)`` for a fresh page-sized overlay."""
        buffer = BytesIO()
        return buffer, Canvas(buffer, pagesize=self.page_size)

    def _merge_overlay(self, buffer):
        """Merge the one-page PDF held in ``buffer`` onto page 0."""
        overlay = PdfFileReader(BytesIO(buffer.getvalue())).getPage(0)
        self.pdf.getPage(0).mergePage(overlay)

    def add_line(self, x0, y0, x1, y1, color):
        """Draw a 25-unit-wide line from (x0, y0) to (x1, y1).

        ``color`` is unpacked into ``setStrokeColorRGB``.
        """
        document, canvas = self._new_canvas()
        canvas.setLineWidth(25)
        canvas.setStrokeColorRGB(*color)
        canvas.line(x0, y0, x1, y1)
        canvas.save()
        self._merge_overlay(document)

    def add_rect(self, x0, y0, x1, y1, color=(0, 0, 0, 0)):
        """Draw a filled, unstroked rectangle with corners (x0, y0), (x1, y1).

        ``color`` is unpacked into ``setFillColorRGB``.  The default is an
        immutable tuple so it cannot be shared and mutated across calls.
        """
        document, canvas = self._new_canvas()
        canvas.setFillColorRGB(*color)
        canvas.rect(x0, y0, x1 - x0, y1 - y0, 0, 1)
        canvas.save()
        self._merge_overlay(document)

    def add_note(self, src, x0, y0, comment='', author=''):
        """Place the image ``src`` at (x0, y0) with a 71x39 highlight on it."""
        self._add_image(src, x0, y0)
        self._add_highlight(x0, y0, 71, 39, comment, author)

    def export(self, fn, objects):
        """Apply the drawing commands in ``objects`` and write the PDF to ``fn``.

        Args:
            fn: Output file path.
            objects: Mapping whose ``'objects'`` entry is a list of dicts with
                a ``'mode'`` (``'LINE'``, ``'RECT'`` or ``'COMM'``) and an
                ``'attributes'`` dict of keyword arguments for the matching
                method.  Entries with any other mode are skipped.
        """
        funcs = {
            'LINE': self.add_line,
            'RECT': self.add_rect,
            'COMM': self.add_note,
        }
        for obj in objects['objects']:
            handler = funcs.get(obj['mode'])
            if handler:
                handler(**obj['attributes'])
        # Close the output handle deterministically instead of leaking it.
        with open(fn, 'wb') as out:
            self.pdf.write(out)

    def _add_image(self, source, x, y):
        """Draw the image at ``source`` onto page 0 at (x, y)."""
        if 'redactor/' not in source:
            source = 'redactor/' + source
        document, canvas = self._new_canvas()
        canvas.setFillColorRGB(1, 1, 1)
        canvas.drawImage(source, x, y, mask='auto')
        canvas.save()
        self._merge_overlay(document)

    def _create_highlight(self, x0, y0, width, height, comment, author='',
                          color=(0, 0, 0, 0)):
        """Build a ``/Highlight`` annotation dictionary for a rectangle.

        Args:
            x0, y0: Lower-left corner of the highlighted area.
            width, height: Extent of the area.
            comment: Text stored under ``/Contents``.
            author: Text stored under ``/T``.
            color: Components stored under ``/C``.

        Returns:
            The annotation ``DictionaryObject`` (not yet added to the page).
        """
        # Opposite corner.  The vertical extent must come from ``height``;
        # it was previously derived from ``width``, which ignored ``height``.
        x1 = x0 + width
        y1 = y0 + height
        # add_rect expects corner coordinates, not a width/height pair.
        self.add_rect(x0, y0, x1, y1)
        highlight = DictionaryObject()
        highlight.update({
            NameObject("/F"): NumberObject(4),
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Highlight"),
            NameObject("/T"): TextStringObject(author),
            NameObject("/Contents"): TextStringObject(comment),
            NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
            NameObject("/Rect"): ArrayObject([
                FloatObject(x0), FloatObject(y0),
                FloatObject(x1), FloatObject(y1)
            ]),
            # Corner order: top-left, top-right, bottom-left, bottom-right.
            NameObject("/QuadPoints"): ArrayObject([
                FloatObject(x0), FloatObject(y1),
                FloatObject(x1), FloatObject(y1),
                FloatObject(x0), FloatObject(y0),
                FloatObject(x1), FloatObject(y0)
            ]),
        })
        return highlight

    def _add_highlight(self, x0, y0, width, height, comment, author='',
                       color=(0, 0, 0, 0)):
        """Create a highlight annotation and attach it to page 0."""
        highlight = self._create_highlight(x0, y0, width, height, comment,
                                           author, color)
        highlight_ref = self.pdf._addObject(highlight)
        page = self.pdf.getPage(0)
        if "/Annots" in page:
            page[NameObject("/Annots")].append(highlight_ref)
        else:
            page[NameObject("/Annots")] = ArrayObject([highlight_ref])
def anotate_pdf(file_path, sht, query_dict):
    """Highlight every match of the given queries and save a copy of the PDF.

    The result is written next to the input as
    ``<stem>_highlighted<suffix>``.

    Args:
        file_path: Path of the PDF to annotate.
        sht: Object exposing ``range('B2').value``; it receives the
            completion message (presumably a spreadsheet sheet -- TODO
            confirm against the caller).
        query_dict: Mapping of query -> highlight colour.  Each query is
            passed to ``get_page_coordinates`` and its colour to
            ``create_highlight``.
    """
    # Build the output name with pathlib: a hard-coded '\\' separator only
    # works on Windows, and slicing by len(suffix) empties the name when the
    # file has no suffix.
    src = pathlib.Path(file_path)
    result_file = str(src.with_name(src.stem + '_highlighted' + src.suffix))

    # Layout analysis: collect one layout object per page, in page order.
    with open(file_path, 'rb') as layout_fp:
        # Parser bound to the file, then the document structure on top of it.
        doc = PDFDocument(PDFParser(layout_fp))
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        layout = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout.append(device.get_result())

    # Tooltip info attached to every highlight.
    m_meta = {"author": "AK", "contents": "HL text1"}

    with open(file_path, 'rb') as input_fp:
        pdfInput = PdfFileReader(input_fp, strict=True)
        pdfOutput = PdfFileWriter()
        for pgn in range(pdfInput.numPages):
            # Fetch the page before the query loop so it is still copied
            # when query_dict is empty.
            page_hl = pdfInput.getPage(pgn)
            for query in query_dict:
                # Only this page's matches are needed; the coordinates of
                # every page were previously recomputed on each iteration.
                for item in get_page_coordinates(layout[pgn], query):
                    highlight = create_highlight(item[0], item[1], item[2],
                                                 item[3], m_meta,
                                                 color=query_dict[query])
                    highlight_ref = pdfOutput._addObject(highlight)
                    if "/Annots" in page_hl:
                        page_hl[NameObject("/Annots")].append(highlight_ref)
                    else:
                        page_hl[NameObject("/Annots")] = ArrayObject(
                            [highlight_ref])
            pdfOutput.addPage(page_hl)

        # Save the highlighted copy while the input is still open.
        with open(result_file, "wb") as output_fp:
            pdfOutput.write(output_fp)

    sht.range('B2').value = f'File {src.name} completed'