def write_fillable_pdf(file, output_pdf_path, data_dict):
    """Fill the first page of a PDF form template and save the result.

    Args:
        file: Template path, joined onto ``BASE_DIR``.
        output_pdf_path: Unused; kept so existing callers keep working. The
            output location is always a generated name under ``BASE_DIR/tmp``.
        data_dict: Mapping of form-field name (the annotation's ``/T`` entry)
            to the value to write into that field.

    Returns:
        The path of the PDF that was written.
    """
    out_dir = os.path.join(BASE_DIR, "tmp/" + str(random.randrange(20, 200, 3)) + ".pdf")
    template_path = os.path.join(BASE_DIR, file)

    # The reader resolves objects lazily, so the template must stay open until
    # the writer has finished serialising the document.
    with open(template_path, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so values show/print.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        # Set /Ff = 1 (ReadOnly) on every annotation we just filled.
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        # Write straight to the file. The previous code wrote into a BytesIO
        # and then called .read() at end-of-stream, producing an empty file.
        with open(out_dir, 'wb') as d:
            pdf_writer.write(d)

    return out_dir
def create_annot_box(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Square`` annotation dictionary for the given rectangle.

    Args:
        x1, y1, x2, y2: Rectangle coordinates stored in ``/Rect`` in this order.
        meta: Mapping with ``"author"`` (stored as ``/T``) and ``"contents"``
            (stored as ``/Contents``).
        color: Colour components stored in ``/C``.

    Returns:
        A new ``DictionaryObject`` describing the annotation.
    """
    author = TextStringObject(meta["author"])
    contents = TextStringObject(meta["contents"])
    colour_array = ArrayObject([FloatObject(c) for c in color])
    rect_array = ArrayObject([FloatObject(v) for v in (x1, y1, x2, y2)])

    box = DictionaryObject()
    box.update({
        # NameObject("/P"): parent,
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Square"),
        NameObject("/T"): author,
        NameObject("/Contents"): contents,
        NameObject("/C"): colour_array,
        NameObject("/Rect"): rect_array,
    })
    return box
def generatePdf(infile, outfile):
    """Fill a PDF form with ``FORM_VALUES`` and write it to ``outfile``.

    Every page of ``infile`` is copied and filled. Fields on the first page
    whose name appears in ``FLATTEN_VALUES`` are additionally marked ReadOnly.

    Args:
        infile: Path of the source PDF form.
        outfile: Path the filled PDF is written to.
    """
    # The reader resolves objects lazily, so keep the source open until the
    # writer has serialised everything.
    with open(infile, "rb") as source:
        pdf = PdfFileReader(source, strict=False)
        if "/AcroForm" in pdf.trailer["/Root"]:
            pdf.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf2 = PdfFileWriter()
        updateFormProperlyWriter(pdf2)
        if "/AcroForm" in pdf2._root_object:
            pdf2._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # Add pages. The previous bound was getNumPages() - 1, which skipped
        # the last page and made single-page input fail at getPage(0) below.
        for x in range(pdf.getNumPages()):
            pdf2.addPage(pdf.getPage(x))
            pdf2.updatePageFormFieldValues(pdf2.getPage(x), FORM_VALUES)

        # Flatten form fields on the first page.
        flat_page = pdf2.getPage(0)
        for annot_ref in flat_page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in FLATTEN_VALUES):
                writer_annot.update({
                    NameObject("/Ff"): NumberObject(1)  # make ReadOnly
                })

        # Close the output deterministically so the data is flushed to disk.
        with open(outfile, "wb") as outputStream:
            pdf2.write(outputStream)
def createHighlight(x0, y0, x1, y1, color=[0, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary for a rectangle.

    Args:
        x0, y0, x1, y1: Rectangle coordinates stored in ``/Rect`` in this order.
        color: Colour components stored in ``/C``.

    Returns:
        A new ``DictionaryObject`` describing the highlight.
    """
    def as_floats(values):
        # Wrap plain numbers as a PDF array of FloatObjects.
        return ArrayObject([FloatObject(v) for v in values])

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/C"): as_floats(color),
        NameObject("/Rect"): as_floats((x0, y0, x1, y1)),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        NameObject("/QuadPoints"): as_floats((x0, y1, x1, y1, x0, y0, x1, y0)),
    })
    return highlight
def fill(self, input_dict: Dict[str, Any]) -> None:
    """Copy each input page into ``self.pdf`` with its form fields filled.

    Pages of ``self.input_pdf`` are paired with the entries of
    ``self.mapping``; each entry maps a field name (the annotation's ``/T``)
    to the key in ``input_dict`` holding its value. Every annotation on a
    page is marked ReadOnly, whether or not it is mapped.

    Args:
        input_dict: Values to write, keyed by the names used in the mapping.

    Raises:
        KeyError: If a matched field's mapped key is absent from ``input_dict``.
    """
    for page, field_map in zip(self.input_pdf.pages, self.mapping):
        if "/Annots" in page:
            for annot_ref in page["/Annots"]:
                annot = annot_ref.getObject()
                annot.update({NameObject("/Ff"): NumberObject(1)})  # make ReadOnly
                for field_name, input_key in field_map.items():
                    if annot.get("/T") == field_name:
                        raw = input_dict[input_key]
                        if not isinstance(raw, bool):
                            text = str(raw)
                            # NOTE(review): /AP is set to a text string here;
                            # confirm viewers accept this.
                            annot.update({
                                NameObject("/V"): TextStringObject(text),
                                NameObject("/AP"): TextStringObject(text),
                            })
                        elif raw:
                            # Checked box: value and appearance state are "/1".
                            annot.update({
                                NameObject("/V"): NameObject("/1"),
                                NameObject("/AS"): NameObject("/1"),
                            })
                        else:
                            # Unchecked box: drop any value, appearance "/Off".
                            if "/V" in annot:
                                del annot["/V"]
                            annot.update({NameObject("/AS"): NameObject("/Off")})
        self.pdf.addPage(page)
def __init__(self, im):
    """Initialise this object as a Flate-compressed image XObject.

    Args:
        im: Image exposing ``mode``, ``size`` and ``tobytes()``.

    Raises:
        NotImplementedError: If ``im.mode`` is not in ``MODE_TO_COLORSPACE``.
    """
    super().__init__()
    try:
        bits_per_component, colour_space = MODE_TO_COLORSPACE[im.mode]
    except KeyError:
        raise NotImplementedError('image mode %r not supported' % im.mode)
    width, height = im.size

    # always compress raw image data
    self._data = FlateDecode.encode(im.tobytes())

    entries = (
        ("/Filter", NameObject('/FlateDecode')),
        ('/Type', NameObject('/XObject')),
        ('/Subtype', NameObject('/Image')),
        ('/Width', NumberObject(width)),
        ('/Height', NumberObject(height)),
        ('/BitsPerComponent', NumberObject(bits_per_component)),
        ('/ColorSpace', NameObject(colour_space)),
    )
    for key, value in entries:
        self[NameObject(key)] = value
def createHighlight(self, x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary with author and comment.

    Args:
        x1, y1, x2, y2: Rectangle coordinates stored in ``/Rect`` in this order.
        meta: Mapping with ``"author"`` (stored as ``/T``) and ``"contents"``
            (stored as ``/Contents``).
        color: Colour components stored in ``/C``.

    Returns:
        A new ``DictionaryObject`` describing the highlight.
    """
    author = TextStringObject(meta["author"])
    note = TextStringObject(meta["contents"])
    colour = ArrayObject([FloatObject(c) for c in color])
    rect = ArrayObject([FloatObject(v) for v in (x1, y1, x2, y2)])
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad = ArrayObject(
        [FloatObject(v) for v in (x1, y2, x2, y2, x1, y1, x2, y1)])

    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): author,
        NameObject("/Contents"): note,
        NameObject("/C"): colour,
        NameObject("/Rect"): rect,
        NameObject("/QuadPoints"): quad,
    })
    return annotation
def test_CCITTFaxDecode():
    """Pin the bytes ``CCITTFaxDecode.decode`` returns for empty input."""
    params = DictionaryObject({
        "/K": NumberObject(-1),
        "/Columns": NumberObject(17),
    })

    # This was just the result PyPDF2 1.27.9 returned.
    # It would be awesome if we could check if that is actually correct.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    )

    actual = CCITTFaxDecode.decode(b"", params)
    assert actual == expected
def pdf(request, template):
    """Fill the first page of a PDF form template and return it as a download.

    Args:
        request: The incoming HTTP request (not inspected here).
        template: Path of the PDF form template to fill.

    Returns:
        A ``FileResponse`` serving the filled PDF as the attachment
        ``test.pdf``.
    """
    data_dict = {
        'numero_de_contrat#0': '12345\n',
        'numero_de_contrat#1': '12345\n',
        'Raison_Sociale': 'Dunder Mifflen\n',
        'Adresse': '1 paper drive, Paris\n',
        'SIREN': '1 paper drive, Paris\n',
        'Tel': '06.95.97.02.30\n',
    }

    output_stream = BytesIO()
    # The reader resolves objects lazily, so the template must stay open until
    # the writer has serialised the document.
    with open(template, "rb") as input_stream:
        # strict=False tolerates correctable problems in the template.
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)

        # NeedAppearances asks viewers to regenerate field appearances so the
        # new values are displayed.
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # Copy the first template page into the output and fill its fields.
        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        # Set /Ff = 1 (ReadOnly) on every field we just filled.
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        pdf_writer.write(output_stream)

    # Rewind before handing over the buffer: FileResponse reads from the
    # current position, which is end-of-stream after write(), so the download
    # was previously empty.
    output_stream.seek(0)
    return FileResponse(output_stream, as_attachment=True, filename='test.pdf')
def createHighlight(bbox=(0, 0, 1, 1), contents="", color=[1, 1, 0], author="iwasakishuto(@cabernet_rock)"):
    """Build a ``/Highlight`` annotation dictionary.

    Args:
        bbox (tuple)   : Four coordinates ``(x1, y1, x2, y2)`` locating the highlight.
        contents (str) : Comment text attached to the highlight.
        color (list)   : Highlight colour. Defaults to ``[1,1,0]`` (yellow).
        author (str)   : Author of the annotation. Defaults to ``"iwasakishuto(@cabernet_rock)"``.

    Returns:
        DictionaryObject: The highlight annotation.

    Raises:
        ValueError: If ``bbox`` does not unpack into exactly four values.

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (DictionaryObject, NumberObject, FloatObject,
                                NameObject, TextStringObject, ArrayObject)

    def float_array(values):
        # Wrap plain numbers as a PDF array of FloatObjects.
        return ArrayObject([FloatObject(v) for v in values])

    x1, y1, x2, y2 = bbox
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_points = [x1, y2, x2, y2, x1, y1, x2, y1]

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),
        NameObject("/C"): float_array(color),
        NameObject("/Rect"): float_array(bbox),
        NameObject("/QuadPoints"): float_array(quad_points),
    })
    return highlight
def pdf(request):
    """Fill the first page of ``family1.pdf`` and return it inline.

    Args:
        request: The incoming HTTP request (not inspected here).

    Returns:
        An ``HttpResponse`` with content type ``application/pdf`` containing
        the filled form.
    """
    template = os.path.join(BASE_DIR, "family1.pdf")
    data_dict = {
        'Fid': 'John\n',
        'Fname1': 'Smith\n',
        'Faadhar': '[email protected]\n',
        'Fcontact': '889-998-9967\n',
        'Faddress1': 'Amazing Inc.\n',
        'Faddress2': 'Dev\n',
        'Faddress3': '123 Main Way\n',
        'Fration': 'Johannesburg\n',
        'Farogya': 'New Mexico\n',
        'Faadhar1': 96705,
        'from_date': 'USA\n',
        'to_date': 'Who cares...\n'
    }

    output_stream = BytesIO()
    # A context manager closes the template even when filling fails; it must
    # stay open until write() because the reader resolves objects lazily.
    with open(template, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so values show/print.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        # Set /Ff = 1 (ReadOnly) on every field we just filled.
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        pdf_writer.write(output_stream)

    response = HttpResponse(output_stream.getvalue(), content_type='application/pdf')
    response['Content-Disposition'] = 'inline; filename="completed.pdf"'
    return response
def create_highlight(self, x1, y1, x2, y2, meta, color=[0, 1, 0]):
    """
    Build a highlight annotation for a PDF.

    Parameters
    ----------
    x1, y1 : float
        bottom left corner
    x2, y2 : float
        top right corner
    meta : dict
        must provide the keys "author" and "contents"
    color : iterable
        Three elements, (r,g,b)

    Returns
    -------
    DictionaryObject
        the highlight annotation dictionary
    """
    def to_pdf_floats(numbers):
        # Wrap plain numbers as a PDF array of FloatObjects.
        return ArrayObject([FloatObject(n) for n in numbers])

    fields = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): to_pdf_floats(color),
        NameObject("/Rect"): to_pdf_floats([x1, y1, x2, y2]),
        # top-left, top-right, bottom-left, bottom-right
        NameObject("/QuadPoints"): to_pdf_floats(
            [x1, y2, x2, y2, x1, y1, x2, y1]),
    }
    result = DictionaryObject()
    result.update(fields)
    return result
def generatereport(field_dictionary, cl):
    """Fill the report template and write it to ``<cl>.pdf``.

    Only the first page of ``<cwd>/templates/report.pdf`` is copied. Fields
    named in ``field_dictionary`` are filled and marked ReadOnly.

    Args:
        field_dictionary: Mapping of form-field name to the value to write.
        cl: Output file name without the ``.pdf`` extension.
    """
    def set_need_appearances_writer(writer):
        # Ensure the writer's catalog has an /AcroForm with
        # /NeedAppearances = true. Best-effort: failures are printed and the
        # writer is returned unchanged.
        try:
            catalog = writer._root_object
            if "/AcroForm" not in catalog:
                writer._root_object.update({
                    NameObject("/AcroForm"): IndirectObject(len(writer._objects), 0, writer)
                })
            need_appearances = NameObject("/NeedAppearances")
            writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
            return writer
        except Exception as e:
            print('set_need_appearances_writer() catch : ', repr(e))
            return writer

    outfile = cl + ".pdf"
    infile = cwd + '/templates/report.pdf'

    # Context managers close both files even when filling fails. The input
    # stays open until write() because the reader resolves objects lazily.
    with open(infile, "rb") as inputStream:
        pdf_reader = PyPDF2.PdfFileReader(inputStream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        pdf_writer.updatePageFormFieldValues(pdf_writer.getPage(0), field_dictionary)

        page = pdf_writer.getPage(0)
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in field_dictionary):
                writer_annot.update({
                    NameObject("/Ff"): NumberObject(1)  # make ReadOnly
                })

        with open(outfile, "wb") as outputStream:
            pdf_writer.write(outputStream)
def _create_highlight(self, x0, y0, width, height, comment, author='', color=[0, 0, 0, 0]):
    """Register a rectangle and build the matching ``/Highlight`` annotation.

    Args:
        x0, y0: Origin corner of the highlighted area.
        width, height: Extent of the area along x and y.
        comment: Text stored as ``/Contents``.
        author: Text stored as ``/T``.
        color: Colour components stored in ``/C``.

    Returns:
        A new ``DictionaryObject`` describing the highlight.
    """
    self.add_rect(x0, y0, width, height)

    x1 = x0 + width
    # The vertical extent is y0 + height. The previous code added ``width``
    # here too, so any non-square highlight had the wrong height.
    y1 = y0 + height

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(comment),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y1)
        ]),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x0), FloatObject(y1),
            FloatObject(x1), FloatObject(y1),
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y0)
        ]),
    })
    return highlight
def update_form_values(infile, outfile, newvals=None):
    """Fill every page of a PDF form and write the result to ``outfile``.

    Args:
        infile: Path of the source PDF form.
        outfile: Path the filled PDF is written to.
        newvals: Mapping of field name to new value. When falsy, placeholder
            values of the form ``"#<index> <name>=<current value>"`` are
            generated from ``get_form_fields(infile)``.

    Filling is best-effort per page: if a page cannot be filled, the error is
    printed and the page is still added to the output.
    """
    # Close the input deterministically (it was previously left open). It
    # must stay open until write() because the reader resolves objects lazily.
    with open(infile, 'rb') as source:
        pdf = PdfFileReader(source)
        writer = PdfFileWriter()
        set_need_appearances_writer(writer)
        if "/AcroForm" in writer._root_object:
            writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            if not newvals:
                # Built at most once: after this, newvals is normally truthy.
                newvals = {
                    k: f'#{i} {k}={v}'
                    for i, (k, v) in enumerate(get_form_fields(infile).items())
                }
            try:
                writer.updatePageFormFieldValues(page, newvals)
                # Set /Ff = 1 (ReadOnly) on every field that was filled.
                for annot_ref in page['/Annots']:
                    writer_annot = annot_ref.getObject()
                    field_name = writer_annot.get('/T')
                    if any(field_name == field for field in newvals):
                        writer_annot.update(
                            {NameObject("/Ff"): NumberObject(1)})
                writer.addPage(page)
            except Exception as e:
                # Deliberate best-effort: keep the page even if filling failed
                # (e.g. it has no /Annots).
                print(repr(e))
                writer.addPage(page)

        with open(outfile, 'wb') as out:
            writer.write(out)
def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    '''
    Build the highlight annotation for a box on a PDF page (note that
    coordinates start in the bottom left), carrying the author and contents
    found in ``meta`` and the given colour.
    '''
    rect_coords = [x1, y1, x2, y2]
    # top-left, top-right, bottom-left, bottom-right
    quad_coords = [x1, y2, x2, y2, x1, y1, x2, y1]

    spec = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([FloatObject(v) for v in rect_coords]),
        NameObject("/QuadPoints"): ArrayObject([FloatObject(v) for v in quad_coords]),
    }

    marker = DictionaryObject()
    marker.update(spec)
    return marker
def pdf_flatten(filename, number):
    """Write ``number`` into the form field named "number" and lock it.

    All pages of ``filename`` are copied. Every annotation whose ``/T`` is a
    known form field listed in the flatten set gets its value replaced and is
    marked ReadOnly.

    Args:
        filename: Path of the source PDF form.
        number: Value written into the field's ``/V`` entry.

    Returns:
        Path of the written file, ``filename + "-flatten.pdf"``.
    """
    # A one-element tuple. The previous ``("number")`` was a plain string, so
    # ``field in flatten_dict`` did substring matching ("num", "er", ...).
    flatten_dict = ("number",)
    flatten_form = filename + "-flatten.pdf"

    # Context managers flush and close both files before the path is returned
    # (the output was previously never closed). The input stays open until
    # write() because the reader resolves objects lazily.
    with open(filename, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so values show/print.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # getFields() returns None for a document without form fields.
        data_dict = pdf_reader.getFields() or {}

        for thisPage in range(pdf_reader.numPages):
            pdf_writer.addPage(pdf_reader.getPage(thisPage))
            page = pdf_writer.getPage(thisPage)
            if '/Annots' not in page:
                continue  # nothing to flatten on this page
            for annot_ref in page['/Annots']:
                writer_annot = annot_ref.getObject()
                for field in data_dict:
                    if writer_annot.get('/T') == field and field in flatten_dict:
                        writer_annot.update({
                            NameObject("/Ff"): NumberObject(1),  # make ReadOnly
                            NameObject("/V"): TextStringObject(number),  # update the value
                        })

        with open(flatten_form, "wb") as output_stream:
            pdf_writer.write(output_stream)

    return flatten_form
def create_annotation(x, y, meta):
    """Build a URI link annotation and a companion text (comment) annotation.

    Args:
        x, y: Lower-left corner shared by both annotation rectangles.
        meta: Mapping with ``"contents"`` (used as the link URI and the
            comment text) and ``"author"`` (the comment's ``/T`` title).

    Returns:
        A ``(linkAnnotation, commentAnnotation)`` tuple of
        ``DictionaryObject`` instances.
    """
    color = [255.0 / 255.0, 209 / 255.0, 0]

    # Link annotation.
    # https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    linkAnnotation = DictionaryObject()
    linkAnnotation.update({
        # Table 165: annotation flags; 4 is the Print flag
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Link"),

        # Table 164: color, annotation rectangle (20 x 20 units)
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x), FloatObject(y),
            FloatObject(x + 20), FloatObject(y + 20)
        ]),

        # Table 173: link annotation action
        NameObject('/A'): DictionaryObject({
            # Table 206: URI action
            NameObject('/S'): NameObject('/URI'),
            NameObject('/URI'): TextStringObject(meta["contents"])
        }),

        # Table 173: invert the rectangle while the mouse is pressed
        NameObject('/H'): NameObject('/I'),

        # Table 164: horizontal corner radius, vertical corner radius, border
        # width. These are numbers; they were previously built with
        # NameObject(0), which is not a valid PDF name.
        NameObject('/Border'): ArrayObject([
            NumberObject(0), NumberObject(0), NumberObject(5),
        ]),
    })

    # Text (comment) annotation.
    commentAnnotation = DictionaryObject()
    commentAnnotation.update({
        # Table 165: annotation flags; 4 is the Print flag
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Text"),

        # Table 170: title bar text and contents
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        # Table 164: color, annotation rectangle (5 x 5 units)
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x), FloatObject(y),
            FloatObject(x + 5), FloatObject(y + 5)
        ]),

        # 12.5.6.4 text annotation: initially closed, "Comment" icon
        NameObject('/Open'): BooleanObject(False),
        NameObject('/Name'): NameObject('/Comment'),
    })

    return linkAnnotation, commentAnnotation
def main():
    """Assemble a single PDF from page data stored in local JSON files.

    Reads ``bookinfo.json``, ``pageinfo.json``, ``pages.json`` and
    ``bookmarks.json`` (paths relative to the current working directory),
    decodes each page's base64 PDF payload into a temporary directory, merges
    the first page of every file, optionally adds bookmarks, sets
    ``/PageLabels`` on the document catalog, and writes ``<title>.pdf``.
    """
    print("Loading metadata and eText information...")
    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']
    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        # Return the decoded PDF bytes of the entry whose 'pageID' matches.
        # The "data:application/pdf;base64," prefix is sliced off by length.
        # If no entry matches, ``b`` is None and the slice raises TypeError.
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to hold all the individual pdf files.
        # First, write the cover file.
        pdfPageTable = {}  # bookPageNumber (label) -> pageOrder
        pdf_page_label_table = {}
        # urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            # The cover is taken from the first entry of pageInfo.
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, write all the individual pages for the e-book
        def download(pdfPage):
            # Record the label -> order mapping and save the page under a
            # zero-padded name so a lexicographic sort follows pageOrder.
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(pdfDownloadDir, "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))
            # urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 worker threads
        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            # NOTE(review): the file is deleted while ``page`` is still in
            # use; presumably the reader has already buffered it. Confirm.
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True
        # TODO: Bookmarks currently not supported
        with open("bookmarks.json", 'r') as bookmarkInfoRequest:
            # Any parse or lookup failure is treated as "no bookmarks". A
            # missing bookmarks.json still raises from open() above.
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception as e:
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            # Accept either a single bookmark dict or a list of them.
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)
                # pdfPageTable maps the page label to its page order.
                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)
                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # build (pageOrder, label) pairs sorted by numeric page order.
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # NOTE(review): NameObject(0) and NameObject("(cover)") are not PDF
        # names; presumably they rely on NameObject writing its text verbatim
        # to emit the number 0 and the literal string (cover). Confirm.
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                curr_mode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                curr_mode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range only when the mode or prefix changes.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        # fileMerger._addObject(pageLabels)
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        # "/" is removed and ":" becomes "_" in the output file name.
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(eTextUrl):
    """Download an e-book page by page and merge it into ``out.pdf``.

    Parses identifiers out of the query string of ``eTextUrl``, fetches book
    and page metadata, downloads the cover and each page PDF into a temporary
    directory, merges the first page of every file, adds a "Cover" bookmark,
    sets ``/PageLabels`` on the document catalog, and writes ``out.pdf``.
    """
    bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
    if (bookData.get("values", None)) is not None:
        # "values" holds "key::value::key::value..."; pair consecutive items
        # and wrap each value in a list to mirror parse_qs output.
        bookData = {
            itemName : [itemValue] for itemName, itemValue in
            zip(*[iter(bookData["values"][0].split("::"))]*2)
        }
        # A few fixes in terms of capitalization
        bookData["bookid"] = bookData["bookID"]
        bookData["userid"] = bookData["userID"]
        bookData["sessionid"] = bookData["sessionID"]

    # We'll default to the roletypeid for a student
    bookData["roletypeid"] = [roletypeid]  # 3 for Instructor... the server doesn't care, though

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userid=bookData['userid'][0],
        userroleid=bookData['roletypeid'][0],
        bookid=bookData['bookid'][0],
        bookeditionid=bookInfo['bookEditionID'],
        authkey=bookData['sessionid'][0],
    )
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Build the download URL for one page; "/assets/" is stripped from
        # the page path first. ``pagePath`` is assigned but never read.
        pdfPage = pdfPage.replace("/assets/","")
        getPage = pagePath = pdfUrl.format(
            bookid=bookInfo['globalBookID'],
            pdfpage=pdfPage,
            iscover=isCover,
            authkey=bookData['sessionid'][0]
        )
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # bookPageNumber (label) -> pageOrder
        pdfPageLabelTable = {}
        urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            # Record the label -> order mapping and save the page under a
            # zero-padded name so a lexicographic sort follows pageOrder.
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(pdfDownloadDir, "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=bookData['roletypeid'][0],
            bookid=bookData['bookid'][0],
            language=language,
            authkey=bookData['sessionid'][0],
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=bookData['scenario'][0],
        )
        # The bookmark data is fetched and parsed but not used below; only
        # the "Cover" bookmark is added.
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
            bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]

        fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # build (pageOrder, label) pairs sorted by numeric page order.
        pdfPageLabelTable = [(v,k) for k,v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        # NOTE(review): NameObject(0) and NameObject("(cover)") are not PDF
        # names; presumably they rely on NameObject writing its text verbatim
        # to emit the number 0 and the literal string (cover). Confirm.
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range only when the mode or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        rootObj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("out.pdf", "wb") as outFile:
            fileMerger.write(outFile)
def main(bookId):
    """Download an e-book by id (or URL) and merge it into one PDF.

    ``bookId`` is a string: either the numeric id or an ``http...`` URL from
    which the id is extracted. Book and page metadata are fetched, the cover
    and each page PDF are downloaded into a temporary directory, the first
    page of every file is merged, bookmarks are added when available,
    ``/PageLabels`` is set on the document catalog, and the result is written
    to ``"<bookId> - <title>.pdf"``.
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" holds "key::value::key::value..."; pair consecutive
            # items and wrap each value in a list to mirror parse_qs output.
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Build the download URL for one page; "/assets/" is stripped from
        # the page path first. ``pagePath`` is assigned but never read.
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pagePath = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                           pdfpage=pdfPage,
                                           iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # bookPageNumber (label) -> pageOrder
        pdfPageLabelTable = {}

        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            # Record the label -> order mapping and save the page under a
            # zero-padded name so a lexicographic sort follows pageOrder.
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(
            40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)
        bookmarksExist = True
        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # Any parse or lookup failure is treated as "no bookmarks".
            # Network errors from urlopen() itself still propagate.
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception as e:
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            # Accept either a single bookmark dict or a list of them.
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']
                              ['content'])  # First page (in the pdf's format)
                # pdfPageTable maps the page label to its page order.
                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)
                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark(
                "Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # build (pageOrder, label) pairs sorted by numeric page order.
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable,
                                   key=(lambda x: int(x[0])))
        # NOTE(review): NameObject(0) and NameObject("(cover)") are not PDF
        # names; presumably they rely on NameObject writing its text verbatim
        # to emit the number 0 and the literal string (cover). Confirm.
        labels = ArrayObject([
            NameObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range only when the mode or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        # "/" is removed and ":" becomes "_" in the output file name.
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
class PdfEnhancedFileWriter(PdfFileWriter):
    """PdfFileWriter that can strip Word-style decoration from page content.

    ``removeWordStyle`` rewrites the content stream of every page: rgb/cmyk
    colour operators that precede text are forced to black, those that
    precede a rectangle are forced to white, and highlight-sized ``re``
    rectangles are dropped altogether.
    """

    # Replacement operands for a colour operator, keyed by colour space and
    # then by the colour to paint with.
    colors_operands = {
        'rgb': {
            'black': [NumberObject(0), NumberObject(0), NumberObject(0)],
            'white': [NumberObject(1), NumberObject(1), NumberObject(1)],
        },
        'cmyk': {
            'black': [
                NumberObject(0), NumberObject(0), NumberObject(0),
                NumberObject(1)
            ],
            'white': [
                NumberObject(0), NumberObject(0), NumberObject(0),
                NumberObject(0)
            ],
        },
        'grayscale': {
            'black': [NumberObject(0)],
            'white': [NumberObject(1)],
        }
    }

    # Content-stream operator -> coarse category.  Built once here rather
    # than on every lookup, because lookups happen per operation (and again
    # for every look-ahead step) while a page is being rewritten.
    _operator_types = {
        b_('Tj'): 'text',
        b_("'"): 'text',
        b_('"'): 'text',
        b_("TJ"): 'text',
        b_('rg'): 'rgb',  # color
        b_('RG'): 'rgb',  # color
        b_('k'): 'cmyk',  # color
        b_('K'): 'cmyk',  # color
        b_('g'): 'grayscale',  # color
        b_('G'): 'grayscale',  # color
        b_('re'): 'rectangle',
        b_('l'): 'line',  # line
        b_('m'): 'line',  # start line
        b_('S'): 'line',  # stroke(paint) line
    }

    # Position of the string operand for the single-string text operators.
    _text_operand_index = {b_('Tj'): 0, b_("'"): 0, b_('"'): 2}

    # Categories a colour operator can end up applying to.
    _color_targets = ('text', 'rectangle', 'line')

    def _getOperatorType(self, operator):
        """Return the category of ``operator``, or None when it is unknown."""
        return self._operator_types.get(operator)

    def _getColorTargetOperationType(self, color_index, operations):
        """Return the category of the first text/rectangle/line operation
        that follows ``operations[color_index]``, or False if there is none.
        """
        for i in range(color_index + 1, len(operations)):
            operator_type = self._getOperatorType(operations[i][1])
            if operator_type in self._color_targets:
                return operator_type
        return False

    def getMinimumRectangleWidth(self, fontSize, minimumNumberOfLetters=1.5):
        """Return the width of ``minimumNumberOfLetters`` letters at ``fontSize``."""
        return fontSize * minimumNumberOfLetters

    def _blankNonTextStrings(self, operands, operator):
        """Replace, in place, the string operands of a text-showing operator
        that are not TextStringObject instances with an empty TextStringObject.
        """
        if operator == b_("TJ"):
            pieces = operands[0]
            for i, piece in enumerate(pieces):
                if not isinstance(piece, TextStringObject):
                    pieces[i] = TextStringObject()
            return
        index = self._text_operand_index.get(operator)
        if index is not None and not isinstance(operands[index],
                                                TextStringObject):
            operands[index] = TextStringObject()

    def removeWordStyle(self, ignoreByteStringObject=False):
        """
        Removes imported styles from Word - Path Constructors rectangles -
        from this output.

        Pages without a ``/Contents`` entry are left untouched.

        :param bool ignoreByteStringObject: when true, string operands of the
            text-showing operators that are not TextStringObject instances
            are replaced with an empty TextStringObject.
        """
        for page in self.getObject(self._pages)['/Kids']:
            pageRef = self.getObject(page)
            # A page is not required to carry a content stream; previously
            # such a page raised KeyError and aborted the whole document.
            if '/Contents' not in pageRef:
                continue
            content = pageRef['/Contents'].getObject()
            if not isinstance(content, ContentStream):
                content = ContentStream(content, pageRef)

            kept_operations = []
            last_font_size = 0
            for operator_index, (operands, operator) in enumerate(
                    content.operations):
                if operator == b_('Tf') and operands[0][:2] == '/F':
                    last_font_size = operands[1].as_numeric()

                if ignoreByteStringObject:
                    self._blankNonTextStrings(operands, operator)

                operator_type = self._getOperatorType(operator)
                # we are ignoring all grayscale colors
                # tests showed that black underlines, borders and tables are
                # defined by grayscale and aren't using rgb/cmyk colors
                if operator_type == 'rgb' or operator_type == 'cmyk':
                    target_type = self._getColorTargetOperationType(
                        operator_index, content.operations)
                    # we are coloring all text in black and all rectangles in
                    # white; removing all colors paints rectangles in black
                    # which gives us unwanted results
                    new_color = None
                    if target_type == 'text':
                        new_color = 'black'
                    elif target_type == 'rectangle':
                        new_color = 'white'
                    if new_color:
                        # Copy so the emitted operation never aliases the
                        # shared class-level operand lists.
                        operands = list(
                            self.colors_operands[operator_type][new_color])

                # remove styled rectangles (highlights, lines, etc.)
                # the 're' operator is a Path Construction operator that
                # creates a rectangle; presumably that is how Word embeds its
                # graphics into a PDF when creating one
                if operator == b_('re'):
                    rectangle_width = operands[-2].as_numeric()
                    rectangle_height = operands[-1].as_numeric()
                    # (length of X letters at the current size)
                    minWidth = self.getMinimumRectangleWidth(last_font_size, 1)
                    maxHeight = last_font_size + 6  # range to catch really big highlights
                    minHeight = 1.5  # so that thin lines will not be removed
                    # drop only rectangles wider than the minimum whose
                    # height lies in (minHeight, maxHeight]
                    if (rectangle_width > minWidth
                            and minHeight < rectangle_height <= maxHeight):
                        continue

                kept_operations.append((operands, operator))

            content.operations = kept_operations
            pageRef[NameObject('/Contents')] = content
def cloneDocumentFromReader(self, reader: PdfFileReader, *args) -> None:
    """Clone the document held by ``reader`` into this writer.

    If this writer already contains pages, its existing page tree is kept
    as the root and the cloned page tree is attached to it as an extra kid;
    otherwise the writer's (empty) page tree is discarded and replaced by
    the cloned one.

    :param reader: PDF file reader instance from which the clone should be
        created.
    :param args: accepted for call compatibility; not used by this method.
    """
    info_ref = self._info
    previous_pages_ref = self._pages
    previous_pages = self.getObject(previous_pages_ref)

    # Pages added before the clone must survive it.
    keep_existing_pages = previous_pages[NameObject("/Count")] > 0
    if not keep_existing_pages and previous_pages in self._objects:
        # Throw the empty page tree out; the info reference is re-pointed
        # at the slot the page tree used to occupy.
        info_ref = self._pages
        self._objects.remove(previous_pages)

    # Clone the reader's root document
    self.cloneReaderDocumentRoot(reader)
    if not self._root:
        self._root = self._addObject(self._root_object)

    # Sweep for all indirect references; the stack only exists for the
    # duration of the sweep.
    self.stack = []
    cloned_root_ref = self._sweepIndirectReferences({}, self._root)
    del self.stack

    # Locate the cloned page tree root and give it its own reference
    # (object numbers are 1-based positions in self._objects).
    cloned_root = self.getObject(cloned_root_ref)
    cloned_pages = cloned_root[NameObject("/Pages")]
    self._pages = IndirectObject(1 + self._objects.index(cloned_pages), 0, self)

    if not keep_existing_pages:
        # Bump up the info's reference because the old page tree was
        # bumped off.
        self._info = info_ref
        return

    # Hang the cloned page tree under the pre-existing one ...
    cloned_pages[NameObject("/Parent")] = previous_pages_ref
    previous_pages[NameObject("/Kids")].append(self._pages)

    # ... make the pre-existing tree the document's page root again ...
    self._pages = previous_pages_ref
    cloned_root[NameObject("/Pages")] = previous_pages_ref

    # ... and account for the newly attached pages in its count.
    previous_pages[NameObject("/Count")] = NumberObject(
        previous_pages[NameObject("/Count")]
        + cloned_pages[NameObject("/Count")])
def add_geospatial_pdf_header(m, f, f2, map_bounds, poly, epsg=None, wkt=None):
    """
    Copy the PDF read from ``f`` to ``f2``, adding a geospatial viewport to
    every page, as per:
        Adobe® Supplement to the ISO 32000 PDF specification
        BaseVersion: 1.7
        ExtensionLevel: 3
        (June 2008)

    ``m``, ``poly`` and ``map_bounds`` are handed unchanged to
    ``get_pdf_measure`` together with the coordinate-system dictionary.

    Notes:
        The epsg code or the wkt text of the projection must be provided.
        Must be called *after* the page has had .finish() called.

    Returns ``f2``.  Raises RuntimeError when PyPDF2 is unavailable or when
    neither ``epsg`` nor ``wkt`` is given.
    """
    if not HAS_PYPDF2:
        raise RuntimeError(
            "PyPDF2 not available; PyPDF2 required to add geospatial header to PDF"
        )
    if not any((epsg, wkt)):
        raise RuntimeError(
            "EPSG or WKT required to add geospatial header to PDF")

    file_reader = PdfFileReader(f)
    file_writer = PdfFileWriter()

    # preserve OCProperties at document root if we have one
    oc_key = NameObject('/OCProperties')
    if oc_key in file_reader.trailer['/Root']:  # Python3-friendly
        source_root = file_reader.trailer['/Root'].getObject()
        file_writer._root_object[oc_key] = source_root[oc_key]

    for page in file_reader.pages:
        # Projected coordinate system description for this page.
        gcs = DictionaryObject()
        gcs.update({NameObject('/Type'): NameObject('/PROJCS')})
        if epsg:
            gcs.update({NameObject('/EPSG'): NumberObject(int(epsg))})
        if wkt:
            gcs.update({NameObject('/WKT'): TextStringObject(wkt)})

        measure = get_pdf_measure(m, gcs, poly, map_bounds)

        # The VP entry is an array of viewport dictionaries.  A viewport is
        # basically a rectangular region on the PDF page.  The only required
        # entry is the BBox, which specifies the location of the viewport on
        # the page.
        corners = (0, int(page.mediaBox[3]), int(page.mediaBox[2]), 0)  # in pts
        viewport = DictionaryObject()
        viewport.update({
            NameObject('/Type'): NameObject('/Viewport'),
            NameObject('/BBox'): ArrayObject(
                [FloatObject(str(value)) for value in corners]),
            NameObject('/Measure'): measure,
        })

        page[NameObject('/VP')] = ArrayObject([viewport])
        file_writer.addPage(page)

    file_writer.write(f2)
    return f2
def test_number_object_exception():
    """Building a NumberObject from an out-of-range float expression raises
    OverflowError.
    """
    # NOTE(review): Python raises OverflowError while evaluating the
    # int * float product below (the int is too large to convert to a float),
    # i.e. before NumberObject is called.  The expression therefore has to
    # stay inside the ``raises`` block -- confirm that this is the behaviour
    # the test is meant to pin.
    with pytest.raises(OverflowError):
        NumberObject(1.5 * 2 ** 10000)
def _internal_links(pdf):
    """Yield ``(page_number, annotation, action, fragment)`` for every
    annotation of ``pdf`` whose action URI starts with ``file://`` and
    carries a ``#fragment``.

    Pages without an ``/Annots`` entry are skipped.
    """
    for page_number in range(0, pdf.getNumPages()):
        page = pdf.getPage(page_number)
        if '/Annots' not in page:
            continue
        for annotation_ref in page['/Annots'].getObject():
            annotation = annotation_ref.getObject()
            action = annotation['/A']
            uri = action['/URI']
            if uri[0:7] != 'file://':
                continue
            uri_parts = uri.split('#')
            if len(uri_parts) > 1:
                yield page_number, annotation, action, uri_parts[1]


def concat_and_clean():
    """Bundle the rendered notebooks into one HTML document, print it to PDF
    and repair the table-of-contents links of the resulting PDF.

    Steps, in order:

    1. Every rendered ``<notebook>.html`` found in ``build_directory`` is
       read, deleted, and its cells (minus the final two) appended as a
       ``<section>`` to the first rendered document, which also receives the
       cover page and the shared custom styles.
    2. The bundle is written to ``bundle.html`` and printed with phantomjs.
    3. ``output/BMLIP-5SSD0.pdf`` is rewritten in place: ``file://...#name``
       links become internal GoTo links and page numbers are stamped next to
       the table-of-contents entries.
    4. ``build_directory`` is removed.
    """
    bundle_document = None
    notebook_root = None

    for source_path_str in ipynb_files:
        bundle_path = Path(build_directory, Path(source_path_str).stem + '.html')
        if not bundle_path.exists():
            continue

        with open(str(bundle_path), 'r', encoding='utf-8') as rendered_file:
            html_source = html.fromstring(rendered_file.read())
        os.remove(str(bundle_path))

        cells = html_source.find_class('cell')

        # If no main document body is available yet, grab it from the first
        # page that is found. Use that as the basis for the entire bundle
        if bundle_document is None:
            bundle_document = html_source
            body_node = bundle_document.xpath('//body')[0]

            with open('./cover.html', 'r', encoding='utf-8') as cover_file:
                cover_source = html.fromstring(cover_file.read())
            cover_styles = cover_source.xpath('//style')[0]
            bundle_head = bundle_document.xpath('//head')[0]
            bundle_head.insert(2, cover_styles)
            cover_page = cover_source.get_element_by_id('cover-page')
            body_node.insert(0, cover_page)

            # Include the custom styles once, which would have otherwise
            # been included by the final two cells in every rendered notebook
            with open(str(Path('styles', 'aipstyle.html')), 'r') as style_file:
                custom_style_elements = html.fragments_fromstring(style_file.read())
            for element in reversed(custom_style_elements):
                body_node.insert(0, element)

            # Even though the final two cells will be ignored below they
            # have to be actually removed from the 'base' document to make
            # sure they don't show up at the top of the notebook
            notebook_root = html_source.get_element_by_id('notebook-container')
            notebook_root.remove(cells[-2])
            notebook_root.remove(cells[-1])

        # The final two cells of every rendered notebook are expected to
        # contain the code loading custom CSS, these are not necessary for
        # the bundled version and should therefore be removed
        chapter_node = html.fromstring('<section></section>')
        notebook_root.append(chapter_node)
        for cell in cells[0:-2]:
            chapter_node.append(cell)

    shutil.copy('./styles/bundle.css', str(Path(build_directory, 'custom.css')))

    if bundle_document is not None:
        bundle_filename = str(Path(build_directory, 'bundle.html'))
        with open(bundle_filename, 'w', encoding='utf-8') as bundle_file:
            bundle_file.write(html.tostring(bundle_document).decode('utf-8'))

        os.system('phantomjs --web-security=no ./print_to_pdf.js bundle/bundle.html')

        # The generated PDF file contains incorrect URIs for the table of
        # contents and the backlinks to the items in it. What used to be
        # 'internal' URIs in the HTML document have become 'external' URIs
        # in the PDF document. These are rewritten in two passes: the first
        # records where each link lives, keyed by the name of the link that
        # should point *to* it; the second re-targets every link to the
        # recorded page and offset.
        source_pdf = PdfFileReader('output/BMLIP-5SSD0.pdf')

        # name -> (page number, top edge of the link rectangle)
        links = dict()

        # Pass 1: collect pages and offsets for all internal links.  A
        # 'toc-x' link is the destination of 'x' and vice versa.
        for page_number, annotation, _action, fragment in _internal_links(source_pdf):
            location = (page_number, annotation['/Rect'][3])
            if fragment[0:4] == 'toc-':
                links[fragment[4:]] = location
            else:
                links['toc-' + fragment] = location

        # Pass 2: modify all links to point to the proper internal locations
        for _page_number, annotation, action, fragment in _internal_links(source_pdf):
            # Always remove the URI pointing to the non-existent external file
            del action['/URI']
            # Not all link targets actually exist in the document (such as
            # those on the first page), these have to be ignored
            if fragment in links:
                link_data = links[fragment]
                action.update({
                    NameObject('/D'): ArrayObject([NumberObject(link_data[0]),
                                                   NameObject('/FitH'),
                                                   NumberObject(link_data[1])]),
                    NameObject('/S'): NameObject('/GoTo')
                })
            else:
                # Update the rectangle to effectively disable the link
                annotation.update({
                    NameObject('/Rect'): ArrayObject([NumberObject(0), NumberObject(0),
                                                      NumberObject(0), NumberObject(0)])
                })

        target_pdf = PdfFileWriter()
        target_pdf.appendPagesFromReader(source_pdf)

        # Manually add page numbers to the table of contents by drawing them
        # on an overlay PDF whose page N is merged onto document page N.
        toc_stream = BytesIO()
        toc_canvas = canvas.Canvas(toc_stream, pagesize = A4)
        # The first page is the cover page, and thus empty as far as the
        # table of contents is concerned
        current_page = 0

        toc_links = sorted((name for name in links if name[0:4] == 'toc-'),
                           key = lambda name: links[name][0])
        for toc_link in toc_links:
            toc_page, toc_top = links[toc_link]
            # Advance one overlay page per document page so overlay and
            # document page indices stay aligned even when an entry's page
            # is more than one page past the previous one.
            while current_page < toc_page:
                toc_canvas.showPage()
                current_page += 1

            target_name = toc_link[4:]
            if target_name in links:
                target_data = links[target_name]
                toc_canvas.drawRightString(575, -10 + toc_top, '%d' % (target_data[0] + 1))

        toc_canvas.save()
        toc_stream.seek(0)

        toc_pdf = PdfFileReader(toc_stream)
        for page_number in range(1, toc_pdf.getNumPages()):
            target_page = target_pdf.getPage(page_number)
            target_page.mergePage(toc_pdf.getPage(page_number))

        with open('output/BMLIP-5SSD0.pdf', 'wb') as target_file:
            target_pdf.write(target_file)

    shutil.rmtree(build_directory)
# Acro form is form field, set needs appearances to fix printing issues pdf_writer._root_object["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)}) data_dict = dict() # this is a dict of your DB form values pdf_writer.addPage(pdf_reader.getPage(0)) page = pdf_writer.getPage(0) # update form fields pdf_writer.updatePageFormFieldValues(page, data_dict) for j in range(0, len(page['/Annots'])): writer_annot = page['/Annots'][j].getObject() for field in data_dict: if writer_annot.get('/T') == field: writer_annot.update({ NameObject("/Ff"): NumberObject(1) # make ReadOnly }) output_stream = BytesIO() pdf_writer.write(output_stream) # output_stream is your flattened PDF def set_need_appearances_writer(writer): # basically used to ensured there are not # overlapping form fields, which makes printing hard try: catalog = writer._root_object # get the AcroForm tree and add "/NeedAppearances attribute if "/AcroForm" not in catalog: writer._root_object.update({
def generate_student_report(a, b):
    """Fill the ``report2.pdf`` template with one student's results.

    :param a: student name; matched against the ``name`` column and used as
        the output file name (``{cwd}/uploads/{a}.pdf``).
    :param b: name of the table the student's row is read from; also passed
        to ``rev_get_exam``.
    :return: the file name of the written PDF, without its directory.
    """
    subjects = ('english', 'science', 'math', 'social')

    def student_data(a, b):
        """Return the first row of table ``b`` whose name equals ``a``."""
        with pymysql.connect(dbhost, dbuser, dbpas, 'exam') as db:
            # The student name is bound as a query parameter instead of being
            # formatted into the SQL text.
            # NOTE(review): a table name cannot be bound as a parameter, so
            # ``b`` is still interpolated -- callers must only pass trusted
            # table names.
            db.execute(f"SELECT * FROM {b} WHERE name = %s", (a,))
            return db.fetchone()

    def set_need_appearances_writer(writer):
        """Best-effort: set /NeedAppearances on the writer's AcroForm,
        creating the /AcroForm entry when it is missing."""
        try:
            catalog = writer._root_object
            if "/AcroForm" not in catalog:
                catalog.update({
                    NameObject("/AcroForm"):
                    IndirectObject(len(writer._objects), 0, writer)
                })
            need_appearances = NameObject("/NeedAppearances")
            catalog["/AcroForm"][need_appearances] = BooleanObject(True)
            return writer
        except Exception as e:
            # Deliberately non-fatal: report and carry on with the writer.
            print('set_need_appearances_writer() catch : ', repr(e))
            return writer

    def percent(obtained, total):
        """Return obtained/total as a percentage string cut to 4 characters."""
        return str((int(obtained) / int(total)) * 100)[:4]

    def create_dict(row):
        """Map a result row onto the form-field names of the template."""
        # Check totals
        fields = {
            'name': row[1],
            'class': get_class_student(row[1]),
            'exam': rev_get_exam(b),
        }
        # Columns 2..9 hold (obtained, total) pairs, one pair per subject.
        for position, subject in enumerate(subjects):
            obtained = row[2 + 2 * position]
            total = row[3 + 2 * position]
            fields[f'{subject}_obt'] = obtained
            fields[f'{subject}_total'] = total
            fields[f'{subject}_percent'] = percent(obtained, total)
            fields[f'{subject}_api'] = getapi(fields[f'{subject}_percent'])
        fields['obt_total'] = row[-3]
        fields['total_total'] = row[-2]
        fields['percentage'] = row[-1]
        fields['total_api'] = fields['english_api'] + fields['science_api'] + \
            fields['math_api'] + fields['social_api']
        return fields

    data = create_dict(student_data(a, b))
    outfile = f'{cwd}/uploads/{a}.pdf'
    infile = f'{cwd}/templates/report2.pdf'

    # Context managers close both files even when PDF processing fails.
    with open(infile, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data)

        # Flag every filled-in field as read-only.
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            if writer_annot.get('/T') in data:
                writer_annot.update({
                    NameObject("/Ff"): NumberObject(1)  # make ReadOnly
                })

        with open(outfile, "wb") as output_stream:
            pdf_writer.write(output_stream)

    return outfile.split('/')[-1]
def convert_to_pdfa(self):
    """
    Transform the opened PDF file into a PDF/A compliant file.

    Sets the file header and document ID, embeds the sRGB ICC profile as the
    document's output intent, rewrites the glyph width arrays of the embedded
    fonts (when fonttools is available), adjusts the outline count, stamps
    the metadata and finally sets ``self.is_pdfa``.
    """
    # PDF/A-3 is based on PDF 1.7.  The header must be "%PDF-1.n" followed by
    # a single EOL marker, immediately followed by '%' and at least four
    # bytes whose value is greater than 127.
    # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
    self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

    # A document ID in the trailer is only needed for encryption in regular
    # PDF, but is mandatory for PDF/A.  The first entry reflects the content
    # at creation time and the second the content at last update; on creation
    # both are the same value.
    document_digest = md5(self._reader.stream.getvalue()).digest()
    pdf_id = ByteStringObject(document_digest)
    self._ID = ArrayObject((pdf_id, pdf_id))

    # Embed the (deflated) ICC profile as a stream object.
    with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_profile:
        compressed_profile = compress(icc_profile.read())

    profile_stream = DecodedStreamObject()
    profile_stream.setData(compressed_profile)
    profile_stream.update({
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/N"): NumberObject(3),
        NameObject("/Length"): NameObject(str(len(compressed_profile))),
    })
    profile_ref = self._addObject(profile_stream)

    # Declare the profile as the document's output intent.
    output_intent = DictionaryObject()
    output_intent.update({
        NameObject("/S"): NameObject("/GTS_PDFA1"),
        NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
        NameObject("/DestOutputProfile"): profile_ref,
        NameObject("/Type"): NameObject("/OutputIntent"),
    })
    output_intent_ref = self._addObject(output_intent)
    self._root_object.update({
        NameObject("/OutputIntents"): ArrayObject([output_intent_ref]),
    })

    pages = self._root_object['/Pages']['/Kids']

    # PDF/A needs the glyph width array embedded in the pdf to be consistent
    # with the one from the font file, which does not seem to be the case
    # when exporting from wkhtmltopdf.
    if not TTFont:
        _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
    else:
        # First pass: collect every descendant font used on any page, keyed
        # by object number so each font is handled once.
        descendant_fonts = {}
        for page in pages:
            page_fonts = page.getObject()['/Resources']['/Font']
            for font in page_fonts.values():
                for descendant in font.getObject()['/DescendantFonts']:
                    descendant_fonts[descendant.idnum] = descendant.getObject()

        # Second pass: rewrite each width array from the font file itself,
        # with width = round(1000 * font_glyph_width / font_units_per_em).
        # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
        for font in descendant_fonts.values():
            font_file = font['/FontDescriptor']['/FontFile2']
            stream = io.BytesIO(decompress(font_file._data))
            ttfont = TTFont(stream)
            units_per_em = ttfont['head'].unitsPerEm
            metrics = ttfont.getGlyphSet()._hmtx.metrics
            widths = [
                NumberObject(round(1000.0 * metric[0] / units_per_em))
                for glyph_name, metric in metrics.items()
                if glyph_name[:5] == 'glyph'
            ]
            font[NameObject('/W')] = ArrayObject(
                [NumberObject(1), ArrayObject(widths)])
            stream.close()

    outlines = self._root_object['/Outlines'].getObject()
    outlines[NameObject('/Count')] = NumberObject(1)

    # Set odoo as producer
    self.addMetadata({
        '/Creator': "Odoo",
        '/Producer': "Odoo",
    })
    self.is_pdfa = True
def make_page_fields_readonly(page):
    """Set the read-only bit (bit 1 of ``/Ff``) on the page's annotations.

    Only annotations that already carry a numeric ``/Ff`` entry are changed;
    their other flag bits are preserved.  A page without an ``/Annots`` entry
    is left untouched.

    :param page: page dictionary whose annotations are updated in place.
    """
    # A page is not required to have annotations; indexing "/Annots"
    # unconditionally raised KeyError for such pages.
    if "/Annots" not in page:
        return

    for annot_ref in page["/Annots"]:
        writer_annot = annot_ref.getObject()
        existing_flags = writer_annot.get("/Ff")
        if isinstance(existing_flags, NumberObject):
            writer_annot.update(
                {NameObject("/Ff"): NumberObject(existing_flags | 1)})