def add_xobject_to_page(page, obj_id):
    """Register ``obj_id`` in the page's ``/XObject`` resources.

    The ``/Resources`` and ``/XObject`` dictionaries are created on the page
    when they are missing. The object is stored under the first unused name
    of the form ``/img_<n>`` (counting up from 0).

    :param page: page dictionary to modify in place
    :param obj_id: value stored under the newly allocated name
    :return: the ``NameObject`` under which ``obj_id`` was stored
    """
    resources = page.setdefault(NameObject('/Resources'), DictionaryObject())
    xobjects = resources.setdefault(NameObject('/XObject'), DictionaryObject())

    # Probe /img_0, /img_1, ... until a free slot is found.
    index = 0
    candidate = NameObject('/img_%s' % index)
    while candidate in xobjects:
        index += 1
        candidate = NameObject('/img_%s' % index)

    xobjects[candidate] = obj_id
    return candidate
def addAttachment(self, name, data, subtype=None):
    """
    Add an attachment to the pdf. Supports adding multiple attachments, while respecting PDF/A rules.

    The file specification is registered in the catalog's ``/Names`` ->
    ``/EmbeddedFiles`` name tree and in the catalog's ``/AF`` array. An
    existing ``/Names`` dictionary is extended rather than replaced, so other
    name trees it may hold are kept.

    :param name: The name of the attachment
    :param data: The data of the attachment
    :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
    """
    adapted_subtype = subtype
    if subtype:
        # If we receive the subtype in an 'unformated' (mimetype) format, we'll try to convert it to a pdf-valid one
        if REGEX_SUBTYPE_UNFORMATED.match(subtype):
            adapted_subtype = '/' + subtype.replace('/', '#2F')

        if not REGEX_SUBTYPE_FORMATED.match(adapted_subtype):
            # The subtype still does not match the correct format, so we will not add it to the document
            _logger.warning(
                "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                subtype)
            adapted_subtype = ''

    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': adapted_subtype,
    })
    # A name-tree leaf is a flat [key, value, key, value, ...] array.
    name_tree_entry = [attachment.getObject()['/F'], attachment]

    if self._root_object.get('/Names') and self._root_object['/Names'].get('/EmbeddedFiles'):
        names_array = self._root_object["/Names"]["/EmbeddedFiles"]["/Names"]
        names_array.extend(name_tree_entry)
    else:
        names_array = ArrayObject()
        names_array.extend(name_tree_entry)

        embedded_files_names_dictionary = DictionaryObject()
        embedded_files_names_dictionary.update({NameObject("/Names"): names_array})

        if self._root_object.get('/Names'):
            # Keep whatever else the existing /Names dictionary holds
            # instead of overwriting the whole dictionary.
            self._root_object['/Names'].update({
                NameObject("/EmbeddedFiles"): embedded_files_names_dictionary
            })
        else:
            embedded_files_dictionary = DictionaryObject()
            embedded_files_dictionary.update({
                NameObject("/EmbeddedFiles"): embedded_files_names_dictionary
            })
            self._root_object.update({NameObject("/Names"): embedded_files_dictionary})

    if self._root_object.get('/AF'):
        attachment_array = self._root_object['/AF']
        attachment_array.extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        attachment_array = self._addObject(ArrayObject([attachment]))
        self._root_object.update({NameObject("/AF"): attachment_array})
def _create_attachment_object(self, attachment):
    '''
    Build the objects describing one embedded file and register them.

    An ``/EmbeddedFile`` stream holding the content is added to the document,
    followed by a ``/Filespec`` dictionary that references it.

    :param attachment: A dictionary containing:
        * filename: The name of the file to embed (required)
        * content: The bytes of the file to embed (required)
        * subtype: The mime-type of the file to embed (optional)
        * description: Text stored as the ``/Desc`` entry (optional)
    :return: whatever ``self._addObject`` returns for the ``/Filespec`` dictionary
    '''
    content = attachment['content']

    stream = DecodedStreamObject()
    stream.setData(content)

    checksum = createStringObject(md5(content).hexdigest())
    mod_date = createStringObject(datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT))
    size = NameObject(str(len(content)))
    params = DictionaryObject({
        NameObject('/CheckSum'): checksum,
        NameObject('/ModDate'): mod_date,
        NameObject('/Size'): size,
    })

    stream[NameObject("/Type")] = NameObject("/EmbeddedFile")
    stream[NameObject("/Params")] = params
    if attachment.get('subtype'):
        stream[NameObject("/Subtype")] = NameObject(attachment['subtype'])
    stream_ref = self._addObject(stream)

    filename = createStringObject(attachment['filename'])
    # Both /F and /UF of the /EF dictionary point at the same stream.
    embedded = DictionaryObject({
        NameObject("/F"): stream_ref,
        NameObject('/UF'): stream_ref,
    })
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): filename,
        NameObject("/EF"): embedded,
        NameObject("/UF"): filename,
    })

    if attachment.get('description'):
        filespec[NameObject("/Desc")] = createStringObject(attachment['description'])
    return self._addObject(filespec)
def create_annot_box(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Square`` annotation dictionary for the given rectangle.

    :param x1, y1, x2, y2: values written, in this order, to ``/Rect``
    :param meta: mapping providing ``"author"`` (``/T``) and ``"contents"`` (``/Contents``)
    :param color: components written to ``/C`` (only read, never modified)
    :return: the new ``DictionaryObject``
    """
    box = DictionaryObject()
    box[NameObject("/F")] = NumberObject(4)
    box[NameObject("/Type")] = NameObject("/Annot")
    box[NameObject("/Subtype")] = NameObject("/Square")
    box[NameObject("/T")] = TextStringObject(meta["author"])
    box[NameObject("/Contents")] = TextStringObject(meta["contents"])
    box[NameObject("/C")] = ArrayObject([FloatObject(component) for component in color])
    box[NameObject("/Rect")] = ArrayObject(
        [FloatObject(coord) for coord in (x1, y1, x2, y2)])
    return box
def createHighlight(self, x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary.

    :param x1, y1, x2, y2: values written, in this order, to ``/Rect``; the
        ``/QuadPoints`` are the four corners derived from them
    :param meta: mapping providing ``"author"`` (``/T``) and ``"contents"`` (``/Contents``)
    :param color: components written to ``/C`` (only read, never modified)
    :return: the new ``DictionaryObject``
    """
    rect = (x1, y1, x2, y2)
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad = (x1, y2, x2, y2, x1, y1, x2, y1)

    entries = [
        ("/F", NumberObject(4)),
        ("/Type", NameObject("/Annot")),
        ("/Subtype", NameObject("/Highlight")),
        ("/T", TextStringObject(meta["author"])),
        ("/Contents", TextStringObject(meta["contents"])),
        ("/C", ArrayObject([FloatObject(c) for c in color])),
        ("/Rect", ArrayObject([FloatObject(v) for v in rect])),
        ("/QuadPoints", ArrayObject([FloatObject(v) for v in quad])),
    ]

    highlight = DictionaryObject()
    for key, value in entries:
        highlight[NameObject(key)] = value
    return highlight
def createHighlight(x0, y0, x1, y1, color=[0, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary without author or text.

    :param x0, y0, x1, y1: values written, in this order, to ``/Rect``; the
        ``/QuadPoints`` are the four corners derived from them
    :param color: components written to ``/C`` (only read, never modified)
    :return: the new ``DictionaryObject``
    """
    color_array = ArrayObject([FloatObject(c) for c in color])
    rect_array = ArrayObject([FloatObject(v) for v in (x0, y0, x1, y1)])
    # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
    quad_array = ArrayObject(
        [FloatObject(v) for v in (x0, y1, x1, y1, x0, y0, x1, y0)])

    highlight = DictionaryObject()
    highlight[NameObject("/F")] = NumberObject(4)
    highlight[NameObject("/Type")] = NameObject("/Annot")
    highlight[NameObject("/Subtype")] = NameObject("/Highlight")
    highlight[NameObject("/C")] = color_array
    highlight[NameObject("/Rect")] = rect_array
    highlight[NameObject("/QuadPoints")] = quad_array
    return highlight
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict, file_dict, file_bin):
    """Embed one extra file and record its file specification.

    An ``/EmbeddedFile`` stream and a ``/Filespec`` dictionary are added to
    ``pdf_filestream``; the result is stored in ``name_arrayobj_cdict`` keyed
    by the PDF string object of the file name. Nothing is returned.

    :param pdf_filestream: object whose ``_addObject`` registers the new objects
    :param name_arrayobj_cdict: mapping updated in place
    :param file_dict: mapping with ``'filename'`` and ``'mod_date'``; ``'desc'`` is optional
    :param file_bin: content of the file, passed to ``hashlib.md5`` and ``setData``
    """
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    pdf_mod_date = _get_pdf_timestamp(file_dict['mod_date'])
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())
    params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(pdf_mod_date),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    stream = DecodedStreamObject()
    stream.setData(file_bin)

    # Fall back to a generic type when the extension is unknown; the '/' of
    # the mimetype is written as '#2f' inside the PDF name.
    mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
    subtype_name = '/' + mimetype.replace('/', '#2f')

    stream[NameObject("/Type")] = NameObject("/EmbeddedFile")
    stream[NameObject("/Params")] = params
    stream[NameObject("/Subtype")] = NameObject(subtype_name)
    stream_ref = pdf_filestream._addObject(stream)

    embedded = DictionaryObject({NameObject("/F"): stream_ref})
    fname_obj = createStringObject(filename)
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): embedded,
        NameObject("/UF"): fname_obj,
    })
    name_arrayobj_cdict[fname_obj] = pdf_filestream._addObject(filespec)
def test_FlateDecode(predictor, s):
    """Check that FlateDecode.decode() restores what encode() produced."""
    flate = FlateDecode()
    raw = s.encode()
    compressed = flate.encode(raw)
    decode_parms = DictionaryObject({"/Predictor": predictor})
    restored = flate.decode(compressed, decode_parms)
    assert restored == raw
def addAttachment(self, name, data, subtype=""):
    """
    Add an attachment to the pdf. Supports adding multiple attachments, while respecting PDF/A rules.

    The file specification is registered in the catalog's ``/Names`` ->
    ``/EmbeddedFiles`` name tree and in the catalog's ``/AF`` array. An
    existing ``/Names`` dictionary is extended rather than replaced, so other
    name trees it may hold are kept.

    :param name: The name of the attachment
    :param data: The data of the attachment
    :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml".
        A plain mimetype such as "text/xml" is converted to that form.
    """
    # A plain mimetype ("type/subtype") is not a valid PDF name: prefix it
    # with '/' and write the inner '/' as '#2F'. Values already starting
    # with '/' are passed through untouched.
    if subtype and not subtype.startswith('/') and '/' in subtype:
        subtype = '/' + subtype.replace('/', '#2F')

    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': subtype,
    })
    # A name-tree leaf is a flat [key, value, key, value, ...] array.
    name_tree_entry = [attachment.getObject()['/F'], attachment]

    if self._root_object.get('/Names') and self._root_object['/Names'].get('/EmbeddedFiles'):
        names_array = self._root_object["/Names"]["/EmbeddedFiles"]["/Names"]
        names_array.extend(name_tree_entry)
    else:
        names_array = ArrayObject()
        names_array.extend(name_tree_entry)

        embedded_files_names_dictionary = DictionaryObject()
        embedded_files_names_dictionary.update({NameObject("/Names"): names_array})

        if self._root_object.get('/Names'):
            # Keep whatever else the existing /Names dictionary holds
            # instead of overwriting the whole dictionary.
            self._root_object['/Names'].update({
                NameObject("/EmbeddedFiles"): embedded_files_names_dictionary
            })
        else:
            embedded_files_dictionary = DictionaryObject()
            embedded_files_dictionary.update({
                NameObject("/EmbeddedFiles"): embedded_files_names_dictionary
            })
            self._root_object.update({NameObject("/Names"): embedded_files_dictionary})

    if self._root_object.get('/AF'):
        attachment_array = self._root_object['/AF']
        attachment_array.extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        attachment_array = self._addObject(ArrayObject([attachment]))
        self._root_object.update({NameObject("/AF"): attachment_array})
def createHighlight(bbox=(0, 0, 1, 1), contents="", color=[1, 1, 0], author="iwasakishuto(@cabernet_rock)"):
    """Build the dictionary describing a highlight annotation.

    Args:
        bbox (tuple)   : Four values ``(x1, y1, x2, y2)`` written to ``/Rect``.
        contents (str) : Text stored as the annotation's ``/Contents``.
        color (list)   : Components written to ``/C``. Defaults to ``[1,1,0]``.
        author (str)   : Text stored as the annotation's ``/T``. Defaults to ``"iwasakishuto(@cabernet_rock)"`` .

    Returns:
        DictionaryObject: The highlight annotation.

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (DictionaryObject, NumberObject, FloatObject,
                                NameObject, TextStringObject, ArrayObject)

    x1, y1, x2, y2 = bbox
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    corners = [x1, y2, x2, y2, x1, y1, x2, y1]

    highlight = DictionaryObject()
    highlight[NameObject("/F")] = NumberObject(4)
    highlight[NameObject("/Type")] = NameObject("/Annot")
    highlight[NameObject("/Subtype")] = NameObject("/Highlight")
    highlight[NameObject("/T")] = TextStringObject(author)
    highlight[NameObject("/Contents")] = TextStringObject(contents)
    highlight[NameObject("/C")] = ArrayObject([FloatObject(c) for c in color])
    highlight[NameObject("/Rect")] = ArrayObject([FloatObject(v) for v in bbox])
    highlight[NameObject("/QuadPoints")] = ArrayObject([FloatObject(v) for v in corners])
    return highlight
def create_highlight(self, x1, y1, x2, y2, meta, color=[0, 1, 0]):
    """
    Build a highlight annotation dictionary for a PDF.

    Parameters
    ----------
    x1, y1 : float
        bottom left corner
    x2, y2 : float
        top right corner
    meta : dict
        keys are "author" and "contents"
    color : iterable
        Three elements, (r,g,b)

    Returns
    -------
    DictionaryObject
        the ``/Highlight`` annotation
    """
    def as_floats(values):
        # Wrap each plain number into a FloatObject inside an ArrayObject.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()
    annotation[NameObject("/F")] = NumberObject(4)
    annotation[NameObject("/Type")] = NameObject("/Annot")
    annotation[NameObject("/Subtype")] = NameObject("/Highlight")
    annotation[NameObject("/T")] = TextStringObject(meta["author"])
    annotation[NameObject("/Contents")] = TextStringObject(meta["contents"])
    annotation[NameObject("/C")] = as_floats(color)
    annotation[NameObject("/Rect")] = as_floats((x1, y1, x2, y2))
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    annotation[NameObject("/QuadPoints")] = as_floats(
        (x1, y2, x2, y2, x1, y1, x2, y1))
    return annotation
def _create_annotation(x1, y1, x2, y2, color, subtype):
    """Build a minimal annotation dictionary (``/Subtype``, ``/C``, ``/Rect``).

    :param x1, y1, x2, y2: values written, in this order, to ``/Rect``
    :param color: components written to ``/C``
    :param subtype: string turned into the ``/Subtype`` name
    :return: the new ``DictionaryObject``
    """
    result = DictionaryObject()
    result[NameObject('/Subtype')] = NameObject(subtype)
    result[NameObject('/C')] = ArrayObject([FloatObject(part) for part in color])
    result[NameObject('/Rect')] = ArrayObject(
        [FloatObject(coord) for coord in (x1, y1, x2, y2)])
    return result
def test_FlateDecode_unsupported_predictor():
    """
    Feed FlateDecode predictors outside the [10, 15] range and check that
    PdfReadError is raised for every input string. The test may be removed
    once support for these predictors is added.
    """
    flate = FlateDecode()
    unsupported = (-10, -1, 0, 9, 16, 20, 100)

    for predictor, text in cartesian_product(unsupported, filter_inputs):
        raw = text.encode()
        with pytest.raises(PdfReadError):
            compressed = flate.encode(raw)
            flate.decode(compressed, DictionaryObject({"/Predictor": predictor}))
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Append an attachment to a PDF.

    A ``/Filespec`` for ``fdata`` is registered under ``fname`` in the
    ``/Names`` -> ``/EmbeddedFiles`` -> ``/Names`` array of the writer's
    catalog. Missing levels of that structure are created; an existing
    ``/Names`` dictionary without ``/EmbeddedFiles`` is extended instead of
    raising ``KeyError``.

    :param writer: the writer whose catalog (``_root_object``) is updated
    :param fname: file name, used as the name-tree key and as ``/F``
    :param fdata: content of the file to embed
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    efEntry = DictionaryObject()
    efEntry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): efEntry,
    })

    catalog = writer._root_object
    # A name-tree leaf is a flat [key, value, key, value, ...] array.
    name_tree_entry = [createStringObject(fname), filespec]

    if "/Names" in catalog and "/EmbeddedFiles" in catalog["/Names"]:
        # There are files already attached. Append the new file.
        # NOTE(review): still assumes the tree has a flat /Names array
        # (no /Kids) — confirm for PDFs produced by other tools.
        catalog["/Names"]["/EmbeddedFiles"]["/Names"].extend(name_tree_entry)
        return

    embeddedFilesNamesDictionary = DictionaryObject()
    embeddedFilesNamesDictionary.update({
        NameObject("/Names"): ArrayObject(name_tree_entry)
    })

    if "/Names" in catalog:
        # /Names exists but has no embedded files yet: add the tree to it.
        catalog["/Names"].update(
            {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})
    else:
        # No /Names at all. Create the entry for the root.
        embeddedFilesDictionary = DictionaryObject()
        embeddedFilesDictionary.update(
            {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})
        catalog.update({NameObject("/Names"): embeddedFilesDictionary})
def test_CCITTFaxDecode():
    """Pin the output of CCITTFaxDecode.decode() for empty input."""
    # This was just the result PyPDF2 1.27.9 returned.
    # It would be awesome if we could check if that is actually correct.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00"
    )
    empty_input = b""
    decode_parms = DictionaryObject({
        "/K": NumberObject(-1),
        "/Columns": NumberObject(17)
    })

    actual = CCITTFaxDecode.decode(empty_input, decode_parms)
    assert actual == expected
def _create_highlight(self, x0, y0, width, height, comment, author='', color=[0, 0, 0, 0]):
    """Draw a rectangle and build the matching ``/Highlight`` annotation.

    ``self.add_rect`` is called with the same geometry, then an annotation
    covering ``(x0, y0)`` to ``(x0 + width, y0 + height)`` is built.

    :param x0, y0: first corner of the highlighted area
    :param width: horizontal extent of the area
    :param height: vertical extent of the area
    :param comment: text stored as ``/Contents``
    :param author: text stored as ``/T``
    :param color: components written to ``/C`` (only read, never modified)
    :return: the new ``DictionaryObject``
    """
    self.add_rect(x0, y0, width, height)

    x1 = x0 + width
    # The vertical extent uses ``height``; using ``width`` here would make
    # every annotation square regardless of the drawn rectangle.
    y1 = y0 + height

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(comment),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y1),
        ]),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x0), FloatObject(y1),
            FloatObject(x1), FloatObject(y1),
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y0),
        ]),
    })
    return highlight
def add_comment(output, page, text, rectangle):
    """Attach a ``/FreeText`` annotation with ``text`` to ``page``.

    The annotation is registered through ``output._addObject`` and appended
    to the page's ``/Annots`` array, which is created when the page has none.

    :param output: object whose ``_addObject`` registers the annotation
    :param page: page dictionary to modify in place
    :param text: text stored as ``/Contents``
    :param rectangle: value passed to ``RectangleObject`` for ``/Rect``
    """
    annotation = output._addObject(
        DictionaryObject({
            NameObject('/DA'): TextStringObject(' /Helv 10 Tf'),
            NameObject('/Subtype'): NameObject('/FreeText'),
            NameObject('/Rect'): RectangleObject(rectangle),
            NameObject('/Type'): NameObject('/Annot'),
            NameObject('/Contents'): TextStringObject(text),
            NameObject('/C'): ArrayObject([FloatObject(1), FloatObject(1), FloatObject(1)]),
        }))

    if '/Annots' in page:
        page['/Annots'].append(annotation)
    else:
        # A page without annotations has no /Annots key at all.
        page[NameObject('/Annots')] = ArrayObject([annotation])
def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    '''
    Build a highlight annotation covering a box of a PDF page (coordinates
    start in the bottom left), carrying the given metadata and color.

    ``meta`` must provide "author" and "contents"; ``color`` is only read.
    '''
    color_array = ArrayObject([FloatObject(c) for c in color])
    rect_array = ArrayObject([FloatObject(v) for v in (x1, y1, x2, y2)])
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_array = ArrayObject(
        [FloatObject(v) for v in (x1, y2, x2, y2, x1, y1, x2, y1)])

    return_value = DictionaryObject()
    return_value[NameObject("/F")] = NumberObject(4)
    return_value[NameObject("/Type")] = NameObject("/Annot")
    return_value[NameObject("/Subtype")] = NameObject("/Highlight")
    return_value[NameObject("/T")] = TextStringObject(meta["author"])
    return_value[NameObject("/Contents")] = TextStringObject(meta["contents"])
    return_value[NameObject("/C")] = color_array
    return_value[NameObject("/Rect")] = rect_array
    return_value[NameObject("/QuadPoints")] = quad_array
    return return_value
def get_pdf_measure(m, gcs, poly, bounds_default):
    """
    Build the PDF Measure dictionary.

    The Measure dictionary is used in the viewport array and specifies
    the scale and units that apply to the output map. ``bounds_default``
    is accepted but not used here.
    """
    # The PDF's bounds array is equivalent to the map's neatline, i.e.,
    # the border delineating the extent of geographic data on the output map.
    unit_square = ArrayObject(
        [FloatObject(str(value)) for value in (0, 1, 0, 0, 1, 0, 1, 1)])

    result = DictionaryObject()
    result[NameObject('/Type')] = NameObject('/Measure')
    result[NameObject('/Subtype')] = NameObject('/GEO')
    result[NameObject('/Bounds')] = unit_square
    result[NameObject('/GPTS')] = get_pdf_gpts(m, poly)
    # /LPTS deliberately reuses the very same array object as /Bounds.
    result[NameObject('/LPTS')] = unit_square
    result[NameObject('/GCS')] = gcs
    return result
def test_DictionaryObject_xmp_meta():
    """A dictionary holding only /S has no XMP metadata."""
    action = DictionaryObject({NameObject("/S"): NameObject("/GoTo")})
    metadata = action.xmp_metadata
    assert metadata is None
def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
    '''Embed the Factur-X XML and update the catalog and document metadata.

    This method is inspired from the code of the addAttachment() method of
    the PyPDF2 lib. It adds the XML as an embedded file, registers its file
    specification in ``/Names`` and ``/AF``, adds the XMP ``/Metadata``
    stream, sets ``/PageMode`` to ``/UseAttachments``, adds the given output
    intents and finally calls ``self.addMetadata``.

    :param pdf_metadata: passed to ``_prepare_pdf_metadata_xml`` and
        ``_prepare_pdf_metadata_txt``
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs
    '''
    # The entry for the file
    facturx_xml_str = self.factx.xml_str
    # The checksum must cover the embedded content; hashing nothing would
    # store the digest of the empty string for every invoice.
    # NOTE(review): xml_str is presumably bytes already; a str is hashed as
    # UTF-8 — confirm against the producer of ``self.factx.xml_str``.
    if isinstance(facturx_xml_str, str):
        xml_bytes = facturx_xml_str.encode('utf-8')
    else:
        xml_bytes = facturx_xml_str
    md5sum = hashlib.md5(xml_bytes).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = self._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    xmp_filename = self.factx.flavor.details['xmp_filename']
    fname_obj = createStringObject(xmp_filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = self._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}

    # TODO: add back additional attachments?
    logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
    # Entries are ordered by file name before being flattened into the
    # [name, filespec, name, filespec, ...] array.
    name_arrayobj_content_sort = list(
        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })

    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    logger.debug('output_intents=%s', output_intents)
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = self._addObject(dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = self._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
    xmp_template = self.factx.flavor.get_xmp_xml()
    metadata_xml_str = _prepare_pdf_metadata_xml(
        xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = self._addObject(metadata_file_entry)
    af_value_obj = self._addObject(ArrayObject(af_list))
    self._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    logger.debug('res_output_intents=%s', res_output_intents)
    if res_output_intents:
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    self.addMetadata(metadata_txt_dict)
def main():
    """Assemble a book PDF from previously saved JSON dumps.

    Reads ``bookinfo.json``, ``pageinfo.json`` and ``pages.json`` from the
    current directory, writes every base64-encoded page into a temporary
    directory, merges the pages into one PDF, adds bookmarks when
    ``bookmarks.json`` can be read, adds page labels and writes
    ``<title>.pdf`` to the current directory.
    """
    print("Loading metadata and eText information...")

    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        """Return the decoded PDF bytes stored for ``page_id``."""
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        if b is None:
            # Fail with a readable message instead of a TypeError on None.
            raise ValueError("No data found in pages.json for pageID {!r}".format(page_id))
        # Strip the data-URL prefix before decoding the base64 payload.
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}

        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))

        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly; the context
        # manager releases the worker threads once the map has finished.
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True
        bookmarkInfo = None

        # TODO: Bookmarks currently not supported
        # Best effort: a missing, unreadable or unexpected bookmarks.json
        # simply means the book gets no bookmarks.
        try:
            with open("bookmarks.json", 'r') as bookmarkInfoRequest:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
        except Exception:
            bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)

                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # NameObject is used here to emit the raw tokens ``0`` and
        # ``(cover)`` into the /Nums array.
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                curr_mode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                curr_mode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # A new label range starts whenever the numbering style or the
            # prefix changes.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def create_annotation(x, y, meta):
    """Build a URI link annotation and a text (comment) annotation at (x, y).

    See https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    for the tables referenced in the comments below.

    :param x, y: lower-left corner of both annotation rectangles
    :param meta: mapping providing ``"contents"`` (link target and comment
        text) and ``"author"`` (comment title)
    :return: tuple ``(linkAnnotation, commentAnnotation)``
    """
    color = [255.0 / 255.0, 209 / 255.0, 0]

    def rgb():
        # A fresh /C array for each annotation.
        return ArrayObject([FloatObject(c) for c in color])

    def square(side):
        # Rectangle anchored at (x, y) with the given side length.
        return ArrayObject([
            FloatObject(x), FloatObject(y),
            FloatObject(x + side), FloatObject(y + side),
        ])

    # --- link annotation -------------------------------------------------
    link = DictionaryObject()
    # Table 165: flags value 4 is the Print flag
    link[NameObject("/F")] = NumberObject(4)
    link[NameObject("/Type")] = NameObject("/Annot")
    link[NameObject("/Subtype")] = NameObject("/Link")
    # Table 164: color and annotation rectangle
    link[NameObject("/C")] = rgb()
    link[NameObject("/Rect")] = square(20)
    # Table 173 / Table 206: URI action triggered by the link
    uri_action = DictionaryObject({
        NameObject('/S'): NameObject('/URI'),
        NameObject('/URI'): TextStringObject(meta["contents"])
    })
    link[NameObject('/A')] = uri_action
    # Table 173: invert the rectangle while the mouse button is held
    link[NameObject('/H')] = NameObject('/I')
    # Table 164: horizontal corner radius, vertical corner radius, border width
    link[NameObject('/Border')] = ArrayObject([
        NameObject(0), NameObject(0), NameObject(5),
    ])

    # --- comment annotation ----------------------------------------------
    comment = DictionaryObject()
    comment[NameObject("/F")] = NumberObject(4)
    comment[NameObject("/Type")] = NameObject("/Annot")
    comment[NameObject("/Subtype")] = NameObject("/Text")
    # Table 170: title bar text and contents
    comment[NameObject("/T")] = TextStringObject(meta["author"])
    comment[NameObject("/Contents")] = TextStringObject(meta["contents"])
    comment[NameObject("/C")] = rgb()
    comment[NameObject("/Rect")] = square(5)
    # 12.5.6.4 text annotation: initially closed, shown with the Comment icon
    comment[NameObject('/Open')] = BooleanObject(False)
    comment[NameObject('/Name')] = NameObject('/Comment')

    return link, comment
def main(bookId):
    """Download a book page by page and merge it into a single PDF.

    ``bookId`` is either the numeric id (as a string) or a URL from which
    the id is extracted. Metadata, page list and bookmarks are fetched over
    HTTP, every page is downloaded into a temporary directory, the pages
    are merged, bookmarks and page labels are added and the result is
    written to ``<bookId> - <title>.pdf`` in the current directory.

    :param bookId: book id or URL containing it
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" holds "key::value::key::value..."; pair the items up
            # into the same {key: [value]} shape parse_qs produces.
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}

        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly; the context
        # manager releases the worker threads once the map has finished.
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)
        bookmarksExist = True

        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # Best effort: any failure while reading or unpacking the
            # response means the book is written without bookmarks.
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)

                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark(
                "Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable,
                                   key=(lambda x: int(x[0])))
        # NameObject is used here to emit the raw tokens ``0`` and
        # ``(cover)`` into the /Nums array.
        labels = ArrayObject([
            NameObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # A new label range starts whenever the numbering style or the
            # prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def test_DictionaryObject_key_is_no_pdfobject():
    """Storing a value under a plain ``str`` key is expected to raise ValueError."""
    dictionary = DictionaryObject({NameObject("/S"): NameObject("/GoTo")})

    with pytest.raises(ValueError) as raised:
        dictionary["foo"] = NameObject("/GoTo")

    # The first exception argument carries the human-readable reason.
    message = raised.value.args[0]
    assert message == "key must be PdfObject"
def _facturx_update_metadata_add_attachment(
        pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
        output_intents):
    """Embed the Factur-X XML file into the PDF and update the catalog.

    Builds the embedded-file stream, its file specification, the optional
    output intents and the XMP metadata stream, registers them on
    ``pdf_filestream`` and points the catalog's ``/AF``, ``/Metadata``,
    ``/Names`` and ``/PageMode`` entries (plus ``/OutputIntents`` when any
    are given) at them. Finally the document info dictionary is updated
    through ``addMetadata``.

    :param pdf_filestream: writer object providing ``_addObject``,
        ``_root_object`` and ``addMetadata`` (presumably a PyPDF2
        ``PdfFileWriter`` -- confirm against callers)
    :param facturx_xml_str: XML payload; it is hashed with ``hashlib.md5``
        so it must be a bytes-like object
    :param pdf_metadata: passed through to ``_prepare_pdf_metadata_xml``
        and ``_prepare_pdf_metadata_txt``
    :param facturx_level: passed through to ``_prepare_pdf_metadata_xml``
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
        ``output_intent_dict`` is updated in place with a
        ``/DestOutputProfile`` reference
    """
    # NOTE(review): /CheckSum is stored as the hex digest, as before; the
    # PDF spec describes it as a 16-byte string -- confirm with a validator.
    md5sum_obj = createStringObject(hashlib.md5(facturx_xml_str).hexdigest())
    # PDF date strings use the "D:YYYYMMDDHHmmSSOHH'mm'" syntax, not
    # ISO 8601. The timestamp is taken in UTC so the offset is always exact.
    mod_date = datetime.datetime.now(datetime.timezone.utc).strftime(
        "D:%Y%m%d%H%M%S+00'00'")
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date),
        # /Size is an integer entry, so emit a real number object.
        NameObject('/Size'): NumberObject(len(facturx_xml_str)),
    })

    # The embedded file stream: here we integrate the file itself.
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)

    # Only one file is embedded, so the name tree and the /AF array each
    # hold exactly this one file specification.
    af_list = [filespec_obj]
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj]),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })

    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
def test_DictionaryObject_setdefault_value():
    """``setdefault`` with a PdfObject key and value is expected not to raise."""
    initial_entries = {NameObject("/S"): NameObject("/GoTo")}
    dictionary = DictionaryObject(initial_entries)

    # The key is already present; the call only has to complete cleanly.
    dictionary.setdefault(NameObject("/S"), NameObject("/GoTo"))
def convert_to_pdfa(self):
    """
    Transform the opened PDF file into a PDF/A compliant file.

    Rewrites the header and trailer ID, attaches an sRGB output intent,
    rewrites the glyph width arrays of the embedded composite fonts (when
    fonttools is available), normalises the outline count and finally sets
    the creator/producer metadata and the ``is_pdfa`` flag.

    Pages without font resources, fonts without ``/DescendantFonts`` and
    documents without ``/Outlines`` are skipped instead of raising
    ``KeyError``.
    """
    # Set the PDF version to 1.7 (as PDF/A-3 is based on version 1.7) and make it PDF/A compliant.
    # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1

    # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
    # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
    # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
    # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
    self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

    # Add a document ID to the trailer. This is only needed when using encryption with regular PDF, but is required
    # when using PDF/A
    pdf_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
    # The first string is based on the content at the time of creating the file, while the second is based on the
    # content of the file when it was last updated. When creating a PDF, both are set to the same value.
    self._ID = ArrayObject((pdf_id, pdf_id))

    with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_profile:
        icc_profile_file_data = compress(icc_profile.read())

    # The stream data is already deflated here, hence the explicit /Filter entry.
    icc_profile_stream_obj = DecodedStreamObject()
    icc_profile_stream_obj.setData(icc_profile_file_data)
    icc_profile_stream_obj.update({
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/N"): NumberObject(3),
        # /Length is an integer entry, so emit a real number object.
        NameObject("/Length"): NumberObject(len(icc_profile_file_data)),
    })
    icc_profile_obj = self._addObject(icc_profile_stream_obj)

    output_intent_dict_obj = DictionaryObject()
    output_intent_dict_obj.update({
        NameObject("/S"): NameObject("/GTS_PDFA1"),
        NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
        NameObject("/DestOutputProfile"): icc_profile_obj,
        NameObject("/Type"): NameObject("/OutputIntent"),
    })
    output_intent_obj = self._addObject(output_intent_dict_obj)
    self._root_object.update({
        NameObject("/OutputIntents"): ArrayObject([output_intent_obj]),
    })

    pages = self._root_object['/Pages']['/Kids']

    # PDF/A needs the glyphs width array embedded in the pdf to be consistent with the ones from the font file.
    # But it seems like it is not the case when exporting from wkhtmltopdf.
    if TTFont:
        fonts = {}
        # First browse through all the pages of the pdf file, to get a reference to all the fonts used in the PDF.
        for page in pages:
            page_obj = page.getObject()
            # A page without text has no font resources: nothing to collect.
            if '/Resources' not in page_obj or '/Font' not in page_obj['/Resources']:
                continue
            for font in page_obj['/Resources']['/Font'].values():
                font_obj = font.getObject()
                # Only fonts with descendant fonts are rewritten below.
                if '/DescendantFonts' not in font_obj:
                    continue
                for descendant in font_obj['/DescendantFonts']:
                    fonts[descendant.idnum] = descendant.getObject()

        # Then for each font, rewrite the width array with the information taken directly from the font file.
        # The new width are calculated such as width = round(1000 * font_glyph_width / font_units_per_em)
        # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
        for font in fonts.values():
            font_file = font['/FontDescriptor']['/FontFile2']
            # The context manager closes the buffer even if font parsing fails.
            with io.BytesIO(decompress(font_file._data)) as stream:
                ttfont = TTFont(stream)
                font_upm = ttfont['head'].unitsPerEm
                glyphs = ttfont.getGlyphSet()._hmtx.metrics
                glyph_widths = [
                    NumberObject(round(1000.0 * metrics[0] / font_upm))
                    for glyph_name, metrics in glyphs.items()
                    if glyph_name.startswith('glyph')
                ]
            font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(glyph_widths)])
    else:
        _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')

    # Documents without an outline tree have no count to normalise.
    if '/Outlines' in self._root_object:
        outlines = self._root_object['/Outlines'].getObject()
        outlines[NameObject('/Count')] = NumberObject(1)

    # Set Odoo as creator and producer
    self.addMetadata({
        '/Creator': "Odoo",
        '/Producer': "Odoo",
    })
    self.is_pdfa = True
def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname, fdata):
    '''Embed the ZUGFeRD XML file in the PDF and update the catalog.

    This method is inspired from the code of the addAttachment()
    method of the PyPDF2 lib.

    The file specification is registered on the writer exactly once and
    that single reference is shared by the ``/EmbeddedFiles`` name tree
    and the ``/AF`` array, so both point at the same object.

    :param pdf_filestream: writer object providing ``_addObject``,
        ``_root_object`` and ``addMetadata`` (presumably a PyPDF2
        ``PdfFileWriter`` -- confirm against callers)
    :param fname: file name stored in the file specification
    :param fdata: content of the embedded file, handed to ``setData``
    '''
    # The entry for the file
    moddate = DictionaryObject()
    moddate.update({
        NameObject('/ModDate'): createStringObject(self._get_pdf_timestamp())
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): moddate,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    efEntry = DictionaryObject()
    efEntry.update({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject(fname)
    filespec = DictionaryObject()
    filespec.update({
        NameObject("/AFRelationship"): NameObject("/Alternative"),
        NameObject("/Desc"): createStringObject("ZUGFeRD Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): efEntry,
        NameObject("/UF"): fname_obj,
    })
    # Register the Filespec once; adding it twice would write two distinct
    # copies, with /AF and /Names each referencing a different one.
    filespec_obj = pdf_filestream._addObject(filespec)

    embeddedFilesNamesDictionary = DictionaryObject()
    embeddedFilesNamesDictionary.update({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj])
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embeddedFilesDictionary = DictionaryObject()
    embeddedFilesDictionary.update(
        {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})

    # Update the root
    metadata_xml_str = self._prepare_pdf_metadata()
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    # A metadata stream must be typed /Metadata with subtype /XML.
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_value = pdf_filestream._addObject(metadata_file_entry)
    af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value,
        NameObject("/Metadata"): metadata_value,
        NameObject("/Names"): embeddedFilesDictionary,
    })

    info_dict = self._prepare_pdf_info()
    pdf_filestream.addMetadata(info_dict)
def test_DictionaryObject_setdefault_value_is_no_pdfobject():
    """``setdefault`` with a plain ``str`` value is expected to raise ValueError."""
    dictionary = DictionaryObject({NameObject("/S"): NameObject("/GoTo")})

    with pytest.raises(ValueError) as raised:
        dictionary.setdefault(NameObject("/S"), "/GoTo")

    # The first exception argument carries the human-readable reason.
    message = raised.value.args[0]
    assert message == "value must be PdfObject"