def createHighlight(self, x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary for a rectangle.

    :param x1, y1, x2, y2: rectangle coordinates; written to ``/Rect`` in
        this order and expanded into the four ``/QuadPoints`` corners.
    :param meta: mapping providing the ``"author"`` and ``"contents"`` keys.
    :param color: iterable of colour components (never mutated here).
    :return: the new ``DictionaryObject``; it is not attached to any page.
    """
    annotation = DictionaryObject()
    # Entries are inserted one by one, in the order they should be stored.
    annotation[NameObject("/F")] = NumberObject(4)
    annotation[NameObject("/Type")] = NameObject("/Annot")
    annotation[NameObject("/Subtype")] = NameObject("/Highlight")
    annotation[NameObject("/T")] = TextStringObject(meta["author"])
    annotation[NameObject("/Contents")] = TextStringObject(meta["contents"])
    annotation[NameObject("/C")] = ArrayObject(
        [FloatObject(component) for component in color])
    annotation[NameObject("/Rect")] = ArrayObject(
        [FloatObject(value) for value in (x1, y1, x2, y2)])
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    annotation[NameObject("/QuadPoints")] = ArrayObject(
        [FloatObject(value) for value in (x1, y2, x2, y2, x1, y1, x2, y1)])
    return annotation
def create_annot_box(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Square`` annotation dictionary covering a rectangle.

    :param x1, y1, x2, y2: rectangle coordinates, stored as ``/Rect``.
    :param meta: mapping providing the ``"author"`` and ``"contents"`` keys.
    :param color: iterable of colour components (never mutated here).
    :return: the new ``DictionaryObject``; no parent (``/P``) reference is
        set and the annotation is not attached to any page.
    """
    author = TextStringObject(meta["author"])
    contents = TextStringObject(meta["contents"])
    colour = ArrayObject([FloatObject(component) for component in color])
    rect = ArrayObject([FloatObject(value) for value in (x1, y1, x2, y2)])

    box = DictionaryObject()
    box.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Square"),
        NameObject("/T"): author,
        NameObject("/Contents"): contents,
        NameObject("/C"): colour,
        NameObject("/Rect"): rect,
    })
    return box
def createHighlight(x0, y0, x1, y1, color=[0, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary without author/contents.

    :param x0, y0, x1, y1: rectangle coordinates; written to ``/Rect`` in
        this order and expanded into the four ``/QuadPoints`` corners.
    :param color: iterable of colour components (never mutated here).
    :return: the new ``DictionaryObject``; it is not attached to any page.
    """
    entries = (
        ("/F", NumberObject(4)),
        ("/Type", NameObject("/Annot")),
        ("/Subtype", NameObject("/Highlight")),
        ("/C", ArrayObject([FloatObject(component) for component in color])),
        ("/Rect", ArrayObject(
            [FloatObject(value) for value in (x0, y0, x1, y1)])),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        ("/QuadPoints", ArrayObject(
            [FloatObject(value)
             for value in (x0, y1, x1, y1, x0, y0, x1, y0)])),
    )
    highlight = DictionaryObject()
    for key, value in entries:
        highlight[NameObject(key)] = value
    return highlight
def addAttachment(self, name, data, subtype=None):
    """
    Embed a file in the PDF; may be called repeatedly to attach several files.

    :param name: The name of the attachment
    :param data: The data of the attachment
    :param subtype: The mime-type of the attachment. This is required by
        PDF/A, but not essential otherwise. Expected form is "/xxx#2Fxxx"
        (e.g. "/text#2Fxml" for "text/xml"); a value matching
        REGEX_SUBTYPE_UNFORMATED is converted to that form, and a value that
        still does not match REGEX_SUBTYPE_FORMATED is logged and replaced by
        an empty string.
    """
    pdf_subtype = subtype
    if subtype:
        if REGEX_SUBTYPE_UNFORMATED.match(subtype):
            # Plain mime-type: escape the slash the way PDF names require.
            pdf_subtype = '/' + subtype.replace('/', '#2F')
        if not REGEX_SUBTYPE_FORMATED.match(pdf_subtype):
            # Still not a valid PDF subtype: warn and drop it.
            _logger.warning(
                "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                subtype)
            pdf_subtype = ''

    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': pdf_subtype,
    })

    catalog = self._root_object

    # --- /Names -> /EmbeddedFiles -> /Names array -------------------------
    has_embedded_files = (
        catalog.get('/Names') and catalog['/Names'].get('/EmbeddedFiles'))
    name_entry = [attachment.getObject()['/F'], attachment]
    if has_embedded_files:
        catalog["/Names"]["/EmbeddedFiles"]["/Names"].extend(name_entry)
    else:
        # NOTE(review): this replaces the catalogue's whole /Names entry,
        # including any sibling keys it may already hold - confirm intended.
        embedded_files_leaf = DictionaryObject()
        embedded_files_leaf.update(
            {NameObject("/Names"): ArrayObject(name_entry)})
        names_entry = DictionaryObject()
        names_entry.update(
            {NameObject("/EmbeddedFiles"): embedded_files_leaf})
        catalog.update({NameObject("/Names"): names_entry})

    # --- /AF array ---------------------------------------------------------
    if catalog.get('/AF'):
        catalog['/AF'].extend([attachment])
    else:
        # No /AF yet: register a new array object holding the attachment and
        # reference it from the root catalogue.
        af_reference = self._addObject(ArrayObject([attachment]))
        catalog.update({NameObject("/AF"): af_reference})
def createHighlight(bbox=(0, 0, 1, 1), contents="", color=[1, 1, 0], author="iwasakishuto(@cabernet_rock)"):
    """Create a highlight annotation dictionary.

    Args:
        bbox (tuple)   : four values ``(x1, y1, x2, y2)`` locating the highlight.
        contents (str) : text stored in the annotation's ``/Contents``.
        color (list)   : colour components. Defaults to ``[1,1,0]``.
        author (str)   : text stored in the annotation's ``/T``. Defaults to ``"iwasakishuto(@cabernet_rock)"`` .

    Returns:
        DictionaryObject: the highlight annotation (not yet added to a page).

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (DictionaryObject, NumberObject, FloatObject,
                                NameObject, TextStringObject, ArrayObject)

    def float_array(values):
        # Wrap every value in a FloatObject and collect them in an ArrayObject.
        return ArrayObject([FloatObject(value) for value in values])

    # Unpacking also rejects a bbox that does not hold exactly four values.
    left, bottom, right, top = bbox

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),
        NameObject("/C"): float_array(color),
        NameObject("/Rect"): float_array(bbox),
        NameObject("/QuadPoints"): float_array(
            [left, top, right, top, left, bottom, right, bottom]),
    })
    return highlight
def create_highlight(self, x1, y1, x2, y2, meta, color=[0, 1, 0]):
    """
    Build a highlight annotation dictionary for a PDF.

    Parameters
    ----------
    x1, y1 : float
        bottom left corner
    x2, y2 : float
        top right corner
    meta : dict
        must contain the keys "author" and "contents"
    color : iterable
        colour components, e.g. (r, g, b)

    Returns
    -------
    DictionaryObject
        the annotation; it is not attached to any page.
    """
    def as_floats(*values):
        # Wrap each number in a FloatObject inside a single ArrayObject.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): as_floats(*color),
        NameObject("/Rect"): as_floats(x1, y1, x2, y2),
        # top-left, top-right, bottom-left, bottom-right
        NameObject("/QuadPoints"): as_floats(x1, y2, x2, y2, x1, y1, x2, y1),
    })
    return annotation
def _create_annotation(x1, y1, x2, y2, color, subtype):
    """Build a minimal annotation dictionary.

    Only ``/Subtype``, ``/C`` and ``/Rect`` are set: ``subtype`` is wrapped in
    a ``NameObject``, ``color`` is an iterable of colour components and the
    four coordinates are stored in the order given.
    """
    subtype_name = NameObject(subtype)
    colour = ArrayObject([FloatObject(component) for component in color])
    rect = ArrayObject([FloatObject(value) for value in (x1, y1, x2, y2)])

    result = DictionaryObject()
    result[NameObject('/Subtype')] = subtype_name
    result[NameObject('/C')] = colour
    result[NameObject('/Rect')] = rect
    return result
def createAnnotPdf(geom_type, myShapePdf):
    """Write ``annot.pdf`` holding a template annotation re-shaped to a geometry.

    The path coordinates are parsed out of the first page's content stream of
    ``myShapePdf`` and copied into the first annotation of a template PDF
    (``annot_poly`` or ``annot_line``, both defined outside this function).

    :param geom_type: ``'POLYGON'`` or ``'POLYLINE'`` (case-insensitive).
        NOTE(review): any other value leaves ``pdf_geom`` unbound, so the
        function fails with a NameError on the next statement.
    :param myShapePdf: path of the PDF whose first page holds the geometry.
    :return: path of the written file, ``os.path.join(scratch, "annot.pdf")``.
    """
    # input variables
    # part 1: read geometry pdf to get the vertices and rectangle to use
    # NOTE(review): the file object passed to PdfFileReader is never closed.
    source = PdfFileReader(open(myShapePdf, 'rb'))
    geomPage = source.getPage(0)
    mystr = geomPage.getObject()['/Contents'].getData()
    # to pinpoint the string part: 1.19997 791.75999 m 1.19997 0.19466 l 611.98627 0.19466 l 611.98627 791.75999 l 1.19997 791.75999 l
    # the format seems to follow x1 y1 m x2 y2 l x3 y3 l x4 y4 l x5 y5 l
    # Keep the text between the first 'M\r\n' and the following 'S\r\n'.
    # NOTE(review): splitting with str separators presumably requires
    # getData() to return str (Python 2) - confirm before porting.
    geomString = mystr.split('S\r\n')[0].split('M\r\n')[1]
    # Drop the path operators and empty fragments, keeping the raw tokens.
    coordsString = [
        value for value in geomString.split(' ')
        if value not in ['m', 'l', '']
    ]

    # part 2: update geometry in the map
    if geom_type.upper() == 'POLYGON':
        pdf_geom = PdfFileReader(open(annot_poly, 'rb'))
    elif geom_type.upper() == 'POLYLINE':
        pdf_geom = PdfFileReader(open(annot_line, 'rb'))
    page_geom = pdf_geom.getPage(0)

    annot = page_geom['/Annots'][0]
    # Build, as source text, one update() call that sets /Vertices to an
    # ArrayObject with a FloatObject(...) per token, then execute it.
    # NOTE(review): exec() runs text taken from the input PDF's content
    # stream as Python code; a crafted PDF could inject arbitrary code.
    # Building the ArrayObject directly would avoid that, but may treat
    # unusual tokens differently - verify against real inputs first.
    updateVertices = "annot.getObject().update({NameObject('/Vertices'):ArrayObject([FloatObject(" + coordsString[0] + ")"
    for item in coordsString[1:]:
        updateVertices = updateVertices + ',FloatObject(' + item + ')'
    updateVertices = updateVertices + "])})"
    exec(updateVertices)

    # Split the tokens into x (even index) and y (odd index) values.
    # NOTE(review): the range stops at len - 1, so the last token is never
    # used for the bounding box - confirm whether that is intentional.
    xcoords = []
    ycoords = []
    for i in range(0, len(coordsString) - 1):
        if i % 2 == 0:
            xcoords.append(float(coordsString[i]))
        else:
            ycoords.append(float(coordsString[i]))

    # below rect seems to be geom bounding box coordinates: xmin, ymin, xmax,ymax
    annot.getObject().update({
        NameObject('/Rect'): ArrayObject([
            FloatObject(min(xcoords)),
            FloatObject(min(ycoords)),
            FloatObject(max(xcoords)),
            FloatObject(max(ycoords))
        ])
    })
    annot.getObject().pop('/AP')  # this is to get rid of the ghost shape
    annot.getObject().update({NameObject('/T'): createStringObject(u'ERIS')})

    # Write the modified template page out as a one-page PDF.
    output = PdfFileWriter()
    output.addPage(page_geom)
    annotPdf = os.path.join(scratch, "annot.pdf")
    outputStream = open(annotPdf, "wb")
    #output.setPageMode('/UseOutlines')
    output.write(outputStream)
    outputStream.close()
    output = None
    return annotPdf
def addHighlightToPage(highlight, page, output):
    """Register a highlight with the writer and reference it from a page.

    Args:
        highlight (DictionaryObject) : annotation to add.
        page (PageObject)            : page whose ``/Annots`` receives the reference.
        output (PdfFileWriter)       : writer that stores the annotation object.

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (NameObject, ArrayObject)

    reference = output._addObject(highlight)
    if "/Annots" not in page:
        # First annotation on this page: create the array.
        page[NameObject("/Annots")] = ArrayObject([reference])
        return
    page[NameObject("/Annots")].append(reference)
def addHighlightToPage(self, highlight, page, output):
    """Store ``highlight`` in ``output`` and reference it from ``page``.

    The reference returned by ``output._addObject`` is appended to the page's
    ``/Annots`` array, which is created when the page has none.
    """
    reference = output._addObject(highlight)
    page_has_annotations = "/Annots" in page
    annots_key = NameObject("/Annots")
    if page_has_annotations:
        page[annots_key].append(reference)
    else:
        page[annots_key] = ArrayObject([reference])
def add_annot_to_page(annot, page, output):
    """Store ``annot`` in the writer ``output`` and link it from ``page``.

    Appends the new object's reference to the page's ``/Annots`` array, or
    creates that array when the page does not have one yet.
    """
    reference = output._addObject(annot)
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject([reference])
    else:
        existing = page[NameObject("/Annots")]
        existing.append(reference)
def _create_highlight(self, x0, y0, width, height, comment, author='', color=[0, 0, 0, 0]):
    """Draw a rectangle and build the matching ``/Highlight`` annotation.

    ``self.add_rect`` is called first with the same geometry; the annotation
    dictionary is then built and returned without being attached to a page.

    :param x0, y0: one corner of the highlighted area.
    :param width: horizontal extent of the area.
    :param height: vertical extent of the area.
    :param comment: text stored in ``/Contents``.
    :param author: text stored in ``/T``.
    :param color: iterable of colour components (never mutated here).
    :return: the new ``DictionaryObject``.
    """
    self.add_rect(x0, y0, width, height)

    # Opposite corner of the box. The vertical extent must come from
    # ``height``; using ``width`` here made every highlight a square.
    x1 = x0 + width
    y1 = y0 + height

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(comment),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y1)
        ]),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x0), FloatObject(y1),
            FloatObject(x1), FloatObject(y1),
            FloatObject(x0), FloatObject(y0),
            FloatObject(x1), FloatObject(y0)
        ]),
    })
    return highlight
def addAttachment(self, name, data, subtype=""):
    """
    Embed a file in the PDF; may be called repeatedly to attach several files.

    :param name: The name of the attachment
    :param data: The data of the attachment
    :param subtype: The mime-type of the attachment. This is required by
        PDF/A, but not essential otherwise. It is passed on unchanged, so it
        should already be in PDF name form ("/xxx#2Fxxx", e.g. "/text#2Fxml"
        for "text/xml"); only the plain value "application/xml" is converted
        here.
    """
    pdf_subtype = '/application#2Fxml' if subtype == 'application/xml' else subtype
    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': pdf_subtype,
    })

    catalog = self._root_object

    # --- /Names -> /EmbeddedFiles -> /Names array -------------------------
    if not (catalog.get('/Names') and catalog['/Names'].get('/EmbeddedFiles')):
        # NOTE(review): this replaces the catalogue's whole /Names entry,
        # including any sibling keys it may already hold - confirm intended.
        new_names = ArrayObject([attachment.getObject()['/F'], attachment])
        embedded_files_leaf = DictionaryObject()
        embedded_files_leaf.update({NameObject("/Names"): new_names})
        names_entry = DictionaryObject()
        names_entry.update(
            {NameObject("/EmbeddedFiles"): embedded_files_leaf})
        catalog.update({NameObject("/Names"): names_entry})
    else:
        existing_names = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
        existing_names.extend([attachment.getObject()['/F'], attachment])

    # --- /AF array ---------------------------------------------------------
    if not catalog.get('/AF'):
        # No /AF yet: register a new array object holding the attachment and
        # reference it from the root catalogue.
        af_reference = self._addObject(ArrayObject([attachment]))
        catalog.update({NameObject("/AF"): af_reference})
    else:
        catalog['/AF'].extend([attachment])
def addHighlightToPage(highlight, page, output):
    """Store ``highlight`` in ``output`` and reference it from ``page``.

    Prints the new object's reference and which branch was taken, then either
    appends to the page's ``/Annots`` array or creates it.
    """
    reference = output._addObject(highlight)
    print(reference)
    if "/Annots" not in page:
        print("Annots not in page")
        page[NameObject("/Annots")] = ArrayObject([reference])
        return
    print("Annots in page")
    page[NameObject("/Annots")].append(reference)
def addHighlightToPage(highlight, page, output):
    '''
    Register the annotation object with the writer and link it from the page,
    creating the page's /Annots array when it does not exist yet.
    '''
    reference = output._addObject(highlight)
    annots_key = NameObject("/Annots")
    if "/Annots" in page:
        annotations = page[annots_key]
        annotations.append(reference)
    else:
        page[annots_key] = ArrayObject([reference])
def get_pdf_gpts(m, poly):
    """
    Build the GPTS array for the corners in ``poly``.

    Each (x, y) corner is passed through the inverse of the map's projection
    and contributes two numbers, latitude first and then longitude, as the
    specification requires.

    m = mapnik map object (only its ``srs`` is read)
    poly = tuple of (x,y) tuples describing rect polygon - allows geocoding of
    rotated maps.
    """
    projection = mapnik.Projection(m.srs)
    corners = ArrayObject()
    for x, y in poly:
        geographic = projection.inverse(mapnik.Coord(x, y))
        # lat (y) goes before lon (x)
        latitude = FloatObject(str(geographic.y))
        longitude = FloatObject(str(geographic.x))
        corners.extend((latitude, longitude))
    return corners
def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    '''
    Build a highlight annotation for a box on a PDF page (coordinates start
    in the bottom left). ``meta`` supplies the "author" and "contents" texts
    and ``color`` the colour components. The returned dictionary is not
    attached to any page.
    '''
    author = TextStringObject(meta["author"])
    contents = TextStringObject(meta["contents"])
    colour = ArrayObject([FloatObject(component) for component in color])
    rect = ArrayObject([FloatObject(value) for value in (x1, y1, x2, y2)])
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    corners = (x1, y2, x2, y2, x1, y1, x2, y1)
    quad_points = ArrayObject([FloatObject(value) for value in corners])

    highlight = DictionaryObject()
    highlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): author,
        NameObject("/Contents"): contents,
        NameObject("/C"): colour,
        NameObject("/Rect"): rect,
        NameObject("/QuadPoints"): quad_points,
    })
    return highlight
def get_pdf_measure(m, gcs, poly, bounds_default):
    """
    Build the PDF Measure dictionary (``/Type /Measure``, ``/Subtype /GEO``).

    The Measure dictionary is used in the viewport array and specifies the
    scale and units that apply to the output map. ``bounds_default`` is
    accepted but not used here.
    """
    # The PDF's bounds array is equivalent to the map's neatline, i.e., the
    # border delineating the extent of geographic data on the output map.
    # The same array object is stored under both /Bounds and /LPTS.
    unit_square = ArrayObject(
        [FloatObject(str(value)) for value in (0, 1, 0, 0, 1, 0, 1, 1)])

    measure = DictionaryObject()
    measure.update({
        NameObject('/Type'): NameObject('/Measure'),
        NameObject('/Subtype'): NameObject('/GEO'),
        NameObject('/Bounds'): unit_square,
        NameObject('/GPTS'): get_pdf_gpts(m, poly),
        NameObject('/LPTS'): unit_square,
        NameObject('/GCS'): gcs,
    })
    return measure
def add_highlight_to_page(self, highlight, page, output):
    """
    Register a highlight with the writer and reference it from a page.

    Parameters
    ----------
    highlight : Highlight object
        annotation to store
    page : PDF page object
        page whose "/Annots" array receives the reference (created if absent)
    output : PdfFileWriter object
        writer that stores the annotation
    """
    reference = output._addObject(highlight)
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject([reference])
        return
    page[NameObject("/Annots")].append(reference)
def _add_highlight(self, x0, y0, width, height, comment, author='', color=[0, 0, 0, 0]):
    """Create a highlight and attach it to the first page of ``self.pdf``.

    The annotation comes from ``self._create_highlight`` (same arguments); it
    is stored with ``self.pdf._addObject`` and its reference is added to page
    0's ``/Annots`` array, which is created when missing.
    """
    annotation = self._create_highlight(
        x0, y0, width, height, comment, author, color)
    reference = self.pdf._addObject(annotation)
    annots_key = NameObject("/Annots")
    if "/Annots" not in self.pdf.getPage(0):
        self.pdf.getPage(0)[annots_key] = ArrayObject([reference])
        return
    self.pdf.getPage(0)[annots_key].append(reference)
def add_comment(output, page, text, rectangle):
    """Add a ``/FreeText`` annotation showing ``text`` to ``page``.

    :param output: writer that stores the annotation (``_addObject`` is used).
    :param page: page whose ``/Annots`` array receives the reference.
    :param text: text stored in the annotation's ``/Contents``.
    :param rectangle: value passed to ``RectangleObject`` for ``/Rect``.
    """
    annotation = DictionaryObject({
        NameObject('/DA'): TextStringObject(' /Helv 10 Tf'),
        NameObject('/Subtype'): NameObject('/FreeText'),
        NameObject('/Rect'): RectangleObject(rectangle),
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Contents'): TextStringObject(text),
        NameObject('/C'): ArrayObject(
            [FloatObject(1), FloatObject(1), FloatObject(1)]),
    })
    obj = output._addObject(annotation)

    # A page without annotations has no /Annots entry at all; create the
    # array in that case instead of failing with KeyError.
    if '/Annots' in page:
        page['/Annots'].append(obj)
    else:
        page[NameObject('/Annots')] = ArrayObject([obj])
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Embed ``fdata`` in the PDF under the file name ``fname``.

    The file is added to the ``/Names`` -> ``/EmbeddedFiles`` -> ``/Names``
    array of the writer's root object. Missing levels of that structure are
    created, and an existing ``/Names`` dictionary is extended in place so its
    other entries are kept.

    :param writer: writer whose root catalogue is updated.
    :param fname: name under which the file is listed.
    :param fdata: raw content of the file.
    :raises KeyError: if an existing ``/EmbeddedFiles`` dictionary has no
        ``/Names`` array.
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): ef_entry,
    })

    root = writer._root_object
    if "/Names" not in root:
        # No name dictionary at all yet.
        root.update({NameObject("/Names"): DictionaryObject()})
    names_dict = root["/Names"]

    if "/EmbeddedFiles" not in names_dict:
        # /Names can exist without any attachment (it also holds other name
        # trees); add the embedded-files entry instead of failing on lookup.
        embedded_files = DictionaryObject()
        embedded_files.update({NameObject("/Names"): ArrayObject()})
        names_dict.update({NameObject("/EmbeddedFiles"): embedded_files})

    # The array holds alternating (file name, filespec) entries.
    names_array = names_dict["/EmbeddedFiles"]["/Names"]
    names_array.append(createStringObject(fname))
    names_array.append(filespec)
def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
    '''Embed the Factur-X XML file and update the document catalogue.

    This method is inspired from the code of the addAttachment() method of
    the PyPDF2 lib.

    The XML from ``self.factx.xml_str`` is added as an embedded file stream
    with its Filespec; the root object then gets ``/AF``, ``/Metadata``,
    ``/Names`` and ``/PageMode`` (plus ``/OutputIntents`` when any are given),
    and the document information is set through ``self.addMetadata``.

    :param pdf_metadata: passed to ``_prepare_pdf_metadata_xml`` and
        ``_prepare_pdf_metadata_txt``.
    :param output_intents: iterable of ``(output_intent_dict,
        dest_output_profile_dict)`` pairs; each ``output_intent_dict`` is
        updated in place with its ``/DestOutputProfile`` reference.
    '''
    # The entry for the file
    facturx_xml_str = self.factx.xml_str
    # The checksum must be computed over the attachment itself (the previous
    # code hashed nothing, so it always stored the digest of empty input).
    # md5 needs bytes, so text is encoded first.
    if isinstance(facturx_xml_str, bytes):
        xml_bytes = facturx_xml_str
    else:
        xml_bytes = facturx_xml_str.encode('utf-8')
    md5sum = hashlib.md5(xml_bytes).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
        # The length is wrapped in a NameObject rather than a number object.
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = self._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    xmp_filename = self.factx.flavor.details['xmp_filename']
    fname_obj = createStringObject(xmp_filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = self._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}
    # TODO: add back additional attachments?
    logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)

    # Flatten the (name, filespec) pairs, sorted by file name, into the
    # alternating array stored under /Names; /AF lists the filespecs only.
    name_arrayobj_content_sort = list(
        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    logger.debug('output_intents=%s', output_intents)
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = self._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = self._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
    xmp_template = self.factx.flavor.get_xmp_xml()
    metadata_xml_str = _prepare_pdf_metadata_xml(
        xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = self._addObject(metadata_file_entry)
    af_value_obj = self._addObject(ArrayObject(af_list))
    self._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    logger.debug('res_output_intents=%s', res_output_intents)
    if res_output_intents:
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    self.addMetadata(metadata_txt_dict)
def main():
    """Assemble one PDF from page data stored in local JSON files.

    Reads ``bookinfo.json``, ``pageinfo.json``, ``pages.json`` and
    ``bookmarks.json`` from the current directory, writes each page's decoded
    PDF to a temporary directory, merges the first page of every file into one
    document, adds bookmarks and page labels, and writes
    ``"<title>.pdf"`` (with ``/`` removed and ``:`` replaced by ``_``).
    """
    print("Loading metadata and eText information...")
    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']
    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        # Find the entry with this pageID, drop the leading
        # "data:application/pdf;base64," characters and base64-decode the rest.
        # NOTE(review): when no entry matches, b is None and the slice below
        # raises TypeError.
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # bookPageNumber -> pageOrder, filled by download()
        pdf_page_label_table = {}  # rebound to a sorted list further down
        # urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            # Record the page order and write the page to
            # "<pageOrder, 4 digits> - <bookPageNumber>.pdf".
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(pdfDownloadDir, "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))
            # urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        # NOTE(review): the pool is never closed or joined.
        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        # Sorting the file names orders the pages through the zero-padded prefix.
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            # NOTE(review): the file is removed right after its page was
            # obtained, before the merged PDF is written - confirm the reader
            # no longer needs it on all target platforms.
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True
        # TODO: Bookmarks currently not supported
        # NOTE(review): a missing bookmarks.json raises from open(), outside
        # the try block below.
        with open("bookmarks.json", 'r') as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception as e:
                # Any parsing or lookup failure is treated as "no bookmarks".
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            # Accept either a single bookmark dict or a list of them.
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)
                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)
                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (pageOrder, bookPageNumber) pairs sorted by numeric page order.
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # NOTE(review): the first element is NameObject(0) while later page
        # indices use NumberObject, and the "(cover)" prefix is a NameObject
        # rather than a string object - confirm this is intended.
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                curr_mode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                curr_mode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range only when the numbering mode or the
            # prefix differs from the previous page's.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        # fileMerger._addObject(pageLabels)
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level, output_intents):
    """Embed the Factur-X XML file and update the document catalogue.

    The XML is added to ``pdf_filestream`` as an embedded file stream named
    ``ZUGFeRD-invoice.xml`` with its Filespec; the root object then gets
    ``/AF``, ``/Metadata``, ``/Names`` and ``/PageMode`` (plus
    ``/OutputIntents`` when any are given), and the document information is
    set through ``pdf_filestream.addMetadata``.

    :param pdf_filestream: writer being updated.
    :param facturx_xml_str: XML content to embed; it is hashed with md5 and
        stored as the stream data.
    :param pdf_metadata: passed to ``_prepare_pdf_metadata_xml`` and
        ``_prepare_pdf_metadata_txt``.
    :param facturx_level: passed to ``_prepare_pdf_metadata_xml``.
    :param output_intents: iterable of ``(output_intent_dict,
        dest_output_profile_dict)`` pairs; each ``output_intent_dict`` is
        updated in place with its ``/DestOutputProfile`` reference.
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    # PDF date strings have the form D:YYYYMMDDHHmmSS; an ISO-8601 timestamp
    # is not a valid value for /ModDate. Local time, no UTC offset given.
    mod_date = datetime.datetime.now().strftime('D:%Y%m%d%H%M%S')
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date),
        # The length is wrapped in a NameObject rather than a number object.
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}

    # Flatten the (name, filespec) pairs, sorted by file name, into the
    # alternating array stored under /Names; /AF lists the filespecs only.
    name_arrayobj_content_sort = list(
        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
def main(bookId):
    """Download every page of an eText book and assemble a single PDF.

    The function downloads the cover and each page as individual PDF files
    into a temporary directory, merges them, adds the book's bookmarks when
    they can be retrieved, rebuilds the page labels, and writes the result
    to ``"<bookId> - <title>.pdf"`` (with ``/`` removed and ``:`` replaced by
    ``_``) relative to the current working directory.

    :param bookId: either the book id itself, or a URL starting with
        ``http`` whose query string carries the id (directly as ``bookid``
        or inside a ``values`` parameter of ``::``-separated key/value
        pairs).
    :return: None
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" holds "key::value::key::value..."; pair them up.
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        """Return the signed download URL for one page asset."""
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to.
        # Everything up to the final write stays inside this block because
        # the merged pages are read from these files.
        # Maps the book's printed page number to the page order.
        pdfPageTable = {}

        # First, download the cover file
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            """Download one page and record its order in pdfPageTable."""
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']),
                                       savePath)

        # 40 threads should download a book fairly quickly
        threadPool = ThreadPool(40)
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        try:
            threadPool.map(download, pageInfo)
        finally:
            # Release the worker threads even when a download fails.
            threadPool.close()
            threadPool.join()

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages.
        # The zero-padded file names make the lexical sort follow page order.
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir,
                                           pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)
        bookmarksExist = True
        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberately best-effort: any unreadable or unexpected
                # response just means the PDF is produced without bookmarks.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            """Add the given bookmark entries (and their children) under parent."""
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the
                # whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                # First page (in the pdf's format)
                pageNum = str(bookmark['lv']['content'])
                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)
                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            # Add a bookmark to the cover at the beginning
            fileMerger.addBookmark("Cover", 0)
            recursiveSetBookmarks(
                bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable,
                                   key=(lambda x: int(x[0])))
        # NOTE(review): the "(...)" prefixes below are emitted through
        # NameObject so that they serialise as PDF literal strings; a proper
        # string object would be cleaner -- confirm which one is importable.
        labels = ArrayObject([
            # The page index is a number, not a name.
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or
        # arabic numerals. The following code is not ideal for this, so a PR
        # with a better solution would be appreciated.
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            arabicMatch = arabicRegex.match(pageLabel)
            if arabicMatch:
                currMode = "arabic"
                prefix = arabicMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            else:
                romanMatch = romanRegex.match(pageLabel)
                if romanMatch:
                    currMode = "roman"
                    prefix = romanMatch.group("prefix")
                    style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range only when the numbering style or the
            # prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix

        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def create_annotation(x, y, meta):
    """Build a link annotation and a text (comment) annotation at (x, y).

    Both annotations share the same colour and lower-left corner. The link
    rectangle is 20 units wide and high, the comment rectangle 5 units.

    :param x: x coordinate of the lower-left corner of both rectangles.
    :param y: y coordinate of the lower-left corner of both rectangles.
    :param meta: mapping with the keys ``"contents"`` (used as the link URI
        and as the comment text) and ``"author"`` (used as the comment
        title).
    :return: tuple ``(linkAnnotation, commentAnnotation)`` of dictionary
        objects; neither is attached to a page here.
    """
    color = [255.0 / 255.0, 209 / 255.0, 0]

    # link
    # https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    linkAnnotation = DictionaryObject()
    linkAnnotation.update({
        # Table 165: flag value 4 sets the Print bit (bit 3)
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Link"),

        # Table 164 color, annotation rectangle
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 20),
            FloatObject(y + 20)
        ]),

        # Table 173 link annotation
        NameObject('/A'): DictionaryObject({
            # Table 206 uri
            NameObject('/S'): NameObject('/URI'),
            NameObject('/URI'): TextStringObject(meta["contents"])
        }),

        # Table 173 invert rect when mouse
        NameObject('/H'): NameObject('/I'),

        # Table 164 hor corner radius, vert corner radius, border width
        # dash array table 56
        # The entries are numbers, so they must be number objects rather
        # than name objects that merely serialise to digits.
        NameObject('/Border'): ArrayObject([
            NumberObject(0),
            NumberObject(0),
            NumberObject(5),
        ]),
    })

    # comment
    # https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    commentAnnotation = DictionaryObject()
    commentAnnotation.update({
        # Table 165: flag value 4 sets the Print bit (bit 3)
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Text"),

        # Table 170 titlebar
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        # Table 164 color, annotation rectangle
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 5),
            FloatObject(y + 5)
        ]),

        # 12.5.6.4 text annotation
        NameObject('/Open'): BooleanObject(False),
        NameObject('/Name'): NameObject('/Comment'),
    })

    return linkAnnotation, commentAnnotation
def convert_to_pdfa(self):
    """
    Turn the currently opened PDF document into a PDF/A compliant one.

    The header, the trailer ID, the output intent (sRGB ICC profile), the
    font glyph widths, the outlines count and the producer metadata are all
    rewritten on this writer, and ``is_pdfa`` is set to True at the end.
    """
    # PDF/A-3 is based on PDF 1.7, so advertise that version in the header.
    # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
    # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
    # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
    # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
    # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
    self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

    # A document ID in the trailer is optional for regular PDF (only needed with encryption) but mandatory
    # for PDF/A. Its first string reflects the content at creation time and the second one the content at
    # the last update; for a newly created file both are the same value.
    content_digest = md5(self._reader.stream.getvalue()).digest()
    document_id = ByteStringObject(content_digest)
    self._ID = ArrayObject((document_id, document_id))

    # Embed the sRGB ICC profile, already deflated, as the output intent profile.
    with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_file:
        compressed_icc = compress(icc_file.read())

    icc_stream = DecodedStreamObject()
    icc_stream.setData(compressed_icc)
    icc_stream.update({
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/N"): NumberObject(3),
        NameObject("/Length"): NameObject(str(len(compressed_icc))),
    })
    icc_reference = self._addObject(icc_stream)

    output_intent = DictionaryObject()
    output_intent.update({
        NameObject("/S"): NameObject("/GTS_PDFA1"),
        NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
        NameObject("/DestOutputProfile"): icc_reference,
        NameObject("/Type"): NameObject("/OutputIntent"),
    })
    output_intent_reference = self._addObject(output_intent)
    self._root_object.update({
        NameObject("/OutputIntents"): ArrayObject([output_intent_reference]),
    })

    page_refs = self._root_object['/Pages']['/Kids']

    # PDF/A needs the glyphs width array embedded in the pdf to be consistent with the ones from the font file.
    # But it seems like it is not the case when exporting from wkhtmltopdf.
    if not TTFont:
        _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
    else:
        # Collect every descendant font used by any page, keyed by object number so each one is handled once.
        descendant_fonts = {}
        for page_ref in page_refs:
            page_fonts = page_ref.getObject()['/Resources']['/Font']
            for font_ref in page_fonts.values():
                for descendant_ref in font_ref.getObject()['/DescendantFonts']:
                    descendant_fonts[descendant_ref.idnum] = descendant_ref.getObject()

        # Rewrite each width array from the embedded font program itself, with
        # width = round(1000 * font_glyph_width / font_units_per_em)
        # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
        for descendant_font in descendant_fonts.values():
            embedded_font = descendant_font['/FontDescriptor']['/FontFile2']
            font_buffer = io.BytesIO(decompress(embedded_font._data))
            parsed_font = TTFont(font_buffer)
            units_per_em = parsed_font['head'].unitsPerEm
            metrics = parsed_font.getGlyphSet()._hmtx.metrics
            widths = [
                NumberObject(round(1000.0 * metric[0] / units_per_em))
                for glyph_name, metric in metrics.items()
                if glyph_name[:5] == 'glyph'
            ]
            descendant_font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(widths)])
            font_buffer.close()

    outline_root = self._root_object['/Outlines'].getObject()
    outline_root[NameObject('/Count')] = NumberObject(1)

    # Set odoo as producer
    self.addMetadata({
        '/Creator': "Odoo",
        '/Producer': "Odoo",
    })
    self.is_pdfa = True
def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname, fdata):
    '''Embed the ZUGFeRD XML file in the PDF and update the catalog.

    This method is inspired from the code of the addAttachment()
    method of the PyPDF2 lib.

    The file is stored as an embedded file stream, wrapped in a file
    specification that is referenced from the catalog through both
    /Names and /AF. The catalog also receives a new XMP /Metadata stream,
    and the document information entries are updated through
    pdf_filestream.addMetadata().

    :param pdf_filestream: writer object exposing _addObject,
        _root_object and addMetadata; it is modified in place
    :param fname: name given to the embedded file
    :param fdata: content of the embedded file, stored as stream data
    :return: None
    '''
    # The entry for the file
    moddate = DictionaryObject()
    moddate.update({
        NameObject('/ModDate'): createStringObject(self._get_pdf_timestamp())
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): moddate,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    efEntry = DictionaryObject()
    efEntry.update({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject(fname)
    filespec = DictionaryObject()
    filespec.update({
        NameObject("/AFRelationship"): NameObject("/Alternative"),
        NameObject("/Desc"): createStringObject("ZUGFeRD Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): efEntry,
        NameObject("/UF"): fname_obj,
    })
    # Register the file specification once and reuse the reference, so that
    # /Names and /AF point to the same indirect object instead of two
    # separate copies.
    filespec_obj = pdf_filestream._addObject(filespec)

    embeddedFilesNamesDictionary = DictionaryObject()
    embeddedFilesNamesDictionary.update({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj])
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embeddedFilesDictionary = DictionaryObject()
    embeddedFilesDictionary.update(
        {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})

    # Update the root
    metadata_xml_str = self._prepare_pdf_metadata()
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    # A metadata stream dictionary requires these two entries.
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_value = pdf_filestream._addObject(metadata_file_entry)
    af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value,
        NameObject("/Metadata"): metadata_value,
        NameObject("/Names"): embeddedFilesDictionary,
    })
    info_dict = self._prepare_pdf_info()
    pdf_filestream.addMetadata(info_dict)
def test_array_object_exception():
    """Reading an array from a stream that holds no array raises PdfReadError."""
    not_an_array = BytesIO(b"False")

    with pytest.raises(PdfReadError) as raised:
        ArrayObject.read_from_stream(not_an_array, None)

    message = raised.value.args[0]
    assert message == "Could not read array"