def add_file_metadata(self, metadata_content):
    """
    Store XMP metadata in the pdf, enclosed in the XMP packet header/footer.

    A PDF/A file needs this wrapper to be fully compliant; omitting it results
    in validation errors.

    :param metadata_content: bytes of the metadata to add to the pdf.
    """
    # See https://wwwimages2.adobe.com/content/dam/acom/en/devnet/xmp/pdfs/XMP%20SDK%20Release%20cc-2016-08/XMPSpecificationPart1.pdf
    # Page 10/11
    xpacket_begin = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>'
    xpacket_end = b'<?xpacket end="w"?>'
    packet = b'%s%s%s' % (xpacket_begin, metadata_content, xpacket_end)

    xmp_stream = DecodedStreamObject()
    xmp_stream.setData(packet)
    xmp_stream[NameObject("/Type")] = NameObject("/Metadata")
    xmp_stream[NameObject("/Subtype")] = NameObject("/XML")
    xmp_stream[NameObject("/Length")] = NameObject(str(len(packet)))

    # Register the stream as a new object, then make the document catalog's
    # /Metadata entry refer to it.
    self._root_object[NameObject("/Metadata")] = self._addObject(xmp_stream)
def _create_attachment_object(self, attachment):
    ''' Build the PyPDF2 objects describing one embedded file.

    :param attachment: A dictionary containing:
        * filename: The name of the file to embed (required)
        * content: The bytes of the file to embed (required)
        * subtype: The mime-type of the file to embed (optional)
        * description: Text stored in the /Desc entry of the file specification (optional)
    :return: whatever ``self._addObject`` returns for the /Filespec dictionary
    '''
    content = attachment['content']

    # The stream holding the file itself, with its checksum/date/size parameters.
    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(content)
    stream_params = DictionaryObject({
        NameObject('/CheckSum'): createStringObject(md5(content).hexdigest()),
        NameObject('/ModDate'): createStringObject(
            datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT)),
        NameObject('/Size'): NameObject(str(len(content))),
    })
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): stream_params,
    })
    subtype = attachment.get('subtype')
    if subtype:
        embedded_stream.update({
            NameObject("/Subtype"): NameObject(subtype),
        })
    stream_ref = self._addObject(embedded_stream)

    # The file specification pointing at the stream above.
    name_obj = createStringObject(attachment['filename'])
    embedded_file_refs = DictionaryObject({
        NameObject("/F"): stream_ref,
        NameObject('/UF'): stream_ref,
    })
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_obj,
        NameObject("/EF"): embedded_file_refs,
        NameObject("/UF"): name_obj,
    })
    description = attachment.get('description')
    if description:
        filespec.update({
            NameObject("/Desc"): createStringObject(description)
        })
    return self._addObject(filespec)
def __processContent(self, content):
    """Return a new decoded stream holding ``content`` after text replacement.

    The data of ``content`` is decoded as UTF-8, passed through
    ``self.__replaceText`` and re-encoded as UTF-8 before being stored in a
    fresh ``DecodedStreamObject``.
    """
    # Replace data inside of the encoded file
    text = content.getData().decode('utf-8')
    rewritten_bytes = self.__replaceText(text).encode('utf-8')

    # Wrap the result so it can be used as a PDF page's content object
    new_stream = DecodedStreamObject()
    new_stream.setData(rewritten_bytes)
    return new_stream
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict, file_dict, file_bin):
    """Embed one extra file in ``pdf_filestream`` and record its filespec.

    :param pdf_filestream: writer object; its ``_addObject`` method is used
        to register the embedded stream and the file specification.
    :param name_arrayobj_cdict: dict updated in place; the filename string
        object is mapped to the object returned for the filespec.
    :param file_dict: dict read for ``filename``, ``mod_date`` and the
        optional ``desc`` (defaults to an empty string).
    :param file_bin: content of the file to embed.
    """
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    pdf_mod_date = _get_pdf_timestamp(file_dict['mod_date'])
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())
    stream_params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(pdf_mod_date),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(file_bin)
    # Fall back to a generic binary type when the extension is unknown.
    mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
    # '/' is not allowed as-is inside a PDF name: write it as #2f.
    subtype_name = '/' + mimetype.replace('/', '#2f')
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): stream_params,
        NameObject("/Subtype"): NameObject(subtype_name),
    })
    stream_ref = pdf_filestream._addObject(embedded_stream)

    name_obj = createStringObject(filename)
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_obj,
        NameObject("/EF"): DictionaryObject({
            NameObject("/F"): stream_ref,
        }),
        NameObject("/UF"): name_obj,
    })
    name_arrayobj_cdict[name_obj] = pdf_filestream._addObject(filespec)
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Append an attachment to a PDF.

    Builds an /EmbeddedFile stream and its /Filespec, then registers the pair
    in the catalog's ``/Names -> /EmbeddedFiles -> /Names`` array. Any level
    of that structure that does not exist yet is created, so a document whose
    catalog already has a ``/Names`` dictionary without ``/EmbeddedFiles`` no
    longer fails with ``KeyError``.

    :param writer: the PDF writer whose root object is updated in place.
    :param fname: name under which the file is attached.
    :param fdata: content of the file to attach.
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): ef_entry,
    })

    root = writer._root_object
    if "/Names" not in root:
        # No name dictionary at all yet: create it on the root.
        root.update({NameObject("/Names"): DictionaryObject()})
    names_dictionary = root["/Names"]

    if "/EmbeddedFiles" not in names_dictionary:
        # No files attached yet: create an empty embedded-files name list.
        embedded_files = DictionaryObject()
        embedded_files.update({NameObject("/Names"): ArrayObject()})
        names_dictionary.update({NameObject("/EmbeddedFiles"): embedded_files})

    # NOTE(review): an existing /EmbeddedFiles entry is expected to hold a
    # flat /Names array (not a /Kids tree), and entries are appended without
    # re-sorting the keys -- confirm this is acceptable for the callers.
    name_list = names_dictionary["/EmbeddedFiles"]["/Names"]
    name_list.append(createStringObject(fname))
    name_list.append(filespec)
def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname, fdata):
    '''Embed the invoice XML in the PDF and update the document metadata.

    This method is inspired from the code of the addAttachment() method
    of the PyPDF2 lib.

    The file specification is registered only once and that single object is
    referenced both from the /EmbeddedFiles name list and from /AF, so the
    two entries designate the same attachment. The XMP stream is tagged
    with /Type /Metadata and /Subtype /XML.

    :param pdf_filestream: writer object whose root object is updated.
    :param fname: name under which the XML file is attached.
    :param fdata: content of the XML file to attach.
    '''
    # The entry for the file
    moddate = DictionaryObject()
    moddate.update({
        NameObject('/ModDate'): createStringObject(self._get_pdf_timestamp())
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): moddate,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_entry = DictionaryObject()
    ef_entry.update({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject(fname)
    filespec = DictionaryObject()
    filespec.update({
        NameObject("/AFRelationship"): NameObject("/Alternative"),
        NameObject("/Desc"): createStringObject("ZUGFeRD Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_entry,
        NameObject("/UF"): fname_obj,
    })
    # Register the filespec a single time; both /Names and /AF reuse it.
    filespec_obj = pdf_filestream._addObject(filespec)

    embedded_files_names_dictionary = DictionaryObject()
    embedded_files_names_dictionary.update({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj])
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dictionary = DictionaryObject()
    embedded_files_dictionary.update(
        {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})

    # Update the root
    metadata_xml_str = self._prepare_pdf_metadata()
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_value = pdf_filestream._addObject(metadata_file_entry)
    af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value,
        NameObject("/Metadata"): metadata_value,
        NameObject("/Names"): embedded_files_dictionary,
    })
    info_dict = self._prepare_pdf_info()
    pdf_filestream.addMetadata(info_dict)
def convert_to_pdfa(self):
    """
    Turn the opened PDF file into a PDF/A compliant file.

    Sets the file header and trailer ID, adds an sRGB output intent, rewrites
    the glyph width arrays of the embedded fonts (when fonttools is
    available), fixes the outlines count, sets the producer metadata and
    finally flags the writer with ``is_pdfa = True``.
    """
    # PDF/A-3 is based on PDF 1.7, so force that version in the header and make it PDF/A compliant.
    # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
    # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
    # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
    # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
    # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
    self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

    # A document ID in the trailer is only needed with encryption for regular PDF, but PDF/A requires it.
    # The first string identifies the file as created, the second the file as last updated; for a new PDF
    # both are the same value.
    document_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
    self._ID = ArrayObject((document_id, document_id))

    with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_file:
        compressed_profile = compress(icc_file.read())

    profile_stream = DecodedStreamObject()
    profile_stream.setData(compressed_profile)
    profile_stream.update({
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/N"): NumberObject(3),
        NameObject("/Length"): NameObject(str(len(compressed_profile))),
    })
    profile_ref = self._addObject(profile_stream)

    output_intent = DictionaryObject()
    output_intent.update({
        NameObject("/S"): NameObject("/GTS_PDFA1"),
        NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
        NameObject("/DestOutputProfile"): profile_ref,
        NameObject("/Type"): NameObject("/OutputIntent"),
    })
    output_intent_ref = self._addObject(output_intent)
    self._root_object.update({
        NameObject("/OutputIntents"): ArrayObject([output_intent_ref]),
    })

    pages = self._root_object['/Pages']['/Kids']

    # PDF/A wants the glyph width array stored in the pdf to agree with the font file itself,
    # which does not seem to be the case for files exported from wkhtmltopdf.
    if not TTFont:
        _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
    else:
        # Collect every descendant font referenced by any page, keyed by object number.
        descendant_fonts = {}
        for page in pages:
            page_fonts = page.getObject()['/Resources']['/Font']
            for page_font in page_fonts.values():
                for descendant in page_font.getObject()['/DescendantFonts']:
                    descendant_fonts[descendant.idnum] = descendant.getObject()

        # Rewrite each width array from the font file:
        # width = round(1000 * font_glyph_width / font_units_per_em)
        # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
        for descendant_font in descendant_fonts.values():
            font_file = descendant_font['/FontDescriptor']['/FontFile2']
            font_stream = io.BytesIO(decompress(font_file._data))
            ttfont = TTFont(font_stream)
            units_per_em = ttfont['head'].unitsPerEm
            metrics = ttfont.getGlyphSet()._hmtx.metrics
            widths = [
                NumberObject(round(1000.0 * metric[0] / units_per_em))
                for glyph_name, metric in metrics.items()
                if glyph_name[:5] == 'glyph'
            ]
            descendant_font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(widths)])
            font_stream.close()

    outlines = self._root_object['/Outlines'].getObject()
    outlines[NameObject('/Count')] = NumberObject(1)

    # Set odoo as producer
    self.addMetadata({
        '/Creator': "Odoo",
        '/Producer': "Odoo",
    })
    self.is_pdfa = True
def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
    '''Embed the Factur-X XML in the PDF and update the document metadata.

    This method is inspired from the code of the addAttachment() method
    of the PyPDF2 lib.

    The /CheckSum parameter of the embedded file is the MD5 digest of the
    XML content itself.

    :param pdf_metadata: metadata passed on to ``_prepare_pdf_metadata_xml``
        and ``_prepare_pdf_metadata_txt``.
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
        ``output_intent_dict`` is updated in place with a
        /DestOutputProfile entry before being added to the document.
    '''
    # The entry for the file
    facturx_xml_str = self.factx.xml_str
    # hashlib needs bytes: encode first if the XML is held as text.
    checksum_input = facturx_xml_str
    if not isinstance(checksum_input, (bytes, bytearray)):
        checksum_input = checksum_input.encode('utf-8')
    md5sum = hashlib.md5(checksum_input).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = self._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    xmp_filename = self.factx.flavor.details['xmp_filename']
    fname_obj = createStringObject(xmp_filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = self._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}

    # TODO: add back additional attachments?
    logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
    # Entries are ordered by file name before being written out.
    name_arrayobj_content_sort = sorted(
        name_arrayobj_cdict.items(), key=lambda x: x[0])
    logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    logger.debug('output_intents=%s', output_intents)
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = self._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = self._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
    xmp_template = self.factx.flavor.get_xmp_xml()
    metadata_xml_str = _prepare_pdf_metadata_xml(
        xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = self._addObject(metadata_file_entry)
    af_value_obj = self._addObject(ArrayObject(af_list))
    self._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    logger.debug('res_output_intents=%s', res_output_intents)
    if res_output_intents:
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    self.addMetadata(metadata_txt_dict)
def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level, output_intents):
    """Embed the Factur-X XML in ``pdf_filestream`` and update its metadata.

    The /ModDate parameter of the embedded file is written as a PDF date
    string (``D:YYYYMMDDHHmmSS+00'00'``, expressed in UTC) rather than as an
    ISO 8601 timestamp, which is not a valid PDF date.

    :param pdf_filestream: writer object; objects are registered with its
        ``_addObject`` method and its root object is updated.
    :param facturx_xml_str: content of the XML file to embed; it is hashed
        with MD5 for the /CheckSum parameter.
    :param pdf_metadata: metadata passed on to ``_prepare_pdf_metadata_xml``
        and ``_prepare_pdf_metadata_txt``.
    :param facturx_level: level passed on to ``_prepare_pdf_metadata_xml``.
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
        ``output_intent_dict`` is updated in place with a
        /DestOutputProfile entry before being added to the document.
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    # PDF date string; the UTC clock is used so the +00'00' offset is accurate.
    mod_date_pdf = datetime.datetime.now(datetime.timezone.utc).strftime(
        "D:%Y%m%d%H%M%S+00'00'")
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date_pdf),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}

    # Entries are ordered by file name before being written out.
    name_arrayobj_content_sort = sorted(
        name_arrayobj_cdict.items(), key=lambda x: x[0])
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
def flate_string(s):
    """Store ``s`` in a ``DecodedStreamObject`` and return its ``flateEncode()`` result."""
    stream = DecodedStreamObject()
    stream.setData(s)
    encoded = stream.flateEncode()
    return encoded