Exemple #1
0
    def add_file_metadata(self, metadata_content):
        """
        Store XMP metadata in the pdf, enclosed in the XMP packet header and trailer.

        The packet wrapper is required for a PDF/A file to be completely compliant;
        omitting it would result in validation errors.

        :param metadata_content: bytes of the metadata to add to the pdf.
        """
        # Packet wrapper layout, see pages 10/11 of
        # https://wwwimages2.adobe.com/content/dam/acom/en/devnet/xmp/pdfs/XMP%20SDK%20Release%20cc-2016-08/XMPSpecificationPart1.pdf
        xpacket_open = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>'
        xpacket_close = b'<?xpacket end="w"?>'
        packet = xpacket_open + metadata_content + xpacket_close

        xmp_stream = DecodedStreamObject()
        xmp_stream.setData(packet)
        xmp_stream[NameObject("/Type")] = NameObject("/Metadata")
        xmp_stream[NameObject("/Subtype")] = NameObject("/XML")
        xmp_stream[NameObject("/Length")] = NameObject(str(len(packet)))

        # Register the stream in the pdf, then make the root's /Metadata entry
        # refer to the newly added object.
        xmp_reference = self._addObject(xmp_stream)
        self._root_object[NameObject("/Metadata")] = xmp_reference
Exemple #2
0
    def _create_attachment_object(self, attachment):
        ''' Build the PyPdf2.generic objects describing one embedded file and
        add them to the document.

        :param attachment: A dictionary containing:
            * filename: The name of the file to embed (required)
            * content:  The bytes of the file to embed (required)
            * subtype: The mime-type of the file to embed (optional)
            * description: Text stored as /Desc of the file specification (optional)
        :return: the value returned by ``self._addObject`` for the /Filespec
            dictionary
        '''
        payload = attachment['content']

        # Parameters of the embedded file: checksum, modification date and size.
        params = DictionaryObject({
            NameObject('/CheckSum'): createStringObject(md5(payload).hexdigest()),
            NameObject('/ModDate'): createStringObject(
                datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT)),
            NameObject('/Size'): NameObject(str(len(payload))),
        })

        # The stream that carries the file content itself.
        embedded_stream = DecodedStreamObject()
        embedded_stream.setData(payload)
        embedded_stream[NameObject("/Type")] = NameObject("/EmbeddedFile")
        embedded_stream[NameObject("/Params")] = params
        subtype = attachment.get('subtype')
        if subtype:
            embedded_stream[NameObject("/Subtype")] = NameObject(attachment['subtype'])
        embedded_ref = self._addObject(embedded_stream)

        # The file specification: /F and /UF both hold the same file name, and
        # the /EF entries both refer to the same embedded stream.
        name_string = createStringObject(attachment['filename'])
        embedded_refs = DictionaryObject({
            NameObject("/F"): embedded_ref,
            NameObject('/UF'): embedded_ref,
        })
        filespec = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Data"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): name_string,
            NameObject("/EF"): embedded_refs,
            NameObject("/UF"): name_string,
        })
        if attachment.get('description'):
            filespec[NameObject("/Desc")] = createStringObject(attachment['description'])
        return self._addObject(filespec)
    def __processContent(self, content):
        """Return a new stream holding ``content``'s data after text replacement.

        The data is read with ``content.getData()``, decoded as UTF-8, passed
        through ``self.__replaceText`` and encoded back to UTF-8.

        :param content: object exposing ``getData()`` that returns bytes
        :return: a ``DecodedStreamObject`` carrying the rewritten bytes
        """
        # Work on the text form of the stream data.
        original_text = content.getData().decode('utf-8')
        new_bytes = self.__replaceText(original_text).encode('utf-8')

        # Wrap the result in a fresh stream object usable as page content.
        rewritten = DecodedStreamObject()
        rewritten.setData(new_bytes)
        return rewritten
Exemple #4
0
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict,
                                     file_dict, file_bin):
    """Add one extra attachment to the PDF and record its file specification.

    An /EmbeddedFile stream and a /Filespec dictionary are added to
    ``pdf_filestream`` through ``_addObject``. The filespec reference is then
    stored in ``name_arrayobj_cdict`` under the file name string object.

    :param pdf_filestream: object exposing ``_addObject``
    :param name_arrayobj_cdict: mapping updated in place
        (file name string object -> filespec reference)
    :param file_dict: dict with keys ``filename`` and ``mod_date`` (required)
        and ``desc`` (optional, defaults to an empty string)
    :param file_bin: content of the file to embed
    """
    attachment_name = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', attachment_name)
    pdf_mod_date = _get_pdf_timestamp(file_dict['mod_date'])
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())
    embedded_params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(pdf_mod_date),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(file_bin)
    # Fall back to a generic binary type when the extension is unknown.
    mimetype = mimetypes.guess_type(attachment_name)[0] or 'application/octet-stream'
    # '/' inside a PDF name has to be written as its hexadecimal escape.
    subtype_name = '/' + mimetype.replace('/', '#2f')
    embedded_stream[NameObject("/Type")] = NameObject("/EmbeddedFile")
    embedded_stream[NameObject("/Params")] = embedded_params
    embedded_stream[NameObject("/Subtype")] = NameObject(subtype_name)
    embedded_ref = pdf_filestream._addObject(embedded_stream)

    name_string = createStringObject(attachment_name)
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_string,
        NameObject("/EF"): DictionaryObject({NameObject("/F"): embedded_ref}),
        NameObject("/UF"): name_string,
    })
    name_arrayobj_cdict[name_string] = pdf_filestream._addObject(filespec)
Exemple #5
0
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Append an attachment to a PDF.

    The file is registered in the ``/Names`` array found under the catalog's
    ``/Names`` -> ``/EmbeddedFiles`` entry. Both levels are created when they
    are missing, so a document whose catalog already has a ``/Names``
    dictionary without ``/EmbeddedFiles`` is handled as well.

    :param writer: PDF writer whose root object receives the attachment
    :param fname: name under which the file is attached
    :param fdata: content of the file to attach
    :raises KeyError: if ``/EmbeddedFiles`` exists but carries no ``/Names``
        array
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    filespec = DictionaryObject({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): DictionaryObject({NameObject("/F"): file_entry}),
    })

    root = writer._root_object
    if "/Names" not in root:
        # No name dictionary yet: create it on the root.
        root[NameObject("/Names")] = DictionaryObject()
    names_dict = root["/Names"]
    if "/EmbeddedFiles" not in names_dict:
        # A name dictionary may exist without any attached file; add the
        # missing level instead of failing on the lookup below.
        names_dict[NameObject("/EmbeddedFiles")] = DictionaryObject({
            NameObject("/Names"): ArrayObject(),
        })

    # The array alternates file names and their file specifications.
    embedded_names = names_dict["/EmbeddedFiles"]["/Names"]
    embedded_names.append(createStringObject(fname))
    embedded_names.append(filespec)
Exemple #6
0
    def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname,
                                               fdata):
        '''Embed the XML file in the PDF and update the PDF metadata.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        The /AF, /Metadata and /Names entries of the root object are replaced,
        then the document information is updated through ``addMetadata()``.

        :param pdf_filestream: PDF writer object; its ``_addObject``,
            ``_root_object`` and ``addMetadata`` members are used
        :param fname: name under which the XML file is attached
        :param fdata: content of the XML file, handed as is to ``setData()``
        '''
        # The entry for the file
        moddate = DictionaryObject({
            NameObject('/ModDate'):
            createStringObject(self._get_pdf_timestamp()),
        })
        file_entry = DecodedStreamObject()
        file_entry.setData(fdata)
        file_entry.update({
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Params"): moddate,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"): NameObject("/text#2Fxml"),
        })
        file_entry_obj = pdf_filestream._addObject(file_entry)

        # The Filespec entry
        ef_entry = DictionaryObject({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
        })
        fname_obj = createStringObject(fname)
        filespec = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Alternative"),
            NameObject("/Desc"): createStringObject("ZUGFeRD Invoice"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): fname_obj,
            NameObject("/EF"): ef_entry,
            NameObject("/UF"): fname_obj,
        })
        # Add the filespec only once: the embedded-files name array and the
        # /AF array must refer to the same object, not to two copies of it.
        filespec_obj = pdf_filestream._addObject(filespec)

        embedded_files_names_dict = DictionaryObject({
            NameObject("/Names"): ArrayObject([fname_obj, filespec_obj]),
        })
        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embedded_files_dict = DictionaryObject({
            NameObject("/EmbeddedFiles"): embedded_files_names_dict,
        })

        # Update the root
        metadata_xml_str = self._prepare_pdf_metadata()
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        # A metadata stream has to be identified by its type and subtype.
        metadata_file_entry.update({
            NameObject('/Subtype'): NameObject('/XML'),
            NameObject('/Type'): NameObject('/Metadata'),
        })
        metadata_value = pdf_filestream._addObject(metadata_file_entry)
        af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
        pdf_filestream._root_object.update({
            NameObject("/AF"): af_value,
            NameObject("/Metadata"): metadata_value,
            NameObject("/Names"): embedded_files_dict,
        })
        info_dict = self._prepare_pdf_info()
        pdf_filestream.addMetadata(info_dict)
Exemple #7
0
    def convert_to_pdfa(self):
        """
        Turn the opened PDF file into a PDF/A compliant file.

        The header and the document ID are set, an sRGB output intent is added,
        the glyph width arrays of the embedded fonts are rewritten (only when
        fonttools is available), the outline count is set and Odoo is recorded
        as creator and producer.
        """
        # PDF/A-3 is based on PDF 1.7, so that version is advertised in the header.
        # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
        #
        # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
        # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
        # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
        # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
        self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

        # A document ID in the trailer is mandatory for PDF/A (a regular PDF only needs it with encryption).
        # Its first string identifies the content at creation time, its second the content at the last update;
        # both are the same here since the file is being created.
        document_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
        self._ID = ArrayObject((document_id, document_id))

        # Embed the compressed sRGB ICC profile as the destination profile of an output intent.
        with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_file:
            compressed_icc = compress(icc_file.read())

        icc_stream = DecodedStreamObject()
        icc_stream.setData(compressed_icc)
        icc_stream[NameObject("/Filter")] = NameObject("/FlateDecode")
        icc_stream[NameObject("/N")] = NumberObject(3)
        icc_stream[NameObject("/Length")] = NameObject(str(len(compressed_icc)))
        icc_reference = self._addObject(icc_stream)

        output_intent = DictionaryObject()
        output_intent[NameObject("/S")] = NameObject("/GTS_PDFA1")
        output_intent[NameObject("/OutputConditionIdentifier")] = createStringObject("sRGB")
        output_intent[NameObject("/DestOutputProfile")] = icc_reference
        output_intent[NameObject("/Type")] = NameObject("/OutputIntent")
        output_intent_reference = self._addObject(output_intent)
        self._root_object[NameObject("/OutputIntents")] = ArrayObject([output_intent_reference])

        pages = self._root_object['/Pages']['/Kids']

        # PDF/A requires the glyph width array stored in the pdf to agree with the font file,
        # which does not seem to be the case for files exported by wkhtmltopdf.
        if not TTFont:
            _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
        else:
            # Collect every descendant font used on any page, keyed by object number.
            fonts = {
                descendant.idnum: descendant.getObject()
                for page in pages
                for font in page.getObject()['/Resources']['/Font'].values()
                for descendant in font.getObject()['/DescendantFonts']
            }

            # Rewrite each width array from the font file itself, with
            # width = round(1000 * font_glyph_width / font_units_per_em)
            # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
            for font in fonts.values():
                embedded_font = font['/FontDescriptor']['/FontFile2']
                font_stream = io.BytesIO(decompress(embedded_font._data))
                parsed_font = TTFont(font_stream)
                units_per_em = parsed_font['head'].unitsPerEm
                metrics = parsed_font.getGlyphSet()._hmtx.metrics
                widths = [
                    NumberObject(round(1000.0 * metric[0] / units_per_em))
                    for glyph_name, metric in metrics.items()
                    if glyph_name[:5] == 'glyph'
                ]
                font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(widths)])
                font_stream.close()

        outlines = self._root_object['/Outlines'].getObject()
        outlines[NameObject('/Count')] = NumberObject(1)

        # Set odoo as producer
        self.addMetadata({
            '/Creator': "Odoo",
            '/Producer': "Odoo",
        })
        self.is_pdfa = True
Exemple #8
0
    def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
        '''Embed the Factur-X XML file in the PDF and update the PDF metadata.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        The /AF, /Metadata, /Names and /PageMode entries of the root object are
        set, /OutputIntents is set when ``output_intents`` yields entries, and
        the document information is updated through ``addMetadata()``.

        :param pdf_metadata: metadata handed to ``_prepare_pdf_metadata_xml``
            and ``_prepare_pdf_metadata_txt``
        :param output_intents: iterable of
            ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
            ``output_intent_dict`` is updated in place with its
            /DestOutputProfile reference
        '''
        # The entry for the file
        facturx_xml_str = self.factx.xml_str
        # The checksum must cover the embedded XML itself. md5 only accepts
        # bytes, so text is encoded for hashing purposes only.
        if isinstance(facturx_xml_str, bytes):
            xml_bytes = facturx_xml_str
        else:
            xml_bytes = facturx_xml_str.encode('utf-8')
        md5sum = hashlib.md5(xml_bytes).hexdigest()
        md5sum_obj = createStringObject(md5sum)
        params_dict = DictionaryObject({
            NameObject('/CheckSum'): md5sum_obj,
            NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
            NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
            })
        file_entry = DecodedStreamObject()
        file_entry.setData(facturx_xml_str)  # here we integrate the file itself
        file_entry.update({
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Params"): params_dict,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"): NameObject("/text#2Fxml"),
            })
        file_entry_obj = self._addObject(file_entry)
        # The Filespec entry
        ef_dict = DictionaryObject({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
            })

        xmp_filename = self.factx.flavor.details['xmp_filename']
        fname_obj = createStringObject(xmp_filename)
        filespec_dict = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Data"),
            NameObject("/Desc"): createStringObject("Factur-X Invoice"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): fname_obj,
            NameObject("/EF"): ef_dict,
            NameObject("/UF"): fname_obj,
            })
        filespec_obj = self._addObject(filespec_dict)
        name_arrayobj_cdict = {fname_obj: filespec_obj}

        # TODO: add back additional attachments?
        logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
        # Entries are ordered by file name before being flattened.
        name_arrayobj_content_sort = sorted(
            name_arrayobj_cdict.items(), key=lambda x: x[0])
        logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
        name_arrayobj_content_final = []
        af_list = []
        for (entry_name_obj, entry_filespec_obj) in name_arrayobj_content_sort:
            name_arrayobj_content_final += [entry_name_obj, entry_filespec_obj]
            af_list.append(entry_filespec_obj)
        embedded_files_names_dict = DictionaryObject({
            NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
            })

        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embedded_files_dict = DictionaryObject({
            NameObject("/EmbeddedFiles"): embedded_files_names_dict,
            })
        res_output_intents = []
        logger.debug('output_intents=%s', output_intents)
        for output_intent_dict, dest_output_profile_dict in output_intents:
            dest_output_profile_obj = self._addObject(
                dest_output_profile_dict)
            # TODO detect if there are no other objects in output_intent_dest_obj
            # than /DestOutputProfile
            output_intent_dict.update({
                NameObject("/DestOutputProfile"): dest_output_profile_obj,
                })
            output_intent_obj = self._addObject(output_intent_dict)
            res_output_intents.append(output_intent_obj)

        # Update the root
        xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
        xmp_template = self.factx.flavor.get_xmp_xml()
        metadata_xml_str = _prepare_pdf_metadata_xml(xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        metadata_file_entry.update({
            NameObject('/Subtype'): NameObject('/XML'),
            NameObject('/Type'): NameObject('/Metadata'),
            })
        metadata_obj = self._addObject(metadata_file_entry)
        af_value_obj = self._addObject(ArrayObject(af_list))
        self._root_object.update({
            NameObject("/AF"): af_value_obj,
            NameObject("/Metadata"): metadata_obj,
            NameObject("/Names"): embedded_files_dict,
            # show attachments when opening PDF
            NameObject("/PageMode"): NameObject("/UseAttachments"),
            })
        logger.debug('res_output_intents=%s', res_output_intents)
        if res_output_intents:
            self._root_object.update({
                NameObject("/OutputIntents"): ArrayObject(res_output_intents),
            })
        metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
        self.addMetadata(metadata_txt_dict)
Exemple #9
0
def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
                                            output_intents):
    """Embed the Factur-X XML file in the PDF and update the PDF metadata.

    The /AF, /Metadata, /Names and /PageMode entries of the root object are
    set, /OutputIntents is set when ``output_intents`` yields entries, and the
    document information is updated through ``addMetadata()``.

    :param pdf_filestream: PDF writer object; its ``_addObject``,
        ``_root_object`` and ``addMetadata`` members are used
    :param facturx_xml_str: content of the XML file to embed (hashed with md5,
        so it has to be bytes-like)
    :param pdf_metadata: metadata handed to ``_prepare_pdf_metadata_xml`` and
        ``_prepare_pdf_metadata_txt``
    :param facturx_level: level handed to ``_prepare_pdf_metadata_xml``
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
        ``output_intent_dict`` is updated in place with its
        /DestOutputProfile reference
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    # /ModDate must be a PDF date string ("D:YYYYMMDDHHmmSS"), not ISO 8601.
    # No offset is written because the timestamp is the naive local time.
    mod_date = datetime.datetime.now().strftime("D:%Y%m%d%H%M%S")
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)
    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
    # Only one file is attached, so the name array is a single
    # (file name, filespec) pair and /AF holds that one filespec.
    af_list = [filespec_obj]
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj]),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })
    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)
    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
Exemple #10
0
def flate_string(s):
    """Load ``s`` into a ``DecodedStreamObject`` and return its ``flateEncode()`` result."""
    stream = DecodedStreamObject()
    stream.setData(s)
    encoded = stream.flateEncode()
    return encoded
Exemple #11
0
def flate_string(s):
    """Return the result of ``flateEncode()`` for a stream object holding ``s``.

    :param s: data handed as is to ``DecodedStreamObject.setData``
    :return: whatever ``DecodedStreamObject.flateEncode`` returns
    """
    holder = DecodedStreamObject()
    holder.setData(s)
    flate_encoded = holder.flateEncode()
    return flate_encoded