Example #1
0
def appendAttachment(myPdfFileWriterObj, fname, fdata):
    """Embed ``fdata`` as a file attachment named ``fname`` in a PDF writer.

    Builds an ``/EmbeddedFile`` stream and a ``/Filespec`` dictionary that
    points at it, then registers the (name, filespec) pair in the
    ``/Names`` -> ``/EmbeddedFiles`` -> ``/Names`` array of the writer's
    root object. Missing ``/Names`` or ``/EmbeddedFiles`` levels are created,
    so a document whose ``/Names`` dictionary exists but holds no embedded
    files yet is handled as well.

    Args:
        myPdfFileWriterObj: Writer whose ``_root_object`` is modified in place.
        fname: Name under which the attachment is listed.
        fdata: Attachment payload, handed unchanged to ``setData``.

    Raises:
        KeyError: If an existing ``/EmbeddedFiles`` dictionary has no
            ``/Names`` array (for example a name tree split into ``/Kids``);
            that layout is not handled here.
    """
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    efEntry = DictionaryObject()
    efEntry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): efEntry
    })

    # The names array stores flat pairs: display name followed by filespec.
    name_pair = [createStringObject(fname), filespec]
    root = myPdfFileWriterObj._root_object

    if "/Names" not in root:
        names_dictionary = DictionaryObject()
        root.update({NameObject("/Names"): names_dictionary})
    else:
        names_dictionary = root["/Names"]

    if "/EmbeddedFiles" not in names_dictionary:
        # /Names may already exist for other purposes without any embedded
        # files; create only the missing level instead of failing.
        embeddedFilesNamesDictionary = DictionaryObject()
        embeddedFilesNamesDictionary.update(
            {NameObject("/Names"): ArrayObject(name_pair)})
        names_dictionary.update(
            {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})
    else:
        names_dictionary["/EmbeddedFiles"]["/Names"].extend(name_pair)
def remove_watermark(wm_text, inputFile, outputFile):
    """Copy a PDF, blanking every ``Tj`` string that starts with ``wm_text``.

    Each page's content stream is parsed; whenever a ``Tj`` operation has a
    text-string operand beginning with ``wm_text``, that operand is replaced
    by an empty string. All pages are then written to ``outputFile``.

    Args:
        wm_text: Prefix identifying the watermark text.
        inputFile: Path of the PDF to read.
        outputFile: Path of the PDF to write.

    Notes:
        Only ``Tj`` operations are inspected. Pages without a ``/Contents``
        entry are copied unchanged.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    show_text = b_("Tj")

    with open(inputFile, "rb") as f:
        # The original passed the file mode "rb" as a second positional
        # argument, which PdfFileReader interprets as ``strict``; it was
        # merely truthy, matching the default, so it is dropped.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # /Contents is optional (blank pages); nothing to filter then.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator != show_text or not operands:
                        continue
                    text = operands[0]
                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Written while the input file is still open: the writer is fed page
        # objects that belong to ``source``.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
Example #3
0
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """Blank watermark text in a PDF and return the reader/writer pair.

    For every selected page, each ``Tj`` operation whose text-string operand
    starts with ``wm_text`` has that operand replaced by an empty string. The
    processed pages are added to a new writer; nothing is written to disk.

    Args:
        input_file: Path of the PDF to read.
        wm_text: Prefix identifying the watermark text.
        pages: Optional selection of pages. When truthy, a page is processed
            only if its zero-based index, rendered as a string, is contained
            in ``pages``; all other pages are left out of the writer.

    Returns:
        A tuple ``(True, pdf_reader, pdf_writer)``.

    Notes:
        The input file is opened here and not closed; the handle stays
        attached to the returned reader.
        NOTE(review): presumably the caller must keep it open until the
        writer has been written, and close it afterwards -- confirm.
    """
    pdf_reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    pdf_writer = PdfFileWriter()
    show_text = b_("Tj")
    for page_index in range(pdf_reader.getNumPages()):
        # If required for specific pages
        if pages and str(page_index) not in pages:
            continue
        page = pdf_reader.getPage(page_index)
        # /Contents is optional (blank pages); add such pages untouched.
        if "/Contents" in page:
            # Get the page content
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, pdf_reader)
            # Loop through all the page elements
            for operands, operator in content.operations:
                # Check for the Tj operator and replace a matching string
                # operand (the watermark text) with ''
                if operator != show_text or not operands:
                    continue
                text = operands[0]
                if isinstance(text, str) and text.startswith(wm_text):
                    operands[0] = TextStringObject('')
            page[NameObject('/Contents')] = content
        pdf_writer.addPage(page)
    return True, pdf_reader, pdf_writer
def remove_noise(inputFile):
    """Write a copy of a PDF with the first operand of ``Tf``/``Tj`` blanked.

    Reads ``./static/img/<inputFile>``, replaces the first operand of every
    ``Tf`` and ``Tj`` content-stream operation with an empty text string, and
    writes the result to ``./static/template/<template_id>.<ext>``, where
    ``<ext>`` is the text after the last ``.`` in ``inputFile``.

    Args:
        inputFile: File name, relative to ``./static/img/``.

    Returns:
        The generated template id: ``"TID"`` followed by the ``node`` field
        of a freshly generated UUID4.
    """
    template_id = "TID" + str(uuid.uuid4().node)
    outputFile = template_id + '.' + inputFile.split('.')[-1]
    blanked_operators = (b_("Tf"), b_("Tj"))

    with open('./static/img/' + inputFile, "rb") as f:
        # The original passed the file mode "rb" as a second positional
        # argument, which PdfFileReader interprets as ``strict``; it was
        # merely truthy, matching the default, so it is dropped.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # /Contents is optional (blank pages); copy such pages as they are.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator in blanked_operators and operands:
                        operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Written while the input file is still open: the writer is fed page
        # objects that belong to ``source``.
        with open('./static/template/' + outputFile, "wb") as outputStream:
            output.write(outputStream)
    return template_id
def _facturx_update_metadata_add_attachment(pdf_filestream,
                                            facturx_xml_str,
                                            pdf_metadata,
                                            facturx_level,
                                            output_intents=None,
                                            additional_attachments=None):
    """Attach the Factur-X XML to a PDF writer and update its root metadata.

    This method is inspired from the code of the addAttachment()
    method of the PyPDF2 lib.

    The XML is embedded as a Flate-encoded ``/EmbeddedFile`` stream with a
    ``/Filespec`` named ``FACTURX_FILENAME``. Extra attachments are added via
    ``_filespec_additional_attachments``. The root object then receives
    ``/AF``, ``/Metadata``, ``/Names`` and ``/PageMode`` entries (plus
    ``/OutputIntents`` when any are supplied), and the document information
    is set through ``addMetadata``.

    Args:
        pdf_filestream: PDF writer that is modified in place.
        facturx_xml_str: XML payload. Text input is encoded to UTF-8 first.
        pdf_metadata: Passed to ``_prepare_pdf_metadata_xml`` and
            ``_prepare_pdf_metadata_txt``.
        facturx_level: Passed to ``_prepare_pdf_metadata_xml``.
        output_intents: Optional iterable of
            ``(output_intent_dict, dest_output_profile_dict)`` pairs. Each
            ``output_intent_dict`` is updated in place with a
            ``/DestOutputProfile`` reference.
        additional_attachments: Optional mapping from an attachment's binary
            content to the dict describing it, as consumed by
            ``_filespec_additional_attachments``.

    Notes:
        The ``/Names`` entry of the root object is replaced, not merged.
    """
    # Use None as the default instead of shared mutable [] / {} objects.
    if output_intents is None:
        output_intents = []
    if additional_attachments is None:
        additional_attachments = {}

    # hashlib needs bytes; encode text input (``type(u'')`` is the text type
    # on both Python 2 and Python 3) and leave binary input untouched.
    if isinstance(facturx_xml_str, type(u'')):
        facturx_xml_str = facturx_xml_str.encode('utf-8')

    # The entry for the file
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    params_dict = DictionaryObject({
        NameObject('/CheckSum'):
        md5sum_obj,
        NameObject('/ModDate'):
        createStringObject(_get_pdf_timestamp()),
        NameObject('/Size'):
        NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry = file_entry.flateEncode()
    file_entry.update({
        NameObject("/Type"):
        NameObject("/EmbeddedFile"),
        NameObject("/Params"):
        params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"):
        NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)
    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    fname_obj = createStringObject(FACTURX_FILENAME)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"):
        NameObject("/Data"),
        NameObject("/Desc"):
        createStringObject("Factur-X Invoice"),
        NameObject("/Type"):
        NameObject("/Filespec"),
        NameObject("/F"):
        fname_obj,
        NameObject("/EF"):
        ef_dict,
        NameObject("/UF"):
        fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)

    # name string -> filespec reference; the helper adds one entry per
    # additional attachment.
    name_arrayobj_cdict = {fname_obj: filespec_obj}
    for attach_bin, attach_dict in additional_attachments.items():
        _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict,
                                         attach_dict, attach_bin)

    # Flatten the entries, ordered by name, into [name, filespec, ...] for
    # the names array, and collect the filespecs alone for /AF.
    name_arrayobj_content_final = []
    af_list = []
    for entry_name_obj, entry_filespec_obj in sorted(
            name_arrayobj_cdict.items(), key=lambda x: x[0]):
        name_arrayobj_content_final += [entry_name_obj, entry_filespec_obj]
        af_list.append(entry_filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"):
        ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"):
        embedded_files_names_dict,
    })
    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"):
            dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)
    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry = metadata_file_entry.flateEncode()
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"):
        af_value_obj,
        NameObject("/Metadata"):
        metadata_obj,
        NameObject("/Names"):
        embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"):
        NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"):
            ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict,
                                     file_dict, file_bin):
    """Embed one extra attachment and record its Filespec by name.

    Adds ``file_bin`` to ``pdf_filestream`` as a Flate-encoded
    ``/EmbeddedFile`` stream, wraps it in a ``/Filespec`` dictionary, and
    stores the resulting filespec reference in ``name_arrayobj_cdict`` under
    the attachment's name string object.

    Args:
        pdf_filestream: PDF writer receiving the new objects.
        name_arrayobj_cdict: Dict updated in place (name string -> filespec).
        file_dict: Attachment description. ``'filename'`` and ``'mod_date'``
            are required; ``'desc'`` is optional and defaults to ``''``.
        file_bin: Binary content of the attachment.
    """
    attachment_name = file_dict['filename']
    pdf_mod_date = _get_pdf_timestamp(file_dict['mod_date'])
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())
    stream_params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(pdf_mod_date),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(file_bin)
    embedded_stream = embedded_stream.flateEncode()

    # Fall back to a generic binary type when the name gives no hint; the
    # '/' inside the MIME type is written as '#2f' in the subtype name.
    guessed_type = (mimetypes.guess_type(attachment_name)[0]
                    or 'application/octet-stream')
    subtype_name = '/' + guessed_type.replace('/', '#2f')
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): stream_params,
        NameObject("/Subtype"): NameObject(subtype_name),
    })
    embedded_stream_ref = pdf_filestream._addObject(embedded_stream)

    embedded_file_dict = DictionaryObject({
        NameObject("/F"): embedded_stream_ref,
    })
    name_string_obj = createStringObject(attachment_name)
    description_obj = createStringObject(file_dict.get('desc', ''))
    filespec_ref = pdf_filestream._addObject(DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): description_obj,
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_string_obj,
        NameObject("/EF"): embedded_file_dict,
        NameObject("/UF"): name_string_obj,
    }))
    name_arrayobj_cdict[name_string_obj] = filespec_ref
Example #7
0
def readPDF(inFileName, outFileName):
    """Mask a region on page 1 and blank "COPY-ONLY" text in a PDF.

    Steps:
      1. Draw a borderless white rectangle (at 450, 550, size 100 x 40 on a
         letter-sized canvas) and merge it onto the first page of the input.
      2. Serialise the merged document to an in-memory buffer and read it
         back.
      3. On every page, replace the operand of each ``Tj`` operation whose
         operands equal the "COPY-ONLY" marker bytes with an empty string.
      4. Write the result to ``outFileName``.

    Args:
        inFileName: Path of the PDF to read.
        outFileName: Path of the PDF to write.

    Returns:
        ``outFileName`` formatted as a string on success. If any exception
        occurs, its type, value, file name and line number are printed and
        ``None`` is returned.
    """
    try:
        PDFInputName = inFileName
        PDFOutputName = outFileName

        ## Put a white rectangle on page 1
        # Create a borderless white rectangle
        packet = io.BytesIO()
        can = canvas.Canvas(packet, pagesize=letter)
        # reportlab colour components range from 0 to 1; white is (1, 1, 1).
        can.setFillColorRGB(1, 1, 1)
        can.rect(450, 550, 100, 40, fill=1, stroke=0)
        can.save()

        # set to beginning of bytestream and create a new PDF
        packet.seek(0)
        newPdf = PdfFileReader(packet)
        interimOutput = PdfFileWriter()

        # The merged document is kept in memory instead of a fixed
        # "output.pdf" in the working directory, which would overwrite an
        # unrelated file and break when outFileName is "output.pdf".
        interimBuffer = io.BytesIO()

        with open(PDFInputName, 'rb') as inputStream:
            existingPdf = PdfFileReader(inputStream)

            # Get first Page and merge with rectangle
            page = existingPdf.getPage(0)
            page.mergePage(newPdf.getPage(0))
            for n in range(existingPdf.getNumPages()):
                interimOutput.addPage(existingPdf.getPage(n))

            # Must be written while the input file is still open.
            interimOutput.write(interimBuffer)

        interimBuffer.seek(0)
        pdfReader = PdfFileReader(interimBuffer)
        pdfWriter = PdfFileWriter()

        # The bytes spell "COPY-ONLY" with a NUL byte before each character.
        copyOnlyOperands = [
            b'\x00C\x00O\x00P\x00Y\x00-\x00O\x00N\x00L\x00Y'
        ]

        # Get content in all pages
        for pageNum in range(pdfReader.getNumPages()):
            # Get page
            pageObject = pdfReader.getPage(pageNum)

            # Add page to pdf writer
            pdfWriter.addPage(pageObject)

            # /Contents is optional (blank pages); nothing to filter then.
            if '/Contents' not in pageObject:
                continue

            # Get only the /Contents item in dictionary Eg : [IndirectObject(4, 0)]
            pageContentsObject = pageObject['/Contents']

            # Extract the elements of the /Contents object as a content
            # stream (cannot be printed directly)
            pageContent = ContentStream(pageContentsObject, pdfReader)

            # loop through operators and operands in contents
            for operands, operator in pageContent.operations:
                if operator == b_("Tj") and operands == copyOnlyOperands:
                    operands[0] = TextStringObject('')

            # Replace /Contents in pageObject
            pageObject.__setitem__(NameObject('/Contents'), pageContent)

        # Write to output file
        with open(PDFOutputName, "wb") as outStream:
            pdfWriter.write(outStream)

        return (f"{PDFOutputName}")

    except Exception as e:
        # Best-effort behaviour kept from the original: report and return None.
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(f"{exc_type}, {exc_obj} , {fname} : {exc_tb.tb_lineno}")
        return None