def appendAttachment(myPdfFileWriterObj, fname, fdata):
    """Embed a file in the document held by a PDF writer.

    Builds an ``/EmbeddedFile`` stream and a ``/Filespec`` dictionary for the
    attachment and registers the pair in the ``/Names`` -> ``/EmbeddedFiles``
    -> ``/Names`` array of the writer's root object. Unlike a single-shot
    attach, this can be called repeatedly to add several files.

    Args:
        myPdfFileWriterObj: Writer whose ``_root_object`` is updated in place.
        fname: Name under which the attachment is listed.
        fdata: Attachment payload, handed unchanged to ``setData``.

    Raises:
        KeyError: If an existing ``/EmbeddedFiles`` dictionary has no
            ``/Names`` array (for instance a name tree that is split into
            ``/Kids``); that layout is not handled here.
    """
    # Stream carrying the attachment payload.
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # /EF dictionary pointing at the stream.
    efEntry = DictionaryObject()
    efEntry.update({NameObject("/F"): file_entry})

    # File specification tying the displayed name to the stream.
    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): efEntry,
    })

    root = myPdfFileWriterObj._root_object
    if "/Names" not in root:
        root.update({NameObject("/Names"): DictionaryObject()})
    names_dictionary = root["/Names"]

    # The root may already carry /Names for other name trees (for example
    # named destinations) without any /EmbeddedFiles entry; create it
    # instead of failing with KeyError.
    if "/EmbeddedFiles" not in names_dictionary:
        names_dictionary.update({
            NameObject("/EmbeddedFiles"): DictionaryObject({
                NameObject("/Names"): ArrayObject(),
            }),
        })

    # The array holds alternating (name, filespec) entries.
    embedded_names = names_dictionary["/EmbeddedFiles"]["/Names"]
    embedded_names.append(createStringObject(fname))
    embedded_names.append(filespec)
def remove_watermark(wm_text, inputFile, outputFile):
    """Blank out watermark text in a PDF and write the result to a new file.

    Every ``Tj`` operation whose first operand is a ``str`` starting with
    ``wm_text`` gets that operand replaced by an empty text string. All pages
    are copied to the output, whether or not anything was replaced on them.

    Args:
        wm_text: Prefix identifying the watermark string.
        inputFile: Path of the PDF to read.
        outputFile: Path the processed PDF is written to.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    show_text = b_("Tj")

    with open(inputFile, "rb") as f:
        # The previous call passed the file mode "rb" as the second
        # positional argument, where it was taken as a truthy ``strict``
        # flag. Spell out the same effective setting.
        source = PdfFileReader(f, strict=True)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # /Contents is optional on a page; a page without it has no text
            # to scrub and is copied unchanged instead of raising KeyError.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator != show_text:
                        continue
                    text = operands[0]
                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Written while the input is still open: the writer pulls page data
        # from the reader's stream.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """Remove a text watermark from the pages of a PDF file.

    Every ``Tj`` operation whose first operand is a ``str`` starting with
    ``wm_text`` gets that operand replaced by an empty text string.

    Args:
        input_file: Path of the PDF to read.
        wm_text: Prefix identifying the watermark string.
        pages: Optional collection of page selectors. A page is processed
            only when ``str(index)`` (zero-based index) is found in it;
            pages that are not selected are left out of the writer entirely.
            When empty or ``None`` every page is processed.

    Returns:
        A ``(True, pdf_reader, pdf_writer)`` tuple; nothing is written to
        disk by this function.
    """
    # The file handle is intentionally left open: the reader and writer are
    # handed back to the caller, who presumably still needs the underlying
    # stream when the writer is finally written out.
    pdf_reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    pdf_writer = PdfFileWriter()
    show_text = b_("Tj")

    for page_index in range(pdf_reader.getNumPages()):
        # Restrict to the requested pages, if any were given.
        if pages and str(page_index) not in pages:
            continue

        page = pdf_reader.getPage(page_index)

        # /Contents is optional on a page; a page without it has nothing to
        # scrub and is passed through instead of raising KeyError.
        if "/Contents" in page:
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, pdf_reader)

            # Replace the string operand of matching Tj operations with ''.
            for operands, operator in content.operations:
                if operator != show_text:
                    continue
                text = operands[0]
                if isinstance(text, str) and text.startswith(wm_text):
                    operands[0] = TextStringObject('')

            page[NameObject('/Contents')] = content

        pdf_writer.addPage(page)

    return True, pdf_reader, pdf_writer
def remove_noise(inputFile):
    """Write a copy of a PDF with its ``Tf`` and ``Tj`` operands blanked.

    The source is read from ``./static/img/<inputFile>``. For every ``Tf``
    or ``Tj`` operation in each page's content stream the first operand is
    replaced by an empty text string. The result is stored under
    ``./static/template/`` using a freshly generated identifier plus the
    original file extension.

    Args:
        inputFile: File name (not a full path) of the PDF inside
            ``./static/img/``.

    Returns:
        The generated identifier, i.e. the output file name without its
        extension.
    """
    template_id = "TID" + str(uuid.uuid4().node)
    extension = inputFile.split('.')[-1]
    outputFile = template_id + '.' + extension

    blanked_operators = (b_("Tf"), b_("Tj"))

    with open('./static/img/' + inputFile, "rb") as f:
        # NOTE(review): "rb" is passed as the reader's second positional
        # argument, not as a file mode — confirm that this is intended.
        source = PdfFileReader(f, "rb")
        output = PdfFileWriter()

        for index in range(source.getNumPages()):
            current = source.getPage(index)
            stream = ContentStream(current["/Contents"].getObject(), source)

            for operands, operator in stream.operations:
                if operator in blanked_operators:
                    operands[0] = TextStringObject('')

            current[NameObject('/Contents')] = stream
            output.addPage(current)

        with open('./static/template/' + outputFile, "wb") as outputStream:
            output.write(outputStream)

    return template_id
def _facturx_update_metadata_add_attachment(
        pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
        output_intents=[], additional_attachments={}):
    """Attach the Factur-X XML to the PDF writer and refresh its metadata.

    Inspired by the code of the addAttachment() method of the PyPDF2 lib.

    The XML is stored as a flate-encoded embedded file with a file
    specification, optional extra attachments are registered alongside it,
    and the writer's root object receives ``/AF``, ``/Metadata``, ``/Names``,
    ``/PageMode`` and, when supplied, ``/OutputIntents``. Finally the
    document information dictionary is updated.

    Args:
        pdf_filestream: PDF writer that is modified in place.
        facturx_xml_str: XML payload; it is hashed and embedded as given, so
            it is presumably already ``bytes`` (no encoding is done here).
        pdf_metadata: Metadata passed on to the XML/text metadata helpers.
        facturx_level: Level passed on to the XML metadata helper.
        output_intents: Iterable of ``(output_intent_dict,
            dest_output_profile_dict)`` pairs. Each ``output_intent_dict`` is
            updated in place with its ``/DestOutputProfile`` reference.
        additional_attachments: Mapping of attachment content to a dict
            describing it (see ``_filespec_additional_attachments``).
    """
    # --- embedded file stream for the XML ---
    checksum_obj = createStringObject(hashlib.md5(facturx_xml_str).hexdigest())
    params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    xml_stream = DecodedStreamObject()
    xml_stream.setData(facturx_xml_str)  # the file content itself
    xml_stream = xml_stream.flateEncode()
    xml_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params,
        # '#2F' is the hexadecimal escape for '/'
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    xml_stream_ref = pdf_filestream._addObject(xml_stream)

    # --- file specification for the XML ---
    ef_dict = DictionaryObject({
        NameObject("/F"): xml_stream_ref,
        NameObject('/UF'): xml_stream_ref,
    })
    facturx_name = createStringObject(FACTURX_FILENAME)
    facturx_filespec_ref = pdf_filestream._addObject(DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): facturx_name,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): facturx_name,
    }))

    # --- collect every attachment, keyed by file name ---
    specs_by_name = {facturx_name: facturx_filespec_ref}
    for attach_bin, attach_dict in additional_attachments.items():
        _filespec_additional_attachments(
            pdf_filestream, specs_by_name, attach_dict, attach_bin)

    # The names array lists (name, filespec) pairs ordered by name.
    ordered_specs = sorted(specs_by_name.items(), key=lambda pair: pair[0])
    names_array_items = []
    af_list = []
    for name_obj, spec_ref in ordered_specs:
        names_array_items.extend((name_obj, spec_ref))
        af_list.append(spec_ref)

    # Entry for the root, which needs a reference to the Filespec.
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): DictionaryObject({
            NameObject("/Names"): ArrayObject(names_array_items),
        }),
    })

    # --- output intents ---
    intent_refs = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        profile_ref = pdf_filestream._addObject(dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): profile_ref,
        })
        intent_refs.append(pdf_filestream._addObject(output_intent_dict))

    # --- XMP metadata stream ---
    xmp_stream = DecodedStreamObject()
    xmp_stream.setData(_prepare_pdf_metadata_xml(facturx_level, pdf_metadata))
    xmp_stream = xmp_stream.flateEncode()
    xmp_stream.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_ref = pdf_filestream._addObject(xmp_stream)
    af_ref = pdf_filestream._addObject(ArrayObject(af_list))

    # --- update the root ---
    root_entries = {
        NameObject("/AF"): af_ref,
        NameObject("/Metadata"): metadata_ref,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    }
    if intent_refs:
        root_entries[NameObject("/OutputIntents")] = ArrayObject(intent_refs)
    pdf_filestream._root_object.update(root_entries)

    # --- document information dictionary ---
    pdf_filestream.addMetadata(_prepare_pdf_metadata_txt(pdf_metadata))
def _filespec_additional_attachments(
        pdf_filestream, name_arrayobj_cdict, file_dict, file_bin):
    """Register one extra attachment with the PDF writer.

    Creates a flate-encoded embedded-file stream and a file specification
    for ``file_bin``, adds both to the writer, and records the file
    specification reference in ``name_arrayobj_cdict`` under the file name.

    Args:
        pdf_filestream: PDF writer receiving the new objects.
        name_arrayobj_cdict: Dict updated in place; maps the file-name
            string object to the file specification reference.
        file_dict: Description of the attachment. ``'filename'`` and
            ``'mod_date'`` are required; ``'desc'`` is optional and defaults
            to an empty string.
        file_bin: Content of the attachment; it is hashed and embedded as
            given.
    """
    filename = file_dict['filename']
    mod_date_pdf = _get_pdf_timestamp(file_dict['mod_date'])
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())

    params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(mod_date_pdf),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    stream = DecodedStreamObject()
    stream.setData(file_bin)
    stream = stream.flateEncode()

    # Fall back to a generic type when the name gives no hint; the '/' of
    # the MIME type is written as the '#2f' escape inside the PDF name.
    mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
    subtype = '/' + mimetype.replace('/', '#2f')

    stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params,
        NameObject("/Subtype"): NameObject(subtype),
    })
    stream_ref = pdf_filestream._addObject(stream)

    ef_dict = DictionaryObject({
        NameObject("/F"): stream_ref,
    })
    name_obj = createStringObject(filename)
    filespec_ref = pdf_filestream._addObject(DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): name_obj,
    }))

    name_arrayobj_cdict[name_obj] = filespec_ref
def readPDF(inFileName, outFileName):
    """Cover a region of page 1 and blank a fixed text string in a PDF.

    Works in two passes:

    1. A borderless filled rectangle is drawn at (450, 550), 100 wide and
       40 high, on a letter-size canvas and merged onto the first page of
       the input. All pages are then saved to the interim file
       ``output.pdf`` in the current working directory.
    2. The interim file is read back. On every page, each ``Tj`` operation
       whose operand list equals the hard-coded byte string below (the
       characters ``COPY-ONLY``, each preceded by a zero byte) has its
       operand replaced by an empty text string. The result is written to
       ``outFileName``.

    Args:
        inFileName: Path of the PDF to read.
        outFileName: Path the processed PDF is written to.

    Returns:
        ``outFileName`` formatted as a string on success. If any exception
        occurs it is printed (type, value, file name, line number) and the
        function falls through, returning ``None``.
    """
    try:
        PDFInputName = inFileName
        PDFOutputName = outFileName
        # Interim file is a fixed name relative to the working directory.
        PDFInterimName = "output.pdf"

        # --- Put a white rectangle on page 1 ---
        # Create a borderless filled rectangle in an in-memory PDF.
        packet = io.BytesIO()
        can = canvas.Canvas(packet, pagesize=letter)
        # NOTE(review): the fill colour is given as 255, 255, 255; confirm
        # the value range the canvas expects for RGB components.
        can.setFillColorRGB(255, 255, 255)
        can.rect(450, 550, 100, 40, fill=1, stroke=0)
        can.save()

        # Rewind the byte stream and open it as a new PDF.
        packet.seek(0)
        newPdf = PdfFileReader(packet)

        interimOutput = PdfFileWriter()

        with open(PDFInputName, 'rb') as fileStream:
            existingPdf = PdfFileReader(fileStream)

            # Get the first page and merge the rectangle onto it.
            page = existingPdf.getPage(0)
            page.mergePage(newPdf.getPage(0))

            # Copy every page (including the merged first one) to the
            # interim writer.
            numPages = existingPdf.getNumPages()
            for n in range(numPages):
                interimOutput.addPage(existingPdf.getPage(n))

            # NOTE: this rebinds fileStream; presumably harmless because the
            # input file is not touched again afterwards — TODO confirm.
            with open(PDFInterimName, "wb") as fileStream:
                interimOutput.write(fileStream)

        pdfWriter = PdfFileWriter()

        with open(PDFInterimName, 'rb') as fileHandle:
            # Read the interim file and extract document information.
            pdfReader = PdfFileReader(fileHandle)
            pdfInfo = pdfReader.getDocumentInfo()
            numPages = pdfReader.getNumPages()

            # Built but not used further in this function; the attribute
            # reads still run, so a failure here ends up in the handler below.
            PDFAllInfo = {
                'author': pdfInfo.author,
                'creator': pdfInfo.creator,
                'producer': pdfInfo.producer,
                'subject': pdfInfo.subject,
                'title': pdfInfo.title,
                'num_pages': numPages
            }

            # Process the content of every page.
            for pageNum in range(numPages):
                # Get the page.
                pageObject = pdfReader.getPage(pageNum)

                # Get only the /Contents item of the page dictionary,
                # e.g. [IndirectObject(4, 0)].
                pageContentsObject = pageObject['/Contents']

                # Parse /Contents into a content stream (it cannot be
                # printed directly).
                pageContent = ContentStream(
                    pageContentsObject, pdfReader)

                # Add the page to the PDF writer.
                pdfWriter.addPage(pageObject)

                # Loop through the operators and operands of the content.
                for operands, operator in pageContent.operations:
                    if operator == b_("Tj") and operands == [
                            b'\x00C\x00O\x00P\x00Y\x00-\x00O\x00N\x00L\x00Y'
                    ]:
                        operands[0] = TextStringObject('')

                # Replace /Contents of the page with the edited stream.
                pageObject.__setitem__(NameObject('/Contents'), pageContent)

            # Write to the output file while the interim file is still open.
            with open(PDFOutputName, "wb") as outStream:
                pdfWriter.write(outStream)

        return (f"{PDFOutputName}")

    except Exception as e:
        # Report where the failure happened; the exception is not re-raised.
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(f"{exc_type}, {exc_obj} , {fname} : {exc_tb.tb_lineno}")