def __init__(self, id: int):
    """Load the report to work on and prepare an empty PDF writer.

    :param id: value looked up as ``report_id`` on ``Report``. The name
        shadows the builtin ``id`` but is kept so keyword callers still work.
    """
    report = Report.objects.get(report_id=id)
    self.report_model = report
    self.pdf_writer = PyPDF2.PdfFileWriter()
import PyPDF2
import os

# PyPDF2 can only extract text from a PDF (not other data) and offers a
# limited set of page-level actions.

# Writer for the PDF that is assembled in memory by this script.
writer = PyPDF2.PdfFileWriter()

# Both inputs must stay open until the writer has been flushed, because the
# pages added to the writer are read from the source streams at write time.
# The context managers also close every handle if any step raises.
with open('meetingminutes.pdf', 'rb') as pdf1File, \
        open('meetingminutes2.pdf', 'rb') as pdf2File:
    for source in (pdf1File, pdf2File):
        reader = PyPDF2.PdfFileReader(source)
        # Append every page of this input, in order, to the new PDF.
        for page_num in range(reader.numPages):
            writer.addPage(reader.getPage(page_num))

    with open('CombinedPDFs.pdf', 'wb') as outputfile:
        writer.write(outputfile)
def __init__(self, in_file: str, out_file: str):
    """Open ``in_file`` leniently and clone its document root into a writer.

    :param in_file: path of the PDF to read (parsed with ``strict=False``).
    :param out_file: path stored on the instance as ``out_file``.
    """
    source = PyPDF2.PdfFileReader(in_file, strict=False)
    self.__reader = source
    target = PyPDF2.PdfFileWriter()
    self.__writer = target
    target.cloneReaderDocumentRoot(source)
    self.out_file = out_file
def add_files_to_pdf(
    self,
    files: list = None,
    target_document: str = None,
) -> None:
    """Add images and/or pdfs to new PDF document

    Image formats supported are JPEG, PNG and GIF.

    A file can be given extra properties by appending `:` followed by the
    properties to the end of its filename.

    Supported extra properties are:

    - for PDF, the page numbers to be included in the new PDF document
    - for images, the position of the image in the new PDF document

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Add files to pdf
            ${files}=    Create List
            ...    ${TESTDATA_DIR}${/}invoice.pdf
            ...    ${TESTDATA_DIR}${/}approved.png:center
            ...    ${TESTDATA_DIR}${/}robot.pdf:1
            ...    ${TESTDATA_DIR}${/}approved.png:x=0,y=0
            ...    ${TESTDATA_DIR}${/}robot.pdf:2-10,15
            ...    ${TESTDATA_DIR}${/}approved.png
            Add Files To PDF    ${files}    newdoc.pdf

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        list_of_files = [
            'invoice.pdf',
            'approved.png:center',
            'robot.pdf:1',
            'approved.png:x=0,y=0',
        ]
        def example_keyword():
            pdf.add_files_to_pdf(
                files=list_of_files,
                target_document="output/output.pdf"
            )

    :param files: list of filepaths to add into PDF (can be either images
        or PDFs); ``None`` is treated as an empty list
    :param target_document: filepath of target PDF
    """
    writer = PyPDF2.PdfFileWriter()
    # A private scratch directory per call replaces the old fixed
    # "<tmp>/temp.pdf", which concurrent runs could overwrite. It is kept
    # until the writer has been flushed in case page data is read lazily.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, entry in enumerate(files or []):
            file_to_add = Path(entry)
            # Only the file name is split, so drive letters / directories
            # containing ":" are left alone.
            namesplit = file_to_add.name.rsplit(":", 1)
            basename = namesplit[0]
            parameters = namesplit[1] if len(namesplit) == 2 else None
            file_to_add = file_to_add.parent / basename
            image_filetype = imghdr.what(str(file_to_add))
            self.logger.info("File %s type: %s", file_to_add, image_filetype)
            if basename.endswith(".pdf"):
                reader = PyPDF2.PdfFileReader(str(file_to_add), strict=False)
                pagecount = reader.getNumPages()
                pages = self._get_pages(pagecount, parameters)
                for n in pages:
                    # Requested page numbers are 1-based.
                    try:
                        page = reader.getPage(n - 1)
                        writer.addPage(page)
                    except IndexError:
                        self.logger.warning(
                            "File %s does not have page %d", file_to_add, n
                        )
            elif image_filetype in ["png", "jpg", "jpeg", "gif"]:
                # Render the image onto a one-page PDF, then import that page.
                temp_pdf = os.path.join(scratch_dir, "image_%d.pdf" % index)
                pdf = FPDF()
                pdf.set_margin(0)
                pdf.add_page()
                x, y, width, height = self._get_image_coordinates(
                    str(file_to_add), parameters
                )
                pdf.image(name=file_to_add, x=x, y=y, w=width, h=height)
                pdf.output(name=temp_pdf)
                reader = PyPDF2.PdfFileReader(temp_pdf)
                writer.addPage(reader.getPage(0))

        with open(target_document, "wb") as target:
            writer.write(target)
import PyPDF2
import os
import sys

# Usage: <script> watermark.pdf source.pdf output.pdf
# Stamps the first page of watermark.pdf onto every page of source.pdf.
if len(sys.argv) != 4:
    raise RuntimeError('Usage {} watermark.pdf source.pdf output.pdf'.format(
        sys.argv[0]))

watermark_path = sys.argv[1]
if not os.path.isfile(watermark_path):
    raise RuntimeError('Watermark file {} not found'.format(watermark_path))

source_path = sys.argv[2]
if not os.path.isfile(source_path):
    raise RuntimeError('Source file {} not found'.format(source_path))

# Both inputs stay open until the output is written (pages are read from the
# source streams at write time) and are closed afterwards, even on error.
with open(watermark_path, 'rb') as watermark_file, \
        open(source_path, 'rb') as source_file:
    watermark = PyPDF2.PdfFileReader(watermark_file)
    source = PyPDF2.PdfFileReader(source_file)

    watermark_page = watermark.getPage(0)
    pdf_output = PyPDF2.PdfFileWriter()

    for pageNum in range(source.numPages):
        pageObj = source.getPage(pageNum)
        pageObj.mergePage(watermark_page)
        pdf_output.addPage(pageObj)

    with open(sys.argv[3], 'wb') as output_file:
        pdf_output.write(output_file)
def exportPdf(fin, abpath_out, annotations):
    '''Export PDF with annotations.

    Args:
        fin (str): abspath to input PDF file.
        abpath_out (str): abspath to output PDF file.
        annotations (dict): annotation info. See import_mendeley.py
                            getHighlights() for more info. The optional keys
                            'highlights' and 'notes' map 1-based page numbers
                            to lists of annotation dicts.

    Returns:
        None. Encrypted inputs are skipped and no output is written.

    Raises:
        IOError: if <fin> cannot be opened (a warning is logged first).
    '''

    try:
        pdf_in = open(fin, 'rb')
    except IOError:
        LOGGER.warning('Could not open pdf file %s', fin)
        # Previously execution fell through and died on an unbound `inpdf`;
        # re-raise the real cause instead.
        raise

    # The input must stay open until the output is written, because page
    # content is pulled from the source stream at write time.
    with pdf_in:
        inpdf = PyPDF2.PdfFileReader(pdf_in)
        if inpdf.isEncrypted:
            # PyPDF2 seems to think some files are encrypted even
            # if they are not. Trying to decrypt takes a lot of time and
            # this rarely happens to academic docs, so such files are
            # simply skipped.
            return

        # retain meta data (absent when the PDF has no Info dictionary)
        meta = inpdf.getDocumentInfo()
        outpdf = PyPDF2.PdfFileWriter()
        if meta is not None:
            outpdf.addMetadata(meta)

        highlights = annotations.get('highlights', None)
        if highlights is None:
            highlights = {}
        notes = annotations.get('notes', None)
        if notes is None:
            notes = {}

        #----------------Loop through pages----------------
        for pii in range(1, inpdf.getNumPages() + 1):

            inpg = inpdf.getPage(pii - 1)

            #----------------Process highlights----------------
            if pii in highlights:
                for hjj in highlights[pii]:
                    # Changes suggested by matteosecli: add author of highlight:
                    anno = pdfannotation.createHighlight(hjj["rect"],
                            author=hjj['author'],
                            cdate=hjj["cdate"], color=hjj['color'])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, anno)

            #------------------Process notes------------------
            if pii in notes:
                for njj in notes[pii]:
                    note = pdfannotation.createNote(njj["rect"],
                            contents=njj["content"], author=njj["author"],
                            cdate=njj["cdate"])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, note)

            outpdf.addPage(inpg)

        #-----------------------Save-----------------------
        if os.path.isfile(abpath_out):
            os.remove(abpath_out)

        with open(abpath_out, mode='wb') as fout:
            outpdf.write(fout)

    LOGGER.debug('Exported annotated pdf.')

    return
def rotate_page(
    self,
    pages: ListOrString,
    source_path: str = None,
    output_path: str = None,
    clockwise: bool = True,
    angle: int = 90,
) -> None:
    """Rotate pages in source PDF document and save to target PDF document.

    If no source path given, assumes a PDF is already opened.

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            Rotate Page
            ...          source_path=/tmp/sample.pdf
            ...          output_path=/tmp/output.pdf
            ...          pages=5

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def rotate_page():
            pdf.rotate_page(
                source_path="/tmp/sample.pdf",
                output_path="/tmp/output.pdf",
                pages=5
            )

    :param pages: page numbers to rotate.
    :param source_path: filepath to the source pdf.
    :param output_path: filepath to the target pdf, stored by default
        to `output_directory`.
    :param clockwise: direction that page will be rotated to, default True.
    :param angle: number of degrees to rotate, default 90.
    """
    # TODO: don't save to a new file every time
    self.switch_to_pdf(source_path)
    reader = self.ctx.active_pdf_document.reader
    writer = PyPDF2.PdfFileWriter()

    output_filepath = Path(output_path) if output_path else self.default_output

    pages = self._get_page_numbers(pages, reader)
    for page in range(reader.getNumPages()):
        source_page = reader.getPage(page)
        # NOTE(review): `page` is a 0-based index here; if
        # `_get_page_numbers` returns 1-based numbers this rotates the page
        # after the requested one -- confirm against that helper.
        if page in pages:
            if clockwise:
                source_page.rotateClockwise(int(angle))
            else:
                source_page.rotateCounterClockwise(int(angle))
        # Every page is copied to the output, rotated or not.
        writer.addPage(source_page)
    with open(str(output_filepath), "wb") as f:
        writer.write(f)
pdf_reader.getIsEncrypted() # False pdf_reader.getNumPages() # It will return number of pages in pdf # 3 page1 = pdf_reader.getPage(0) page1.extractText() page2 = pdf_reader.getPage(1) page2.extractText() # Append Write or Merge PDf pdf_writer = pdf.PdfFileWriter() pdf_writer.addPage(page1) pdf_writer.addPage(page2) output = open('E:\Projects\Extract Text From PDF File\Pages.pdf','wb') pdf_writer.write(output) output.close()
def _copy_page_range(reader, first, last):
    """Return an in-memory PDF holding pages first..last of *reader*.

    Page numbers are 1-based and inclusive; an empty range produces a PDF
    without pages.
    """
    import io

    writer = PyPDF2.PdfFileWriter()
    for page_num in range(first, last + 1):
        # PyPDF2 page indexes start at 0, hence the -1.
        writer.addPage(reader.getPage(page_num - 1))
    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer


def pdf_replace(request):
    """Replace one page of an uploaded PDF with the pages of another PDF.

    On a valid POST, page ``page`` of ``file2`` is replaced by all pages of
    ``file1`` and the result is returned as the download
    ``replaced_file.pdf``. ``page == 0`` prepends ``file1`` and
    ``page == total + 1`` appends it, without removing any page.

    The result is built in memory, so concurrent requests no longer
    overwrite each other's fixed scratch files under ``media/``.

    On GET, or when the form is invalid, the form page is rendered; an
    invalid form is rendered bound so that its errors are shown.
    """
    import io

    if request.method == 'POST':
        # Submitted via POST: bind the form to the uploaded data.
        form = PdfReplaceForm(request.POST, request.FILES)
        if form.is_valid():
            f1 = form.cleaned_data['file1']  # PDF whose pages are inserted
            f2 = form.cleaned_data['file2']  # PDF in which a page is replaced
            page = form.cleaned_data['page']  # 1-based page number to replace

            pdfFileObj = PyPDF2.PdfFileReader(f2)
            total_page = pdfFileObj.getNumPages()

            if 0 <= page <= total_page + 1:
                # Part 1: pages before the replaced one; part 2: pages after.
                part_1 = _copy_page_range(pdfFileObj, 1, page - 1)
                part_2 = _copy_page_range(pdfFileObj, page + 1, total_page)

                pdfMerger = PyPDF2.PdfFileMerger()
                pdfMerger.append(PyPDF2.PdfFileReader(part_1))
                pdfMerger.append(PyPDF2.PdfFileReader(f1))
                pdfMerger.append(PyPDF2.PdfFileReader(part_2))

                merged = io.BytesIO()
                pdfMerger.write(merged)
                merged.seek(0)

                # The MIME type must be given to the constructor; assigning
                # response['content_type'] only adds an unrelated header.
                response = FileResponse(
                    merged, content_type="application/octet-stream")
                response[
                    'Content-Disposition'] = 'attachment; filename="replaced_file.pdf"'
                return response

            # Out-of-range page: report it instead of raising IndexError.
            form.add_error(
                'page',
                'Page number must be between 0 and %d.' % (total_page + 1))
        # An invalid form falls through and is rendered with its errors.
    else:
        # Not a POST: show an empty form.
        form = PdfReplaceForm()

    return render(request, 'pdf/pdf_replace.html', {'form': form})
): # Loop, for each page of the input pdf page = input_reader.getPage( page_num ) # Creating the Page Object for each page of the input pdf page.mergePage( watermark_page ) # Merging pages using the method of the Page Object writer_object.addPage( page ) # Passing the merged page to the writer object with addPage method with open( 'super_watermarked.pdf', 'wb' ) as result_file: # Creating the new archive, with merged page writer_object.write( result_file ) # Writing each merged page into new archive # watermark('super.pdf', 'wtr.pdf.pdf') template_pdf = PyPDF2.PdfFileReader(open('super.pdf', 'rb')) watermark_pdf = PyPDF2.PdfFileReader(open('wtr.pdf.pdf', 'rb')) writer_object = PyPDF2.PdfFileWriter() for i in range(template_pdf.numPages): page = template_pdf.getPage(i) page.mergePage(watermark_pdf.getPage(0)) writer_object.addPage(page) with open('watermarked_output.pdf', 'wb') as file: writer_object.write(file)
import PyPDF2 as ppdf

writer = ppdf.PdfFileWriter()

# Inputs stay open until the writer is flushed (pages are read from the
# source streams at write time); every handle, including the previously
# never-closed output, is now closed by its context manager.
with open('meetingminutes1.pdf', 'rb') as pdffile1, \
        open('meetingminutes2.pdf', 'rb') as pdffile2:
    reader1 = ppdf.PdfFileReader(pdffile1)
    reader2 = ppdf.PdfFileReader(pdffile2)

    for p1 in range(reader1.numPages):
        # For an odd index the preceding (even-index) page is added instead,
        # so each even-index page appears twice and odd-index pages never.
        # NOTE(review): looks like an experiment -- confirm this is intended.
        page = reader1.getPage(p1 - 1 if p1 % 2 else p1)
        writer.addPage(page)

    for p2 in range(reader2.numPages):
        writer.addPage(reader2.getPage(p2))

    with open('combined.pdf', 'wb') as outputfile:
        writer.write(outputfile)
def _show_split_warning(self, parent, text):
    """Show a modal warning box titled 'Error split pdf-file'."""
    error_msg = QtWidgets.QMessageBox(parent)
    error_msg.setIcon(QtWidgets.QMessageBox.Warning)
    error_msg.setText(text)
    error_msg.setWindowTitle('Error split pdf-file')
    error_msg.exec_()


def _supplier_number(self, check_text):
    """Return the last 5 characters of the first 10-char token starting with '26'."""
    return list(
        filter(lambda f: len(f) == 10 and f[:2] == '26', check_text))[0][-5:]


def _save_supplier_pdf(self, writer, suppl_no):
    """Write *writer* into the export directory; return True on success.

    On any failure a warning dialog is shown and False is returned.
    """
    output_file = ('\\Reconciliation_act~Acc_act~60~' + suppl_no + '~'
                   + datetime.date.today().strftime('%Y%m%d') + '.pdf')
    try:
        with open(self.export_directory.text() + output_file,
                  'wb') as outfile:
            writer.write(outfile)
    except Exception:
        self._show_split_warning(self.centralwidget, 'Error save pdf-file!')
        return False
    return True


def split_pdf_file(self):
    """Split the chosen PDF into one file per supplier.

    A page whose extracted text has more than 5 tokens and whose second
    token is '10.11.98' starts a new supplier document; following pages with
    more than 5 tokens are appended to it; other pages are dropped. Each
    document is saved into the export directory. Progress and the number of
    processed suppliers are shown in the UI, and every failure is reported
    through a warning dialog before returning.
    """
    if not self.pdf_name.text():
        self._show_split_warning(self,
                                 'For processing please choose pdf-file!')
        return
    if not self.export_directory.text():
        self._show_split_warning(
            self, 'For processing please choose export directory!')
        return

    self.progressBar.show()
    self.label_help.hide()
    self.label_qty.show()

    try:
        pdfFileObj = open(self.pdf_name.text(), 'rb')
    except Exception:
        self._show_split_warning(self.centralwidget, 'Error open pdf-file!')
        return

    # The source must stay open until the last part is written; the context
    # manager also closes it on every early return.
    with pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        self.progressBar.setRange(0, pdfReader.numPages - 1)
        maxVal = self.progressBar.maximum()
        writer = PyPDF2.PdfFileWriter()
        qty_suppl = 0

        for i in range(pdfReader.numPages):
            # setValue() takes an int; truncate explicitly.
            self.progressBar.setValue(int(i + (maxVal - i) / 100))
            QtWidgets.QApplication.processEvents()
            pageObj = pdfReader.getPage(i)
            check_text = pageObj.extractText().split()
            if len(check_text) <= 5:
                continue  # pages with too little text are skipped

            starts_supplier = check_text[1] == '10.11.98'
            if starts_supplier and i != 0:
                # A new supplier begins: flush the pages collected so far.
                # NOTE(review): `suppl_no` is unbound here when page 0 was
                # not a supplier start page (unchanged behavior) -- confirm
                # the input always begins with one.
                if not self._save_supplier_pdf(writer, suppl_no):
                    return
                qty_suppl += 1
                self.label_qty.setText('Processed: ' + str(qty_suppl) +
                                       ' supplier(s)')
                QtWidgets.QApplication.processEvents()
                writer = PyPDF2.PdfFileWriter()
            if starts_supplier:
                suppl_no = self._supplier_number(check_text)
            writer.addPage(pageObj)

        # Save the pages of the last supplier.
        if not self._save_supplier_pdf(writer, suppl_no):
            return
        qty_suppl += 1
        self.label_qty.setText('Processed: ' + str(qty_suppl) +
                               ' supplier(s)')

    success_msg = QtWidgets.QMessageBox(self)
    success_msg.setIcon(QtWidgets.QMessageBox.Information)
    success_msg.setText('Processed: ' + str(qty_suppl) + ' supplier(s)')
    QtWidgets.QApplication.processEvents()
    success_msg.setWindowTitle('Success split pdf-file')
    success_msg.exec_()
    self.progressBar.hide()
    self.label_help.show()
def lambda_handler(event, context):
    """Convert an uploaded zip of images into a single PDF stored in S3.

    Reads bucket and key from the first S3 record of *event*, downloads the
    zip, unpacks it, converts the images to PDFs, merges them, and uploads
    the result under ``dst/<createDate>/<name>.pdf`` in the same bucket.

    Returns 'success!!!', or an 'Unsupported extension. : ...' message when
    the object's content type is not one of the accepted zip types. Any
    exception is printed and re-raised.
    """
    global basedir

    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = urllib.parse.unquote_plus(s3_record['object']['key'],
                                    encoding='utf-8')
    print(bucket, key)
    try:
        # Fetch the object so its content type can be checked.
        response = s3.get_object(Bucket=bucket, Key=key)
        content_type = response['ContentType']
        accepted_types = ('application/zip', 'application/x-zip-compressed',
                          'binary/octet-stream')
        if content_type not in accepted_types:
            return 'Unsupported extension. : ' + content_type

        # Key layout: src/yyyymmddhhmmssfff/filename.zip
        keyItems = key.split('/')
        createDate = keyItems[1]
        zipname = keyItems[2]

        name, ext = getNameAndExtention(zipname)

        # Work folder and storage paths.
        pdfname = name + '.pdf'  # file name of the PDF
        dated_prefix = createDate + "_"
        zipPath = os.path.join(basedir, dated_prefix + zipname)  # saved zip
        imgPath = os.path.join(basedir, 'img', createDate, name)  # unzip target
        pdfPath = os.path.join(basedir, dated_prefix + pdfname)  # saved PDF
        pdfkey = '/'.join(['dst', createDate, pdfname])  # S3 key of the PDF

        os.makedirs(imgPath, exist_ok=True)

        for path in (zipPath, imgPath, pdfPath):
            print(path)

        # Download from S3 into the work folder, then unpack.
        s3_client.download_file(bucket, key, zipPath)
        unzip(zipPath, imgPath, ext)

        # Collect the images and convert each one to a PDF.
        imgList = []
        searchImg(imgList, imgPath)
        pdflist = []
        changeImg2Pdf(imgList, pdflist)

        # Merge the PDFs and write the result.
        pdfWriter = PyPDF2.PdfFileWriter()
        margePdf(pdflist, pdfWriter)
        writePdf(pdfWriter, pdfPath)

        # Upload the PDF to S3.
        s3_client.upload_file(pdfPath, bucket, pdfkey)

        return 'success!!!'

    except Exception as e:
        print(e)
        print(
            'Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'
            .format(key, bucket))
        raise
file1 = open(file_path, 'rb') read1 = PyPDF2.PdfFileReader(file1) page1 = read1.getPage(0) #page.rotateClockwise(90) #pdf = PdfFileReader(file('Java Printing.pdf')) page = read1.getPage(0).mediaBox if page.getUpperRight_x() - page.getUpperLeft_x() > page.getUpperRight_y( ) - page.getLowerRight_y(): print('Landscape') else: print('Portrait') page1.rotateClockwise(90) # save (rotated) page file2 = PyPDF2.PdfFileWriter() file2.addPage(page1) result = open('Balances.pdf', 'wb') file2.write(result) result.close() file1.close() # read from pdf fp = open('Balances.pdf', 'rb') parser = PDFParser(fp) doc = PDFDocument() parser.set_document(doc) doc.set_parser(parser) doc.initialize('') rsrcmgr = PDFResourceManager() laparams = LAParams()
def send_public_report(rating_decision_obj):
    """Download the public report, password protect it and email it.

    The report is fetched through ``get_public_report``, downloaded,
    re-written as an encrypted PDF with a generated user password, and
    queued for sending to every insider linked to the rating decision (cc to
    the primary analyst). A second email carries the password. Deletion of
    the local file is scheduled 10 minutes later.

    If no report document exists (``AnalyticalDocument.DoesNotExist``)
    nothing is sent.

    :param RatingDecision: rating decision object
    """

    try:
        # Inside the `try` so that a missing document reaches the handler
        # below; it was previously fetched before the `try` began.
        document = get_public_report(rating_decision_obj)

        legal_name = rating_decision_obj.issuer.legal_name
        # Reduce the name to plain ASCII for use in a file name.
        legal_name = str(
            unicodedata.normalize('NFKD', legal_name).encode(
                'ASCII', 'ignore').decode('utf-8'))

        # This file is sent to the issuer.
        target_file = (legal_name + ' draft report' + ' (' +
                       datetime.datetime.today().strftime('%Y-%m-%d, %H%M') +
                       ").pdf")

        filepath = AWS_ANALYTICAL_MEDIA_LOCATION + '/' + str(document.upload)
        download_file(filepath, target_file)

        # Password protect file
        path, filename = os.path.split(target_file)
        output_file = os.path.join(path, "temp_" + filename)

        output = PyPDF2.PdfFileWriter()

        # Keep the source open only while it is needed; it must be closed
        # before the temporary file is moved over it.
        with open(target_file, "rb") as source_stream:
            input_stream = PyPDF2.PdfFileReader(source_stream)
            for i in range(input_stream.getNumPages()):
                output.addPage(input_stream.getPage(i))

            # NOTE(review): 5 hex characters is only 20 bits of entropy and
            # the owner password is a hard-coded constant -- consider
            # strengthening both.
            user_password = urandom(16).hex()[:5]

            # Set user and owner password to pdf file
            output.encrypt(user_pwd=user_password,
                           owner_pwd='owner_pass',
                           use_128bit=True)
            with open(output_file, "wb") as outputStream:
                output.write(outputStream)

        # Replace the original with the protected copy; os.replace also
        # overwrites an existing target on Windows.
        os.replace(output_file, target_file)

        attachments = [target_file]

        to_list = [
            row.insider.email
            for row in RatingDecisionInsiderLink.objects.filter(
                rating_decision=rating_decision_obj)
        ]

        primary_analyst = rating_decision_obj.issuer.analyst.primary_analyst

        # Send email with the protected report attached
        send_email.delay(
            header=ISSUER_HEADER,
            body=ISSUER_EMAIL % (primary_analyst.first_name),
            to=to_list,
            from_sender=None,
            cc=primary_analyst.email,
            attachments=attachments)

        # Send email with password
        # NOTE(review): this goes to the same recipients as the attachment
        # -- confirm that is intended.
        send_email.delay(
            header=ISSUER_EMAIL_HEADER_PASSWORD,
            body=ISSUER_EMAIL_BODY_PASSWORD.format(
                target_file, user_password, primary_analyst.first_name),
            to=to_list,
            from_sender=None,
            cc=primary_analyst.email,
        )

        # Delete file
        later = datetime.datetime.utcnow() + timedelta(minutes=10)
        files = [os.path.abspath(target_file)]
        delete_files_task.apply_async(files, eta=later)

    except AnalyticalDocument.DoesNotExist:
        # No public report uploaded: nothing to send.
        pass
import PyPDF2 with open('super.pdf', 'rb') as file: reader = PyPDF2.PdfFileReader(file) print(reader.numPages) with open('wtr.pdf', 'rb') as file_wtm: watermark = PyPDF2.PdfFileReader(file_wtm) lst = [] for page in range(0, reader.numPages): n_page = reader.getPage(page) n_page_watermark = watermark.getPage(0) n_page.mergePage(n_page_watermark) lst.append(n_page) pdf_writer = PyPDF2.PdfFileWriter() for i in lst: pdf_writer.addPage(i) with open('output.pdf', 'wb') as output_file: pdf_writer.write(output_file) #Or this way --> more elegant way :) template = PyPDF2.PdfFileReader(open('super.pdf', 'rb')) watermark2 = PyPDF2.PdfFileReader(open('wtr.pdf', 'rb')) output = PyPDF2.PdfFileWriter() for i in range(template.getNumPages()): page = template.getPage(i)
然后从已有的文档中拷贝内容到新文件中 有关PDF文件操作一般遵循以下步骤: 1.打开一个或者多个已有的PDF文件,得到PdfFileReader对象 2.创建一个新的PdfFileWriter对象 3.将页面从PdfFileReader对象拷贝到PdfFileWriter对象 4.最后利用PdfFileWriter对象的write()写入新创建的PDF文件 ''' minutesFile = open('meetingminutes.pdf', 'rb') pdfReader = PyPDF2.PdfFileReader(minutesFile) # 创建PdfFileReader对象 minutesFirstPage = pdfReader.getPage(0) pdfWatermarkReader = PyPDF2.PdfFileReader(open('watermark.pdf', 'rb')) # 创建PdfFileReader对象 minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0)) pdfWriter = PyPDF2.PdfFileWriter() # 创建PdfFileWriter对象 pdfWriter.addPage(minutesFirstPage) for pageNum in range(1, pdfReader.numPages): pageObj = pdfReader.getPage(pageNum) pdfWriter.addPage(pageObj) resultPdfFile = open('watermarkedCover.pdf', 'wb') pdfWriter.write(resultPdfFile) # 创建新的PDF文件,并将PdfFileWriter内容写进该文件 minutesFile.close() resultPdfFile.close()
import contextlib
import PyPDF2 as pdf
import os

SONGS_PATH = "C:/Users/tomas/zpevnik"

paths = sorted(os.listdir(SONGS_PATH))
paths.insert(39, paths.pop())  # hack: puts Čechomor in the right place in the alphabet

output = pdf.PdfFileWriter()

# Every source file has to stay open until the output is written, so the
# handles are collected in an ExitStack and closed together afterwards
# (previously they were never closed).
with contextlib.ExitStack() as stack:
    for i, path in enumerate(paths):
        print(f"Working on {i}: ", end=" ")

        source = stack.enter_context(
            open(os.path.join(SONGS_PATH, path), "rb"))
        reader = pdf.PdfFileReader(source)
        # Only the first page of each song file is used.
        output.addPage(reader.getPage(0))

        page_name = os.path.splitext(os.path.basename(path))[0]
        print(page_name)
        # Bookmark named after the file, pointing at output page i.
        output.addBookmark(page_name, i)

    with open("out.pdf", "wb") as f:
        output.write(f)
def extract_pages_from_pdf(
    self,
    source_path: str = None,
    output_path: str = None,
    pages: ListOrString = None,
) -> None:
    """Extract pages from source PDF and save to a new PDF document.

    Page numbers start from 1.

    If no source path given, assumes a PDF is already opened.

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            Extract Pages From PDF
            ...          source_path=/tmp/sample.pdf
            ...          output_path=/tmp/output.pdf
            ...          pages=5

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def example_keyword():
            pdf.extract_pages_from_pdf(
                source_path="/tmp/sample.pdf",
                output_path="/tmp/output.pdf",
                pages=5
            )

    :param source_path: filepath to the source pdf.
    :param output_path: filepath to the target pdf, stored by default
        in `output_directory`.
    :param pages: page numbers to extract from PDF (numbers start from 1);
        presumably all pages are extracted when None -- that is decided by
        `_get_page_numbers`.
    """
    self.switch_to_pdf(source_path)
    source = self.ctx.active_pdf_document.reader
    target = PyPDF2.PdfFileWriter()

    destination = self.default_output if not output_path else Path(output_path)

    for number in self._get_page_numbers(pages, source):
        # Convert the 1-based page number to PyPDF2's 0-based index.
        target.addPage(source.getPage(int(number) - 1))

    with open(str(destination), "wb") as stream:
        target.write(stream)
def make_writer(pdf):
    """Return a ``PdfFileWriter`` pre-filled with every page of *pdf*.

    The file is read fully into memory first. PyPDF2 resolves page content
    from the reader's stream when the writer is written, so a writer built
    on a file that has already been closed fails on ``write()`` with
    "seek of closed file".

    :param pdf: path of the PDF file to copy pages from.
    :return: writer holding all pages of the file, safe to write later.
    """
    import io

    with open(pdf, 'rb') as f:
        data = io.BytesIO(f.read())

    r = PyPDF2.PdfFileReader(data)
    w = PyPDF2.PdfFileWriter()
    w.appendPagesFromReader(r)
    return w
def add_watermark_image_to_pdf(
    self,
    image_path: str,
    output_path: str,
    source_path: str = None,
    coverage: float = 0.2,
) -> None:
    """Add image to PDF which can be new or existing PDF.

    If no source path given, assumes a PDF is already opened.

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            Add Watermark Image To PDF
            ...             image_path=approved.png
            ...             source_path=/tmp/sample.pdf
            ...             output_path=output/output.pdf

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def example_keyword():
            pdf.add_watermark_image_to_pdf(
                image_path="approved.png",
                source_path="/tmp/sample.pdf",
                output_path="output/output.pdf"
            )

    :param image_path: filepath to image file to add into PDF
    :param source_path: filepath to source, if not given add image to
        currently active PDF
    :param output_path: filepath of target PDF
    :param coverage: how the watermark image should be scaled on page,
        defaults to 0.2
    """
    self.switch_to_pdf(source_path)
    reader = self.ctx.active_pdf_document.reader
    writer = PyPDF2.PdfFileWriter()

    # The watermark size is derived from the first page's media box.
    mediabox = reader.getPage(0).mediaBox
    max_width = int(float(mediabox.getWidth()) * coverage)
    max_height = int(float(mediabox.getHeight()) * coverage)
    # Only the dimensions are needed; close the image file right after.
    with Image.open(image_path) as im:
        width, height = self.fit_dimensions_to_box(
            *im.size, max_width, max_height
        )

    pdf = FPDF()
    pdf.add_page()
    pdf.image(name=image_path, x=40, y=60, w=width, h=height)

    # A private scratch directory replaces the old fixed "<tmp>/temp.pdf",
    # which concurrent runs could overwrite. It is kept until the output has
    # been written in case page data is read lazily.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_pdf = os.path.join(scratch_dir, "watermark.pdf")
        pdf.output(name=temp_pdf)

        watermark = PyPDF2.PdfFileReader(temp_pdf).getPage(0)
        for n in range(reader.getNumPages()):
            page = reader.getPage(n)
            page.mergePage(watermark)
            writer.addPage(page)

        with open(output_path, "wb") as f:
            writer.write(f)
import io
import os
import PyPDF2
import glob

# todo learn gitIgnore
filePaths = glob.glob("Data/*.pdf")

# The shared password is stored in a file named "Password".
with open("Password", mode="r", encoding="utf-8") as f:
    password = f.read().replace("\n", "")

for path in filePaths:
    # Load the whole file into memory so the reader never depends on a file
    # that is about to be overwritten.
    with open(path, mode="rb") as src:
        pdf = PyPDF2.PdfFileReader(io.BytesIO(src.read()))

    if not pdf.isEncrypted:
        # decrypt() raises on files that are not encrypted; nothing to do.
        continue
    if not pdf.decrypt(password):
        # decrypt() returns 0 when the password does not match.
        print("Could not decrypt {}: wrong password".format(path))
        continue

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.cloneReaderDocumentRoot(pdf)

    # Write next to the original and swap it in only after a complete write,
    # so a failure cannot leave a truncated file behind.
    tmp_path = path + ".tmp"
    with open(tmp_path, mode="wb") as f:
        dst_pdf.write(f)
    os.replace(tmp_path, path)
# ch18_8.py
import PyPDF2, os, re

# Walk the folder tree and write a password-protected copy
# "<name>_encryt.pdf" next to every PDF found.
try:
    for dirName, sub_dirNames, fileNames in os.walk(
            'F:\\Book\\電腦書\\Python入門\\範例檔案\\ch18'):
        for fn in fileNames:
            stem, ext = os.path.splitext(fn)
            if ext.lower() != '.pdf':
                continue
            # os.walk yields bare file names; join with the directory so
            # files outside the current working directory can be opened.
            pdfFn = os.path.join(dirName, fn)
            with open(pdfFn, 'rb') as pdfObj:
                pdfRd = PyPDF2.PdfFileReader(pdfObj)
                pdfWr = PyPDF2.PdfFileWriter()  # new PDF object
                for pageNum in range(pdfRd.numPages):
                    pdfWr.addPage(pdfRd.getPage(pageNum))  # copy one page at a time
                pdfWr.encrypt('python')  # apply encryption
                encryptFn = os.path.join(dirName, stem + '_encryt.pdf')
                # Written while the source is still open, because page data
                # is read from the source stream at write time.
                with open(encryptFn, 'wb') as encryptPdf:
                    pdfWr.write(encryptPdf)
            print(fn + '寫入成功')
except FileNotFoundError as e:
    print(e)
finally:
    print('程式結束!!')
import PyPDF2
import sys
import os

# Usage: <script> template.pdf watermark.pdf
# Stamps the first page of the watermark file onto every page of the
# template and writes the result to 'watermarked_output2.pdf'.
if len(sys.argv) < 3:
    sys.exit('Usage: {} template.pdf watermark.pdf'.format(sys.argv[0]))

template = sys.argv[1]
watermark = sys.argv[2]

# Inputs stay open until the output is written (pages are read from the
# source streams at write time) and are closed afterwards, even on error.
with open(template, 'rb') as template_file, \
        open(watermark, 'rb') as watermark_file:
    template1 = PyPDF2.PdfFileReader(template_file)  # reader for the template
    watermark1 = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()  # writer for the stamped copy

    # getNumPages gives how many pages the template file has
    for i in range(template1.getNumPages()):
        page = template1.getPage(i)  # take one page at a time
        page.mergePage(watermark1.getPage(0))  # only the watermark's first page is used
        output.addPage(page)

    with open('watermarked_output2.pdf', 'wb') as file:
        output.write(file)
# --- Reading: print the page count and the text of every page ---
with open('file.pdf', 'rb') as pdfFile:  # must open in read binary mode
    reader = PyPDF2.PdfFileReader(pdfFile)
    print(reader.numPages)  # number of pages in file
    page = reader.getPage(0)  # gets specific page
    print(page.extractText())

    for pageNum in range(reader.numPages):
        print(reader.getPage(pageNum).extractText())

# --- Combining: pages can be added, removed and reordered, but text cannot
# be edited ---
# Inputs stay open until the writer is flushed, because page data is read
# from the source streams at write time.
with open('file1', 'rb') as pdf1, open('file2', 'rb') as pdf2:
    reader1 = PyPDF2.PdfFileReader(pdf1)
    reader2 = PyPDF2.PdfFileReader(pdf2)

    writer = PyPDF2.PdfFileWriter()  # blank pdf in memory

    for source_reader in (reader1, reader2):
        for pageNum in range(source_reader.numPages):
            writer.addPage(source_reader.getPage(pageNum))

    with open('combinedPdf.pdf', 'wb') as outputFile:
        writer.write(outputFile)
def exportPdf(fin, outdir, annotations, verbose):
    '''Export PDF with annotations.

    <fin>: string, absolute path to input PDF file.
    <outdir>: string, absolute path to the output directory.
    <annotations>: FileAnno obj.
    <verbose>: not used by this function.

    The annotated copy is written to <outdir>/<annotations.filename>,
    replacing any existing file of that name.

    Raises AssertionError if <annotations> has no linked file, IOError if
    <fin> cannot be opened, and Exception("Skip encrypt") for encrypted
    input.

    Update time: 2016-02-19 14:32:56.
    '''

    #---------------Skip unlinked files---------------
    # Raised explicitly (same exception type as the former `assert`) so the
    # check is not stripped under `python -O`.
    if not annotations.hasfile:
        raise AssertionError('no file of %s' % fin)

    try:
        pdf_in = open(fin, 'rb')
    except IOError:
        print('Could not find pdf file %s' % fin)
        # Previously execution fell through and died on an unbound `inpdf`;
        # re-raise the real cause instead.
        raise

    # The input must stay open until the output is written, because page
    # content is pulled from the source stream at write time.
    with pdf_in:
        inpdf = PyPDF2.PdfFileReader(pdf_in)
        if inpdf.isEncrypted:
            # PyPDF2 seems to think some files are encrypted even
            # if they are not. Trying to decrypt takes a lot of time and
            # this rarely happens to academic docs, so such files are
            # simply treated as a failure.
            raise Exception("Skip encrypt")

        # retain meta data (absent when the PDF has no Info dictionary)
        meta = inpdf.getDocumentInfo()
        outpdf = PyPDF2.PdfFileWriter()
        if meta is not None:
            outpdf.addMetadata(meta)

        #----------------Loop through pages----------------
        for pii in range(1, inpdf.getNumPages() + 1):

            inpg = inpdf.getPage(pii - 1)

            #----------------Process highlights----------------
            if pii in annotations.hlpages:
                for hjj in annotations.highlights[pii]:
                    # Changes suggested by matteosecli: add author of highlight:
                    anno = pdfannotation.createHighlight(hjj["rect"],
                                                         author=hjj['author'],
                                                         cdate=hjj["cdate"],
                                                         color=hjj['color'])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, anno)

            #------------------Process notes------------------
            if pii in annotations.ntpages:
                for njj in annotations.notes[pii]:
                    note = pdfannotation.createNote(njj["rect"],
                            contents=njj["content"], author=njj["author"],
                            cdate=njj["cdate"])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, note)

            outpdf.addPage(inpg)

        # this is a fix provided by rongmu regarding issue #22:
        # https://github.com/Xunius/Menotexport/issues/22
        # As I'm not giving it tests I'll put it inside a try
        # fix start ----------------------------------{{{
        # Copy the root (document catalog) except for /Pages
        # PDF Reference, Sixth Edition, version 1.7, p.137
        # https://www.adobe.com/devnet/pdf/pdf_reference_archive.html
        try:
            for k, v in inpdf.trailer["/Root"].items():
                if k.getObject() != "/Pages":
                    outpdf._root_object.update({k: v})
        except Exception:
            # Deliberately best-effort: the export continues without the
            # extra catalog entries.
            pass
        # fix end ----------------------------------}}}

        #-----------------------Save-----------------------
        filename = annotations.filename
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        abpath_out = os.path.join(outdir, filename)
        if os.path.isfile(abpath_out):
            os.remove(abpath_out)

        with open(abpath_out, mode='wb') as fout:
            outpdf.write(fout)

    return
pdf_path = "./PDF/"
#os.chdir(pdf_path)

# Get the name of the parent pdf file from the user.
# This file will be demerged into 2 files.
parent_pdf = getFileNameFromUser("Give parent PDF name with .pdf extn",
                                 pdf_path)

# The parent file stays open until both halves are written, because page
# data is read from the source stream at write time.
with open(os.path.join(pdf_path, parent_pdf), "rb") as pdfFileObj:
    # Pass the file object to the file reader
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # One writer per output half
    pdfWriter1 = PyPDF2.PdfFileWriter()
    pdfWriter2 = PyPDF2.PdfFileWriter()

    # Integer division: page indexes must be ints ("/" yields a float on
    # Python 3). With an odd page count the second half gets the extra page.
    half = pdfReader.numPages // 2
    addPageToWriter(pdfReader, pdfWriter1, 0, half)
    addPageToWriter(pdfReader, pdfWriter2, half, pdfReader.numPages)

    # The user is asked for the two output names in order.
    with open(os.path.join(pdf_path, getFinalPdfNameFromUser()),
              "wb") as pdfOutputFileObj1, \
            open(os.path.join(pdf_path, getFinalPdfNameFromUser()),
                 "wb") as pdfOutputFileObj2:
        print(".. Demerging parent pdf file....")
        pdfWriter1.write(pdfOutputFileObj1)
        pdfWriter2.write(pdfOutputFileObj2)

print("... done...")
import PyPDF2 as p

# Stamp the first page of 'pdf3.pdf' onto every page of 'merged_pdf.pdf'
# and save the result as 'watermarkedPdf.pdf'.
# Inputs stay open until the output is written (pages are read from the
# source streams at write time) and are closed afterwards, even on error.
with open('merged_pdf.pdf', 'rb') as template_file, \
        open('pdf3.pdf', 'rb') as water_file:
    template = p.PdfFileReader(template_file)
    water = p.PdfFileReader(water_file)
    output = p.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(water.getPage(0))
        output.addPage(page)

    with open('watermarkedPdf.pdf', 'wb') as file:
        output.write(file)
fileList.append(pdfFileObj) # Create PdfFileReader objects from PDF file list readerList = [] for pdfFile in fileList: pdfReader = PyPDF2.PdfFileReader(pdfFile) readerList.append(pdfReader) totalPages = 0 for pr in readerList: print(pr.numPages) totalPages += pr.numPages print("total pages:", totalPages) # combine read PDF files pdfWriter = PyPDF2.PdfFileWriter() for pr in readerList: for pageNum in range(pr.numPages): pageObj = pr.getPage(pageNum) pdfWriter.addPage(pageObj) # write to a new PDF file if os.path.isfile(mergedFile + ".pdf"): print('File already exists, specify another filename') else: pdfOutputFile = open(mergedFile + ".pdf", 'wb') pdfWriter.write(pdfOutputFile) pdfOutputFile.close() # Close all read files for pdfFile in fileList:
import PyPDF2 pdf1 = open('JadaDixon_rates_of_reaction_lab.pdf', 'rb') pdf2 = open('JadaDixon_reaction_rate_p_and_d_.pdf', 'rb') pdf3 = open('JadaDixon_Redox_Lab_Template.pdf', 'rb') pdf4 = open('JadaDixonchemistry.pdf', 'rb') pdf5 = open('jadadixonenergeticslab.pdf', 'rb') pdf6 = open('JadaDixonSBaLab2.pdf', 'rb') pdf7 = open('Chemistry_P_and_D__Jada_Dixon.pdf', 'rb') pdf1reader = PyPDF2.PdfFileReader(pdf1) pdf2reader = PyPDF2.PdfFileReader(pdf2) pdf3reader = PyPDF2.PdfFileReader(pdf3) pdf4reader = PyPDF2.PdfFileReader(pdf4) pdf5reader = PyPDF2.PdfFileReader(pdf5) pdf6reader = PyPDF2.PdfFileReader(pdf6) pdf7reader = PyPDF2.PdfFileReader(pdf7) pdfwrite = PyPDF2.PdfFileWriter() for i in range(pdf1reader.numPages): pageO = pdf1reader.getPage(i) pdfWRITE