def split(path, name_of_split):
    """Write every page of a PDF to its own single-page file.

    Page ``i`` (zero-based) is saved as ``f'{name_of_split}{i}.pdf'``.

    :param path: location of the PDF to take apart.
    :param name_of_split: prefix used for each generated file name.
    """
    page_total = PdfFileReader(path).getNumPages()
    for index in range(page_total):
        # A fresh reader is built for every page on purpose: the original
        # author notes that reusing a single reader caused a crash.
        source = PdfFileReader(path)
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(index))

        target = f'{name_of_split}{index}.pdf'
        with open(target, 'wb') as sink:
            single_page.write(sink)
def pdf_add_metadata(path, file, key, value, out_file, out_path=''):
    """Copy a PDF and add one custom metadata entry to the copy.

    The existing document information of the source is printed before the
    copy is written.

    :param path: directory that contains the source PDF.
    :param file: file name of the source PDF inside ``path``.
    :param key: metadata key, without the leading ``/`` (it is added here).
    :param value: metadata value stored under ``key``.
    :param out_file: file name of the PDF to create.
    :param out_path: directory for ``out_file``; falls back to ``path``
        when empty.
    """
    if not out_path:
        out_path = path
    with open(os.path.join(path, file), 'rb') as pdf:
        try:
            pdf_reader = PdfFileReader(pdf)
            metadata = pdf_reader.getDocumentInfo()
            print(metadata)

            pdf_writer = PdfFileWriter()
            pdf_writer.appendPagesFromReader(pdf_reader)
            pdf_writer.addMetadata({'/' + key: value})

            # Context manager so the output handle is released even when
            # writing fails part-way through.
            with open(os.path.join(out_path, out_file), 'wb') as file_out:
                pdf_writer.write(file_out)
        except ValueError as err:
            # Report the actual error (the previous code printed the
            # ValueError class itself, which carried no information).
            print(err, 'rrrt')
def pdf_add_Stamp(path, file, stamp):
    """Stamp the first page of a PDF and save it as ``new_w_stamp.pdf``.

    Only the first page of ``file`` is kept: the first page of ``stamp`` is
    merged onto it, a fixed ``/NumeroCadastre`` metadata entry is added and
    the result is written to ``new_w_stamp.pdf`` inside ``path``.

    :param path: directory holding both input files and the output file.
    :param file: file name of the PDF to stamp.
    :param stamp: file name of the PDF whose first page is the stamp.
    """
    with open(os.path.join(path, file), 'rb') as pdf, \
            open(os.path.join(path, stamp), 'rb') as file_stamp:
        pdf_reader = PdfFileReader(pdf)
        watermark = PdfFileReader(file_stamp)

        first_page = pdf_reader.getPage(0)
        first_page.mergePage(watermark.getPage(0))

        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(first_page)
        pdf_writer.addMetadata({'/NumeroCadastre': 'LaTeteAToto'})

        # Write while both inputs are still open; the readers resolve page
        # content lazily from their streams.
        with open(os.path.join(path, 'new_w_stamp.pdf'), 'wb') as file_out:
            pdf_writer.write(file_out)
    # No explicit close on the watermark reader: PdfFileReader exposes no
    # close() method, and the underlying streams are closed by the ``with``.
def join(paths, output):
    """Concatenate any number of PDFs into one document.

    The PDFs named in ``paths`` are appended in the order given.

    :param paths: iterable of input PDFs, in output order.
    :param output: destination handed unchanged to ``PdfFileWriter.write``.
    """
    merged = PdfFileWriter()
    for source_path in paths:
        source = PdfFileReader(source_path)
        page_total = source.getNumPages()
        for number in range(page_total):
            merged.addPage(source.getPage(number))
    merged.write(output)
def write_pdf_subset(input_path, output_path, page_start, page_end):
    """Copy a contiguous range of pages from one PDF into a new PDF.

    :param input_path: PDF to read pages from.
    :param output_path: path of the PDF to create.
    :param page_start: zero-based index of the first page to copy.
    :param page_end: zero-based index one past the last page to copy.
    :raises IndexError: if the input has fewer than ``page_end`` pages.
    """
    pdf_reader = PdfFileReader(input_path)
    if pdf_reader.getNumPages() < page_end:
        # The exception used to be constructed but never raised. IndexError
        # matches what an out-of-range page lookup would produce later on.
        raise IndexError("too few pages")

    pdf_writer = PdfFileWriter()
    for page in range(page_start, page_end):
        pdf_writer.addPage(pdf_reader.getPage(page))

    with open(output_path, 'wb') as out:
        pdf_writer.write(out)
def create_watermark(input_pdf, output, watermark):
    """Merge the first page of ``watermark`` onto every page of a PDF.

    :param input_pdf: PDF whose pages receive the watermark.
    :param output: path of the watermarked PDF to create.
    :param watermark: PDF whose first page is used as the watermark.
    """
    stamp_reader = PdfFileReader(watermark)
    stamp = stamp_reader.getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    # Every page is watermarked (the original author planned to restrict
    # this to a single page at some point).
    for number in range(source.getNumPages()):
        current = source.getPage(number)
        current.mergePage(stamp)
        result.addPage(current)

    with open(output, 'wb') as sink:
        result.write(sink)
def _download(self, url, path):
    """Fetch a document page by page and save it as one PDF.

    The document code is the last ``/``-separated segment of ``url``. Pages
    are requested one at a time, numbered from 1, and appended in order.

    :param url: address of the document; its last segment is the code.
    :param path: file path the assembled PDF is written to.
    :return: ``path``.
    """
    code = url.rsplit("/", 1)[-1]
    page_size = self._get_pdf_page_size(url)

    merged = PdfFileWriter()
    for number in trange(1, page_size + 1):
        single_page_io = self._get_one_page_pdf(code, number)
        single_page_reader = PdfFileReader(single_page_io)
        merged.addPage(single_page_reader.getPage(0))

    with open(path, "wb") as sink:
        merged.write(sink)
    return path
def Split_pdf(path, file, Out_Path):
    """Split a PDF into one file per page and log each file written.

    Page ``n`` (one-based) is saved in ``Out_Path`` as
    ``<stem>-F<nnn>.pdf``, where ``<stem>`` is ``file`` without its
    extension and ``<nnn>`` is the page number zero-padded to three digits.

    :param path: directory containing the source PDF.
    :param file: file name of the source PDF.
    :param Out_Path: directory that receives the single-page PDFs.
    """
    log_file = 'PDF_utils_log'
    source = os.path.join(path, file)
    stem = os.path.splitext(file)[0]

    pdf = PdfFileReader(source)
    for frame in range(pdf.getNumPages()):
        # The reader is rebuilt for every page, as in the original code.
        pdf = PdfFileReader(source)
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(frame))

        label = format(frame + 1, '03d')
        Out_filename = stem + "-F" + label + ".pdf"
        # The ``with`` block closes the file; the former extra out.close()
        # was redundant and could hit an unbound name on a zero-page input.
        with open(os.path.join(Out_Path, Out_filename), 'wb') as out:
            pdf_writer.write(out)
        log_info(
            "File " + file + " frame " + label + " successfully splitted",
            log_file)
def put_watermark(input_pdf, output_pdf, watermark):
    """Overlay the first page of a watermark PDF on every page of a PDF.

    :param input_pdf: PDF whose pages are watermarked.
    :param output_pdf: path of the watermarked PDF to create.
    :param watermark: PDF whose first page is the overlay.
    """
    # NOTE: an earlier, disabled variant also accepted a logo image and drew
    # the watermark file itself (image plus caption) before merging.
    stamp_reader = PdfFileReader(watermark)
    stamp = stamp_reader.getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    for number in range(source.getNumPages()):
        current = source.getPage(number)
        # The watermark is drawn on top of the current page's content.
        current.mergePage(stamp)
        result.addPage(current)

    with open(output_pdf, 'wb') as sink:
        result.write(sink)
def merge_pdfs(file_path, pdf_writer=None, page=None):
    """
    Append pages of one PDF file to a (possibly new) PdfFileWriter.

    The function does not save anything. To store the returned writer as a
    PDF file, use:
    ```
    with open('save/path/here.pdf', 'wb') as out:
        pdf_writer.write(out)
    ```

    @param file_path: path of the PDF file to take pages from.
    @param pdf_writer: PdfFileWriter to extend. If None, a new one is made.
    @param page: int or iterable of ints selecting page numbers. If None,
                 every page of 'file_path' is added in order. For an
                 iterable, pages are added in the iterable's order.
    @return: the PdfFileWriter with the new pages added.
    """
    source = PdfFileReader(str(file_path))
    writer = PdfFileWriter() if pdf_writer is None else pdf_writer

    if page is None:
        for number in range(source.getNumPages()):
            writer.addPage(source.getPage(number))
        return writer

    try:
        for number in page:
            writer.addPage(source.getPage(number))
    except TypeError:
        # 'page' was not iterable: treat it as a single page number.
        writer.addPage(source.getPage(page))
    return writer
def readPdf_test_read_write():
    """Copy the fourth page of a fixed sample PDF into a new PDF.

    Both paths are hard-coded: the page is read from the sample scan and
    written to ``d:/data/pdf-scan/n.pdf``.
    """
    # Context managers make sure both files are closed, and the output is
    # flushed to disk, even if reading or writing fails.
    with open(r'd:/data/pdf-scan/普通生物学(清晰PDF版).pdf', 'rb') as source:
        pdf_input = PdfFileReader(source)
        # Index 3 is the fourth page.
        page = pdf_input.getPage(3)

        pdf_output = PdfFileWriter()
        pdf_output.addPage(page)

        with open(r'd:/data/pdf-scan/n.pdf', 'wb') as target:
            pdf_output.write(target)
def frasers2up(inputPdfFileWriter):
    """Combine consecutive page pairs into side-by-side ("2-up") pages.

    Pages are taken two at a time (0 and 1, 2 and 3, ...). For each pair a
    blank page sized from the left page's mediaBox upper-right corner is
    created, the right page is merged onto it shifted horizontally by the
    left page's width value, and the left page is merged on top.

    When the page count is odd, the final unpaired page is not included.

    :param inputPdfFileWriter: object providing getPage() and getNumPages().
    :return: a new ``Pdfwrite`` holding the combined pages.
    """
    edited_file = Pdfwrite()
    # Scratch document used only as a factory for blank pages.
    blank_file = Pdfwrite()
    # Reads page 0 before the loop; these values are overwritten on the
    # first iteration.
    leftpage = inputPdfFileWriter.getPage(0)
    leftx = leftpage.mediaBox.upperRight[0]
    lefty = leftpage.mediaBox.upperRight[1]
    # Stop at count - 1 with step 2 so that page + 1 always exists.
    for page in range(0, inputPdfFileWriter.getNumPages() - 1, 2):
        leftpage = inputPdfFileWriter.getPage(page)
        rightpage = inputPdfFileWriter.getPage(page + 1)
        leftx = leftpage.mediaBox.upperRight[0]
        lefty = leftpage.mediaBox.upperRight[1]
        # Insert the new blank page at index 0 so that getPage(0) below
        # returns exactly the page that was just created.
        blank_file.insertBlankPage(leftx, lefty, 0)
        blank_page = blank_file.getPage(0)
        # Last argument is presumably the "expand" flag, letting the page
        # grow to hold the shifted right page -- TODO confirm against the
        # library signature.
        blank_page.mergeTranslatedPage(rightpage, leftx, 0, 1)
        blank_page.mergePage(leftpage)
        edited_file.addPage(blank_page)
    return edited_file
def split_pdf(self, path: str, name_of_split: str):
    """Split a PDF into one single-page file per page.

    Page ``i`` (zero-based) is written to
    ``self.processed + name_of_split + str(i) + ".pdf"``.

    :param path: location of the source PDF.
    :param name_of_split: base name of the generated files, without suffix.
    :return: a fixed completion message.
    """
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        # A new writer per page: sharing one writer across iterations made
        # each output accumulate all earlier pages instead of holding just
        # its own page.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output = self.processed + name_of_split + str(page) + ".pdf"
        with open(output, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)
    return "切分完成!"
def SaveButton(self, event):
    """Handle the save button: append blank pages and save the result.

    Shows a save dialog, copies every page of ``self.input_path`` into a new
    PDF, appends the number of blank pages entered in ``self.no_pages`` and
    writes the file to the chosen location. An error message is shown when
    no input PDF is selected or the page count is not an integer; in both
    cases nothing is written.

    :param event: the triggering wx event (unused).
    """
    input_filename = Path(self.input_file)
    dlg = wx.FileDialog(
        self,
        message="Save file as...",
        defaultDir=os.getcwd(),
        defaultFile=(f"{input_filename.with_suffix('')}+"
                     f"{self.no_pages.GetValue()}"),
        wildcard=pdfs,
        style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT)

    if not self.input_path:
        self.error_message("Please select a pdf")
    elif dlg.ShowModal() == wx.ID_OK:
        try:
            blank_pages = int(self.no_pages.GetValue())
        except ValueError:
            # Stop here: previously execution continued and failed on the
            # unbound ``blank_pages`` right after showing this message.
            self.error_message(
                "The number of pages added must be an integer")
        else:
            output_path = dlg.GetPath()
            pdf_writer = PdfFileWriter()
            pdf_reader = PdfFileReader(self.input_path)
            pdf_writer.appendPagesFromReader(pdf_reader)
            for _ in range(blank_pages):
                pdf_writer.addBlankPage()
            with open(output_path, 'wb') as output:
                pdf_writer.write(output)

            success_dlg = wx.MessageDialog(
                self,
                f"""You created {dlg.GetFilename()} and saved it at {dlg.GetPath()}.""",
                "Success!",
                wx.OK | wx.ICON_INFORMATION)
            success_dlg.ShowModal()
            success_dlg.Destroy()

    dlg.Destroy()
    self.clear_func()
def pdf_generate(inpath, generate_list, outpath):
    """Concatenate a list of PDFs into a single output file.

    :param inpath: path used only to derive the default output name.
    :param generate_list: PDFs to append, in order.
    :param outpath: destination path; when empty, ``<name>_output.pdf`` is
        used, where ``<name>`` is the base name of ``inpath`` without its
        extension.
    """
    base_name, _ = os.path.splitext(os.path.basename(inpath))
    if not outpath:
        outpath = f'{base_name}_output.pdf'

    combined = PdfFileWriter()
    with open(outpath, 'wb') as sink:
        for source_path in generate_list:
            source = PdfFileReader(source_path)
            for number in range(source.getNumPages()):
                combined.addPage(source.getPage(number))
        combined.write(sink)
    print(f'Created: {outpath}')
def split(input_path, page_ranges, output_name):
    """Extract page ranges from a PDF, one output file per range.

    Each entry of ``page_ranges`` is converted by ``format_range`` into a
    ``(start, stop)`` pair; pages ``start`` up to but excluding ``stop`` are
    copied into ``'<output_name>_p<page_range>.pdf'``.

    :param input_path: PDF to extract pages from.
    :param page_ranges: iterable of range specifications.
    :param output_name: prefix for the generated file names.
    """
    for page_range in page_ranges:
        source = PdfFileReader(input_path)
        extract = PdfFileWriter()

        first, last = format_range(page_range)
        for number in range(first, last):
            extract.addPage(source.getPage(number))

        target = f"{output_name}_p{page_range}.pdf"
        with open(target, 'wb') as sink:
            extract.write(sink)
def cut_left(file_path, output_file, points=66):
    """Trim a strip of ``points`` units off every page of a PDF.

    For pages wider than tall, the left edge of the mediaBox is moved right
    by ``points``. For all other pages, the top edge of the mediaBox is
    moved down by ``points``. The art, bleed and crop boxes are then set to
    the adjusted mediaBox.

    :param file_path: PDF to read (converted with ``str()``).
    :param output_file: path of the trimmed PDF to create.
    :param points: size of the strip to remove; presumably PDF points --
        TODO confirm.
    """
    with open(str(file_path), 'rb') as pfl:
        in_pdf = PdfFileReader(pfl)
        out_pdf = PdfFileWriter()
        n_pages = in_pdf.getNumPages()
        for i in trange(n_pages):
            page = in_pdf.getPage(i)
            p_width = page.mediaBox.getWidth()
            p_height = page.mediaBox.getHeight()
            if p_width > p_height:
                # Wide page: shift both left-hand corners to the right.
                lower_left = page.mediaBox.getLowerLeft()
                lower_left = (lower_left[0]+points, lower_left[1])
                upper_left = page.mediaBox.getUpperLeft()
                upper_left = (upper_left[0]+points, upper_left[1])
                page.mediaBox.lowerLeft = lower_left
                page.mediaBox.upperLeft = upper_left
            else:
                # Other pages: lower both top corners instead.
                upper_left = page.mediaBox.getUpperLeft()
                upper_left = (upper_left[0], upper_left[1] - points)
                upper_right = page.mediaBox.getUpperRight()
                upper_right = (upper_right[0], upper_right[1] - points)
                page.mediaBox.upperLeft = upper_left
                page.mediaBox.upperRight = upper_right
            # Keep the remaining boxes in step with the adjusted mediaBox.
            page.artBox = page.mediaBox
            page.bleedBox = page.mediaBox
            page.cropBox = page.mediaBox
            out_pdf.addPage(page)
        print("Writing file ...")
        # Written while the input file is still open.
        with open(output_file, 'wb') as outfl:
            out_pdf.write(outfl)
def rotate_pdf(original_filename, new_filename, rotation):
    """Rotate every page of a PDF clockwise and save the result.

    :param original_filename: path of the PDF to rotate.
    :param new_filename: path of the rotated PDF to create.
    :param rotation: clockwise angle passed to each page's
        ``rotateClockwise``.
    """
    with open(original_filename, 'rb') as source_file:
        source = PdfFileReader(source_file)
        rotated = PdfFileWriter()

        for number in range(source.numPages):
            turned = source.getPage(number)
            turned.rotateClockwise(rotation)
            rotated.addPage(turned)

        # Written while the source file is still open.
        with open(new_filename, 'wb') as target_file:
            rotated.write(target_file)
def watermark(input, watermark, output, verbosity, **kwargs):
    """Use the first page of one PDF as a watermark on another document.

    Example:
        pdfcli watermark wm.pdf source.pdf -o final.pdf

    :param input: PDF whose pages receive the watermark.
    :param watermark: PDF whose first page is the watermark.
    :param output: destination handed unchanged to ``PdfFileWriter.write``.
    :param verbosity: accepted but not used here.
    :param kwargs: accepted but not used here.
    """
    stamp_reader = PdfFileReader(watermark)
    stamp = stamp_reader.getPage(0)

    source = PdfFileReader(input)
    result = PdfFileWriter()
    for number in range(source.getNumPages()):
        current = source.getPage(number)
        current.mergePage(stamp)
        result.addPage(current)
    result.write(output)
def paginate_pdf(pdf_name, number_page, pagination_template):
    """Create a rotated copy of a manual with a pagination footer.

    For each of the first ``number_page`` pages, a blank page with swapped
    width and height is created, the matching page of the pagination
    template is merged onto it, and the manual page is merged on top rotated
    by -90 degrees. The result is written to ``./paginatedPDFs/<pdf_name>``.

    :param pdf_name: path of the manual; also used as the output file name.
    :param number_page: number of pages to process, starting at page 0.
    :param pagination_template: PDF providing one footer page per page.
    """
    writer = PdfFileWriter()
    pdf_out = "./paginatedPDFs/" + pdf_name

    # Context managers close both inputs even when a page lookup or the
    # write fails; they used to be closed only on the success path.
    with open(pdf_name, "rb") as stream_manuals, \
            open(pagination_template, "rb") as stream_pagination:
        manuals = PdfFileReader(stream_manuals)
        pagination = PdfFileReader(stream_pagination)

        for i in range(number_page):
            manuals_page = manuals.getPage(i)
            pagination_page = pagination.getPage(i)

            # Blank page with the manual page's dimensions swapped.
            translated_page = PageObject.createBlankPage(
                None,
                width=manuals_page.mediaBox.getHeight(),
                height=manuals_page.mediaBox.getWidth(),
            )
            translated_page.mergePage(pagination_page)
            # NOTE(review): ty is hard-coded to 792, which equals the new
            # page height only for manuals whose page width is 792.
            translated_page.mergeRotatedScaledTranslatedPage(
                manuals_page, rotation=-90, scale=1, tx=0, ty=792,
                expand=True)
            writer.addPage(translated_page)

        with open(pdf_out, "wb") as out_stream:
            writer.write(out_stream)

    print(f"{pdf_out} copied to paginatedPDFs\n")
def merge_doc(self):
    """Merge each cover with its document and save the combined PDF.

    ``self.file_lists`` is rebuilt as triples from ``self.doc_code``,
    ``self.name_list`` and ``self.final_names``. For every triple the first
    two entries are read as PDFs and concatenated, and the result is written
    to the path given by the third entry.
    """
    self.file_lists = list(
        zip(self.doc_code, self.name_list, self.final_names))
    for pdfnames in self.file_lists:
        output = PdfFileWriter()
        # Inputs must stay open until the merged file is written, then all
        # of them are closed (they used to be left open).
        sources = []
        try:
            for pdfname in pdfnames[0:2]:
                stream = open(pdfname, "rb")
                sources.append(stream)
                reader = PdfFileReader(stream, strict=False)
                for iPage in range(reader.getNumPages()):
                    output.addPage(reader.getPage(iPage))
            pdfoutname = str(pdfnames[2])
            with open(pdfoutname, "wb") as outputStream:
                output.write(outputStream)
        finally:
            for stream in sources:
                stream.close()
        print("文件合并完成:", pdfoutname)
    print("文件合并完成!")
    print("=><=" * 25)
def mergePdf(inFileList, outFile):
    """Merge several PDF documents into one.

    :param inFileList: list of paths of the documents to merge, in order.
    :param outFile: path of the merged output file.
    :return: None
    """
    pdfFileWriter = PdfFileWriter()
    # Inputs must stay open until the merged file is written, then all of
    # them are closed (neither inputs nor output used to be closed).
    sources = []
    try:
        for inFile in inFileList:
            stream = open(inFile, 'rb')
            sources.append(stream)
            pdfReader = PdfFileReader(stream)
            for index in range(pdfReader.getNumPages()):
                pdfFileWriter.addPage(pdfReader.getPage(index))
        # Finally, write everything to the output file in one go.
        with open(outFile, 'wb') as target:
            pdfFileWriter.write(target)
    finally:
        for stream in sources:
            stream.close()
def put_watermark(input_pdf, output_pdf, watermark):
    """Apply the first page of a watermark PDF to every page of a PDF.

    :param input_pdf: PDF on which the watermark is placed.
    :param output_pdf: path of the watermarked PDF to create.
    :param watermark: PDF whose first page serves as the watermark.
    """
    overlay_source = PdfFileReader(watermark)
    overlay = overlay_source.getPage(0)

    document = PdfFileReader(input_pdf)
    stamped = PdfFileWriter()

    page_total = document.getNumPages()
    for position in range(page_total):
        sheet = document.getPage(position)
        sheet.mergePage(overlay)
        stamped.addPage(sheet)

    with open(output_pdf, 'wb') as destination:
        stamped.write(destination)
def merge_pdfs(paths, output):
    """Concatenate PDFs into one file, optionally rescaling 'A4' entries.

    :param paths: iterable of pairs; element 0 is the PDF path and element 1
        a size label that is compared against ``'A4'``.
    :param output: path of the merged PDF to create.
    """
    pdf_writer = PdfFileWriter()
    for path in paths:
        print(f'path {path[1]} {path[0]}')
        pdf_reader = PdfFileReader(path[0])
        for idx in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(idx)
            # USE_SCALING is a module-level switch defined elsewhere.
            # NOTE(review): the source's indentation was lost; the ``else``
            # is attached to the inner ``if`` here -- confirm.
            if USE_SCALING:
                if path[1] == 'A4':
                    print('scaling...')
                    # Target size 812 x 595; units presumably PDF points --
                    # TODO confirm.
                    page.scaleTo(812, 595)
                else:
                    print('merge as is...')
            pdf_writer.addPage(page)
    with open(output, 'wb') as out:
        pdf_writer.write(out)
def parse(pdf_file):
    """Collect the pages of a PDF that mention "微信" into a new PDF.

    Each page is laid out with pdfminer; pages containing at least one text
    element matching the pattern are copied, in order, to a hard-coded
    ``final.pdf`` on the desktop.

    :param pdf_file: path of the PDF to scan.
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction.
    """
    # Context manager so the input is closed on every path, including the
    # not-extractable exception below (it used to leak there).
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            print('nono')
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pattern = re.compile("微信")
        pageindex = []
        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            # Record each page once, however many elements match; appending
            # per element duplicated pages in the output.
            if any(isinstance(x, LTText) and pattern.search(x.get_text())
                   for x in layout):
                pageindex.append(i)

        pdf_output = PdfFileWriter()
        pdf_input = PdfFileReader(fp)
        for j in pageindex:
            pdf_output.addPage(pdf_input.getPage(j))

        final_path = os.path.join(r"C:\Users\big\Desktop\final.pdf")
        with open(final_path, "wb") as f:
            pdf_output.write(f)
def merge_pdf(self, file_list, outpdf):
    """Merge several PDF files into one.

    :param file_list: PDFs to append, in order.
    :param outpdf: name of the output PDF without a directory, e.g.
        ``merge_res.pdf``; it is appended to ``self.processed``.
    :return: a fixed completion message.
    """
    merged = PdfFileWriter()
    for source_path in file_list:
        source = PdfFileReader(source_path)
        page_total = source.getNumPages()
        for number in range(page_total):
            merged.addPage(source.getPage(number))

    target = self.processed + outpdf
    with open(target, "wb") as sink:
        merged.write(sink)
    return "合并完成!"
def merge_pdfs(paths, output):
    """Combine individually downloaded dashboard files into one report.

    Pages of entries labelled ``'A4'`` are rescaled when the module-level
    ``USE_SCALING`` switch is on.

    :param paths: iterable of pairs; element 0 is the PDF path and element 1
        a size label that is compared against ``'A4'``.
    :param output: path of the merged PDF to create.
    """
    pdf_writer = PdfFileWriter()
    for path in paths:
        file_log.debug(f'path {path[1]} {path[0]}')
        pdf_reader = PdfFileReader(path[0])
        for idx in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(idx)
            # NOTE(review): the source's indentation was lost; the ``else``
            # is attached to the inner ``if`` here -- confirm.
            if USE_SCALING:
                if path[1] == 'A4':
                    file_log.debug('scaling...')
                    # Target size 812 x 595; units presumably PDF points --
                    # TODO confirm.
                    page.scaleTo(812, 595)
                else:
                    file_log.debug('merge as is...')
            pdf_writer.addPage(page)
    with open(output, 'wb') as out:
        pdf_writer.write(out)
def MergePDF(filepath):
    """Merge groups of PDFs found under ``<filepath>\\input``.

    ``getFileName`` supplies groups of PDF paths. The files of each group
    are concatenated and written to the path of the group's first file with
    ``"input"`` replaced by ``"output"``. A message box is shown when all
    groups are done. On any error the error is printed and the process
    exits.

    :param filepath: base directory containing the ``input`` folder.
    """
    try:
        in_file_path = filepath + r"\input\\"
        pdf_fileName = getFileName(in_file_path)
        for pdfnames in pdf_fileName:
            output = PdfFileWriter()
            # Inputs must stay open until the merged file is written, then
            # all of them are closed (they used to be left open).
            sources = []
            try:
                for pdfname in pdfnames:
                    stream = open(pdfname, "rb")
                    sources.append(stream)
                    reader = PdfFileReader(stream)
                    for iPage in range(reader.getNumPages()):
                        output.addPage(reader.getPage(iPage))
                pdfoutname = str(pdfnames[0]).replace("input", "output")
                with open(pdfoutname, "wb") as outputStream:
                    output.write(outputStream)
            finally:
                for stream in sources:
                    stream.close()
        messagebox.showinfo("Complete!", "Complete!")
    except Exception as err:
        # Top-level boundary of the tool: report and terminate.
        print("Something went wrong")
        print(err)
        sys.exit()
def create_watermark(input_pdf, output, watermark):
    """
    Add a watermark to a PDF.

    :param input_pdf: path of the PDF file to watermark
    :param output: path where the watermarked PDF is saved
    :param watermark: PDF containing the watermark image or text
    :return: None
    """
    mark_reader = PdfFileReader(watermark)
    mark = mark_reader.getPage(0)

    original = PdfFileReader(input_pdf)
    marked = PdfFileWriter()

    # Every page receives the watermark.
    for position in range(original.getNumPages()):
        sheet = original.getPage(position)
        sheet.mergePage(mark)
        marked.addPage(sheet)

    with open(output, 'wb') as destination:
        marked.write(destination)
def pdf_to_pdfs(pdf_folder, page_count):
    """
    Combine single-page PDF files into one document.

    The files ``<pdf_folder>/0.pdf`` up to ``<pdf_folder>/<page_count-1>.pdf``
    are read in order; the first page of each is appended and the result is
    saved as ``<pdf_folder>/combine.pdf``.

    :param pdf_folder: directory holding the numbered single-page PDFs
    :param page_count: number of single-page files to combine
    :return: None
    """
    pdf_writer = PdfFileWriter()
    pdf_file_path = '{}/{}.pdf'.format(pdf_folder, 'combine')

    for page_index in range(page_count):
        page_pdf_file = '{}/{}.pdf'.format(pdf_folder, page_index)
        reader = PdfFileReader(page_pdf_file, strict=False)
        pdf_writer.addPage(reader.getPage(0))
        # The former ``page_index += 1`` had no effect: the loop rebinds
        # the variable on every iteration.

    # Context manager so the combined file is flushed and closed.
    with open(pdf_file_path, 'wb') as combined:
        pdf_writer.write(combined)