def split(input_path, page_ranges, output_name):
    """Write each requested page range of a PDF to its own file.

    Every entry of ``page_ranges`` (a single page number or a
    hyphen-separated range) is turned into a ``(start, stop)`` pair by
    ``format_range``; pages ``start`` up to, but not including, ``stop`` are
    copied into ``'<output_name>_p<page_range>.pdf'``.

    Args:
        input_path: Path of the source PDF.
        page_ranges: Iterable of page-range specifications.
        output_name: Prefix used for every generated file name.
    """
    for spec in page_ranges:
        # The source is re-read for every range, so each output file is built
        # from its own reader.
        source = PdfFileReader(input_path)
        writer = PdfFileWriter()
        first, last = format_range(spec)
        page_no = first
        while page_no < last:
            writer.addPage(source.getPage(page_no))
            page_no += 1
        target = f"{output_name}_p{spec}.pdf"
        with open(target, 'wb') as handle:
            writer.write(handle)
def split_pdf(self, path: str, name_of_split: str) -> str:
    """Split a PDF into one single-page file per page.

    Page ``i`` (0-based) is written to
    ``self.processed + name_of_split + str(i) + ".pdf"``.

    Args:
        path: Path of the source PDF.
        name_of_split: Base name of the generated files, without extension.

    Returns:
        The status message ``"切分完成!"``.
    """
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        # A fresh writer per page: reusing one writer for the whole loop made
        # every output file also contain all of the preceding pages.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output = self.processed + name_of_split + str(page) + ".pdf"
        with open(output, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)
    return "切分完成!"
def addBlankpage():
    """Copy ``./01.pdf`` to ``./01_new.pdf`` with a blank page appended."""
    readFile = './01.pdf'
    outFile = './01_new.pdf'
    pdfFileWriter = PdfFileWriter()
    # A path works here as well as an open binary file object.
    pdfFileReader = PdfFileReader(readFile)
    for index in range(pdfFileReader.getNumPages()):
        pdfFileWriter.addPage(pdfFileReader.getPage(index))
    # Append an empty page after the last copied page.
    pdfFileWriter.addBlankPage()
    # Write once, through a context manager: the earlier version wrote the
    # file twice (the first copy was immediately overwritten) and never
    # closed either output handle.
    with open(outFile, 'wb') as out:
        pdfFileWriter.write(out)
def add_watermark(orig_cover, key):
    """Stamp a subject image onto a cover page.

    The image selected by ``key`` is drawn on a temporary one-page PDF
    (``watermark.pdf`` in the current directory), which is merged onto
    ``orig_cover`` and then deleted.

    Args:
        orig_cover: Page object to stamp; it is modified in place.
        key: 0 = CONST_ENGLISH, 1 = CONST_MATH, 2 = CONST_READING,
            3 = CONST_SCIENCE. Any other value draws nothing.

    Returns:
        ``orig_cover`` with the watermark merged in.
    """
    import io

    c = canvas.Canvas('watermark.pdf')
    if key == 0:
        c.drawImage(CONST_ENGLISH, 190, 220)
    elif key == 1:
        c.drawImage(CONST_MATH, 220, 220)
    elif key == 2:
        c.drawImage(CONST_READING, 190, 220)
    elif key == 3:
        c.drawImage(CONST_SCIENCE, 190, 220)
    c.save()

    # Load the temporary file fully into memory and close it before deleting:
    # removing a file that is still open fails on Windows, and the merged page
    # keeps referring to the watermark reader after this function returns.
    with open("watermark.pdf", "rb") as watermark_file:
        watermark_bytes = watermark_file.read()
    os.remove("watermark.pdf")

    watermark = PdfFileReader(io.BytesIO(watermark_bytes))
    orig_cover.mergePage(watermark.getPage(0))
    return orig_cover
def pdfConcat(inputFiles, outputFileName, testPath):
    """Concatenate a cover page with the bodies of several PDFs.

    The output starts with the first page of ``testPath`` and continues with
    every page except the first of each file in ``inputFiles``, in order.

    Args:
        inputFiles: Iterable of paths of the PDFs to append.
        outputFileName: Path of the PDF to write.
        testPath: Path of the PDF that supplies the leading page.
    """
    from contextlib import ExitStack

    pdfWriter = PdfFileWriter()
    # The sources have to stay open until the writer has produced its output,
    # so they are collected in an ExitStack that closes all of them afterwards
    # (also when an error occurs).
    with ExitStack() as stack:
        temp = stack.enter_context(open(testPath, 'rb'))
        pdf = PdfFileReader(temp)
        pdfWriter.addPage(pdf.getPage(0))
        for filePath in inputFiles:
            f = stack.enter_context(open(filePath, 'rb'))
            pdfReader = PdfFileReader(f)
            # Skip each input's own first page.
            for page in pdfReader.pages[1:]:
                pdfWriter.addPage(page)
        with open(outputFileName, 'wb') as out:
            pdfWriter.write(out)
def add_page(self, path: str):
    """Copy a PDF and append one blank page at the end.

    The result is always written to ``./processed/add_blank.pdf``.

    Args:
        path: Full path of the source PDF.

    Returns:
        The status message ``"添加空白页完成!"``.
    """
    writer = PdfFileWriter()
    source = PdfFileReader(path)
    page_total = source.getNumPages()
    for number in range(page_total):
        current = source.getPage(number)
        writer.addPage(current)
    # The blank page goes after all copied pages.
    writer.addBlankPage()
    with open("./processed/add_blank.pdf", "wb") as target:
        writer.write(target)
    return "添加空白页完成!"
def mergePdf(inFileList, outFile):
    """Merge several PDFs into one.

    Args:
        inFileList: List of paths of the PDFs to merge, in output order.
        outFile: Path of the merged PDF.
    """
    from contextlib import ExitStack

    pdfFileWriter = PdfFileWriter()
    # Inputs stay open until the merged file is written, then all of them are
    # closed; previously neither the inputs nor the output were ever closed.
    with ExitStack() as stack:
        for inFile in inFileList:
            source = stack.enter_context(open(inFile, 'rb'))
            pdfReader = PdfFileReader(source)
            for index in range(pdfReader.getNumPages()):
                pdfFileWriter.addPage(pdfReader.getPage(index))
        # Finally, write everything to the output file in one go.
        with open(outFile, 'wb') as out:
            pdfFileWriter.write(out)
def merge_doc(self):
    """Merge each cover with its document.

    ``self.file_lists`` is rebuilt by zipping ``self.doc_code``,
    ``self.name_list`` and ``self.final_names``. For every triple the first
    two entries are concatenated (in that order) and written to the path
    given by the third entry.
    """
    from contextlib import ExitStack

    self.file_lists = list(
        zip(self.doc_code, self.name_list, self.final_names))
    for pdfnames in self.file_lists:
        output = PdfFileWriter()
        # Keep both inputs open until the merged file is written, then close
        # them; the output is closed even if writing fails.
        with ExitStack() as stack:
            for pdfname in pdfnames[0:2]:
                source = stack.enter_context(open(pdfname, "rb"))
                reader = PdfFileReader(source, strict=False)
                for iPage in range(reader.getNumPages()):
                    output.addPage(reader.getPage(iPage))
            pdfoutname = str(pdfnames[2])
            with open(pdfoutname, "wb") as outputStream:
                output.write(outputStream)
        print("文件合并完成:", pdfoutname)
    print("文件合并完成!")
    print("=><=" * 25)
def decrypt_pdf(input_file: str, password: str):
    """Decrypt an encrypted PDF and load its pages into a writer.

    Args:
        input_file: Path of the encrypted PDF.
        password: Password used to decrypt it.

    Returns:
        ``(True, reader, writer)`` on success, where ``writer`` already holds
        every page of ``reader``; ``(False, None, None)`` when the file is not
        encrypted or its pages cannot be read.
    """
    stream = open(input_file, 'rb')
    succeeded = False
    try:
        pdf_reader = PdfFileReader(stream, strict=False)
        if not pdf_reader.isEncrypted:
            print(f"PDF File {input_file} not encrypted")
            return False, None, None
        pdf_reader.decrypt(password=password)
        pdf_writer = PdfFileWriter()
        try:
            for page_number in range(pdf_reader.numPages):
                pdf_writer.addPage(pdf_reader.getPage(page_number))
        except utils.PdfReadError as e:
            print(f"Error reading PDF File {input_file} = {e}")
            return False, None, None
        succeeded = True
        return True, pdf_reader, pdf_writer
    finally:
        # On success the stream is deliberately left open because the returned
        # reader and writer are still backed by it; on every other path it is
        # closed so the handle does not leak.
        if not succeeded:
            stream.close()
def merge_pdfs(paths, output):
    """Merge several PDFs into ``output``, optionally rescaling A4 pages.

    Args:
        paths: Iterable of entries where ``entry[0]`` is the PDF path and
            ``entry[1]`` its format label (``'A4'`` triggers scaling).
        output: Path of the merged PDF.

    When the module-level ``USE_SCALING`` flag is truthy, pages of entries
    labelled ``'A4'`` are scaled to 812 x 595 before being added.
    """
    writer = PdfFileWriter()
    for entry in paths:
        print(f'path {entry[1]} {entry[0]}')
        reader = PdfFileReader(entry[0])
        page_total = reader.getNumPages()
        for number in range(page_total):
            current = reader.getPage(number)
            if USE_SCALING and entry[1] == 'A4':
                print('scaling...')
                current.scaleTo(812, 595)
            elif USE_SCALING:
                print('merge as is...')
            writer.addPage(current)
    with open(output, 'wb') as target:
        writer.write(target)
def parse(pdf_file, final_path=r"C:\Users\big\Desktop\final.pdf", keyword="微信"):
    """Copy every page whose text matches ``keyword`` into a new PDF.

    The text layout of each page is analysed and the page is selected when
    any of its text elements matches ``keyword``. The selected pages are then
    copied, in document order, into ``final_path``.

    Args:
        pdf_file: Path of the PDF to scan.
        final_path: Path of the PDF that receives the matching pages.
        keyword: Regular expression searched for in the page text.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The context manager closes the source even when an exception is raised.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        # Document object that stores the PDF structure.
        document = PDFDocument(parser)
        if not document.is_extractable:
            print('nono')
            raise PDFTextExtractionNotAllowed

        # Shared resources, layout parameters, aggregating device and
        # interpreter used for the layout analysis.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pattern = re.compile(keyword)
        pageindex = []
        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            # any() records a page at most once, even if several of its text
            # elements match; appending per element duplicated such pages.
            if any(isinstance(x, LTText) and pattern.search(x.get_text())
                   for x in layout):
                pageindex.append(i)

        pdf_output = PdfFileWriter()
        pdf_input = PdfFileReader(fp)
        for j in pageindex:
            pdf_output.addPage(pdf_input.getPage(j))
        with open(final_path, "wb") as f:
            pdf_output.write(f)
def merge_pdf(self, file_list, outpdf):
    """Merge several PDFs into a single file under ``self.processed``.

    Args:
        file_list: Iterable of the PDFs to merge, in output order.
        outpdf: Output file name without directory, e.g. ``merge_res.pdf``.

    Returns:
        The status message ``"合并完成!"``.
    """
    writer = PdfFileWriter()
    for source in file_list:
        reader = PdfFileReader(source)
        page_total = reader.getNumPages()
        # Append every page of this source to the shared writer.
        for number in range(page_total):
            current = reader.getPage(number)
            writer.addPage(current)
    target_path = self.processed + outpdf
    with open(target_path, "wb") as target:
        writer.write(target)
    return "合并完成!"
def merge_pdfs(paths, output):
    """Combine individually downloaded dashboard files into one report.

    Args:
        paths: Iterable of entries where ``entry[0]`` is the PDF path and
            ``entry[1]`` its format label (``'A4'`` triggers scaling).
        output: Path of the compiled PDF.

    When the module-level ``USE_SCALING`` flag is truthy, pages of entries
    labelled ``'A4'`` are scaled to 812 x 595 before being added.
    """
    writer = PdfFileWriter()
    for entry in paths:
        file_log.debug(f'path {entry[1]} {entry[0]}')
        reader = PdfFileReader(entry[0])
        page_total = reader.getNumPages()
        for number in range(page_total):
            current = reader.getPage(number)
            if USE_SCALING and entry[1] == 'A4':
                file_log.debug('scaling...')
                current.scaleTo(812, 595)
            elif USE_SCALING:
                file_log.debug('merge as is...')
            writer.addPage(current)
    with open(output, 'wb') as target:
        writer.write(target)
def extractImages():
    """Extract the embedded images of a PDF into ``workDir``.

    The source is ``join(workDir, fileName + ext)``. Each image XObject is
    saved according to its ``/Filter``: FlateDecode images are rebuilt from
    raw bytes and saved with the ``png`` suffix, DCTDecode images are decoded
    and saved with the ``jpg`` suffix, and JPXDecode / CCITTFaxDecode data is
    written unchanged with the ``jp2`` / ``tiff`` suffix.

    NOTE(review): file names only contain the page number, so several images
    on one page that share a filter overwrite each other.
    """
    filePDF = join(workDir, fileName + ext)
    with open(filePDF, "rb") as pdfFile:
        pdfReader = PdfFileReader(pdfFile)
        for pgNum in range(pdfReader.numPages):
            page = pdfReader.getPage(pgNum)
            try:
                xObject = page['/Resources']['/XObject'].getObject()
            except KeyError:
                # Pages without any XObject resources have nothing to extract.
                continue
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue
                size = (image['/Width'], image['/Height'])
                data = image.getData()
                mode = "RGB" if image['/ColorSpace'] == '/DeviceRGB' else "P"
                imageFilter = image['/Filter']
                if imageFilter == '/FlateDecode':
                    img = Image.frombytes(mode, size, data)
                    img.save(join(workDir, f'{pgNum}_{fileName}{png}'))
                elif imageFilter == '/DCTDecode':
                    img = Image.open(io.BytesIO(data))
                    img.save(join(workDir, f'{pgNum}_{fileName}{jpg}'))
                elif imageFilter == '/JPXDecode':
                    with open(join(workDir, f'{pgNum} {fileName}{jp2}'), "wb") as img:
                        img.write(data)
                elif imageFilter == '/CCITTFaxDecode':
                    with open(join(workDir, f'{pgNum} {fileName}{tiff}'), "wb") as img:
                        img.write(data)
def pdf_to_pdfs(pdf_folder, page_count):
    """Combine single-page PDFs into one document.

    The first page of ``<pdf_folder>/0.pdf`` ... ``<pdf_folder>/<page_count-1>.pdf``
    is collected, in that order, into ``<pdf_folder>/combine.pdf``.

    Args:
        pdf_folder: Directory holding the numbered single-page PDFs.
        page_count: Number of single-page files to combine.
    """
    pdf_writer = PdfFileWriter()
    pdf_file_path = '{}/{}.pdf'.format(pdf_folder, 'combine')
    # The loop variable is driven by range(); the former manual increment
    # inside the loop had no effect and was removed.
    for page_index in range(page_count):
        page_pdf_file = '{}/{}.pdf'.format(pdf_folder, page_index)
        reader = PdfFileReader(page_pdf_file, strict=False)
        pdf_writer.addPage(reader.getPage(0))
    # Close the output deterministically instead of leaking the handle.
    with open(pdf_file_path, 'wb') as out:
        pdf_writer.write(out)
def test_change(fs):
    """Edit title, creation date and tags, then verify the saved PDF.

    ``fs`` is the file-system fixture through which the real ``test.pdf``
    next to this module is exposed as writable (presumably pyfakefs — confirm).
    """
    pdf_path = str(Path(__file__).parent / 'test.pdf')
    fs.add_real_file(pdf_path, read_only=False)

    accessor = PDFAccessor(pdf_path)
    accessor.edit(SetTitleCmd(pdf_path, 'Why Donuts Are Great'))
    created_at = datetime.fromisoformat('1999-02-04T06:08:10+00:00')
    accessor.edit(SetCreatedCmd(pdf_path, created_at))
    accessor.edit(AddTagCmd(pdf_path, 'tag3'))
    accessor.edit(DelTagCmd(pdf_path, 'tag2'))
    assert accessor.save()

    with open(pdf_path, 'rb') as handle:
        document = PdfFileReader(handle)
        first_page_text = document.getPage(0).extractText()
        assert 'I like donuts' in first_page_text
        # Metadata that existed before the edit must survive it.
        assert document.getDocumentInfo()['/Creator'] == 'Pages'

    saved = PDFAccessor(pdf_path).info()
    assert saved.title == 'Why Donuts Are Great'
    assert saved.created == datetime.fromisoformat('1999-02-04T06:08:10+00:00')
    assert saved.tags == {'tag1', 'tag3'}
def fetch_stuff(pno):
    """Fetch a view's page image and its OCR print-space geometry.

    The view is downloaded as PDF and an image is extracted from the page at
    index 2 of that PDF. The OCR data of the same view supplies the page and
    print-space dimensions, which are rescaled to the extracted image.

    Args:
        pno: View number passed to the remote client ``r``.

    Returns:
        A tuple ``(upper, text_height, text_width, data, height)``: the scaled
        top-left corner of the print space as a ``Point``, the scaled
        print-space height and width, the extracted image, and the scaled
        page height.
    """
    pg = r.content_sync(startview=pno, nviews=1, mode="pdf").value
    reader = PdfFileReader(BytesIO(pg))
    data, type_ = extract_image(reader.getPage(2))

    ocr = r.ocr_data_sync(view=pno).value
    soup = BeautifulSoup(ocr.decode())
    page = soup.find("page")
    height, width = int(page.get("height")), int(page.get("width"))

    # NOTE(review): the x scale is derived from the heights and the y scale
    # from the widths, which looks swapped — confirm before changing; the two
    # are equal whenever the aspect ratio is preserved.
    xscale = data.height / height
    yscale = data.width / width
    height *= yscale

    printspace = soup.find("printspace")
    text_height = round(int(printspace.get("height")) * yscale)
    text_width = round(int(printspace.get("width")) * xscale)
    vpos = int(printspace.get("vpos")) * yscale
    hpos = int(printspace.get("hpos")) * xscale
    upper = Point(round(hpos), round(vpos))
    return upper, text_height, text_width, data, height
def MergePDF(filepath):
    """Merge the groups of PDFs found below ``filepath``'s input folder.

    ``getFileName`` supplies the groups of input paths. The files of each
    group are concatenated and written to the path of the group's first file
    with ``"input"`` replaced by ``"output"``. A message box confirms
    completion; on any error the error is printed and the process exits.

    Args:
        filepath: Base directory that contains the ``input`` folder.
    """
    from contextlib import ExitStack

    try:
        in_file_path = filepath + r"\input\\"
        pdf_fileName = getFileName(in_file_path)
        for pdfnames in pdf_fileName:
            output = PdfFileWriter()
            # Inputs stay open until the merged file is written and are then
            # closed; the output is closed even if writing fails.
            with ExitStack() as stack:
                for pdfname in pdfnames:
                    source = stack.enter_context(open(pdfname, "rb"))
                    reader = PdfFileReader(source)
                    for iPage in range(reader.getNumPages()):
                        output.addPage(reader.getPage(iPage))
                pdfoutname = str(pdfnames[0]).replace("input", "output")
                with open(pdfoutname, "wb") as outputStream:
                    output.write(outputStream)
        messagebox.showinfo("Complete!", "Complete!")
    except Exception as err:
        # Top-level boundary of the tool: report the error and stop.
        print("Something went wrong")
        print(err)
        sys.exit()
def add_encryption(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Source PDF to read.
        output_pdf: Path of the encrypted PDF to write.
        password: User password; no separate owner password is set.
    """
    writer = PdfFileWriter()
    source = PdfFileReader(input_pdf)
    page_total = source.getNumPages()
    for number in range(page_total):
        current = source.getPage(number)
        writer.addPage(current)
    # 128-bit encryption is requested explicitly.
    writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)
    with open(output_pdf, 'wb') as target:
        writer.write(target)
def encrypt_pdf(self, inputpdf, outpdf, password):
    """Write a password-protected copy of a PDF under ``self.processed``.

    Args:
        inputpdf: Source PDF, including its directory.
        outpdf: Output file name without directory.
        password: User password for the encrypted copy.

    Returns:
        The status message ``"文档加密完成!"``.
    """
    writer = PdfFileWriter()
    source = PdfFileReader(inputpdf)
    page_total = source.getNumPages()
    for number in range(page_total):
        current = source.getPage(number)
        writer.addPage(current)
    # Request 128-bit encryption explicitly.
    writer.encrypt(user_pwd=password, use_128bit=True)
    target_path = self.processed + outpdf
    with open(target_path, 'wb') as target:
        writer.write(target)
    return "文档加密完成!"
def SaveButton(self, event):
    """Join the selected PDFs and save the result where the user chooses.

    An error dialog is shown when fewer than two inputs are selected.
    Otherwise a save dialog asks for the target; if it is confirmed, all
    pages of all inputs are written there and a success dialog is shown.
    The save dialog is destroyed and ``self.clear_func()`` is called at the
    end in every case.

    Args:
        event: The wx event that triggered the handler (unused).
    """
    dlg = wx.FileDialog(
        self,
        message="Save file as...",
        defaultDir=os.getcwd(),
        defaultFile="",
        wildcard=pdfs,
        style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT,
    )
    input_paths = []
    for name in self.input_files.GetStrings():
        input_paths.append(self.files_and_paths[name])

    if len(input_paths) < 2:
        error_dlg = wx.MessageDialog(
            self,
            "Please select at least two pdfs to join",
            "Something went wrong...",
            wx.OK | wx.ICON_ERROR,
        )
        error_dlg.ShowModal()
        error_dlg.Destroy()
    elif dlg.ShowModal() == wx.ID_OK:
        output_path = dlg.GetPath()
        writer = PdfFileWriter()
        for source_path in input_paths:
            reader = PdfFileReader(source_path)
            page_total = reader.getNumPages()
            for number in range(page_total):
                writer.addPage(reader.getPage(number))
        with open(output_path, 'wb') as target:
            writer.write(target)
        success_text = f"""You created {dlg.GetFilename()} and saved it at {dlg.GetPath()}."""
        success_dlg = wx.MessageDialog(
            self, success_text, "Success!", wx.OK | wx.ICON_INFORMATION)
        success_dlg.ShowModal()
        success_dlg.Destroy()

    dlg.Destroy()
    self.clear_func()
def encrypt_pdf(input_file: str, password: str):
    """Load an unencrypted PDF into a writer and encrypt the writer.

    Args:
        input_file: Path of the PDF to encrypt; it must not be encrypted yet.
        password: User password to set; no separate owner password is given.

    Returns:
        ``(True, reader, writer)`` on success, where ``writer`` holds every
        page of ``reader`` and has 128-bit encryption enabled;
        ``(False, None, None)`` when the file is already encrypted or its
        pages cannot be read.
    """
    pdf_writer = PdfFileWriter()
    stream = open(input_file, 'rb')
    succeeded = False
    try:
        pdf_reader = PdfFileReader(stream, strict=False)
        if pdf_reader.isEncrypted:
            print(f"PDF File {input_file} already encrypted")
            return False, None, None
        try:
            # Every page has to be added to the writer before it can be
            # written out encrypted.
            for page_number in range(pdf_reader.numPages):
                pdf_writer.addPage(pdf_reader.getPage(page_number))
        except utils.PdfReadError as e:
            print(f"Error reading PDF File {input_file} = {e}")
            return False, None, None
        # use_128bit=False would select 40-bit encryption instead.
        pdf_writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)
        succeeded = True
        return True, pdf_reader, pdf_writer
    finally:
        # On success the stream is deliberately left open because the returned
        # reader and writer are still backed by it; on every other path it is
        # closed so the handle does not leak.
        if not succeeded:
            stream.close()
def pdfConcat(each_section, new_filename, test_path):
    """Concatenate a cover page with the bodies of several PDFs.

    The output starts with the first page of ``test_path`` and continues with
    every page except the first of each file in ``each_section``. It is
    written to ``new_filename`` inside the ``CONST_LOCAL`` directory.

    Args:
        each_section: Iterable of paths of the PDFs to append.
        new_filename: File name of the output, relative to ``CONST_LOCAL``.
        test_path: Path of the PDF that supplies the leading page.
    """
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()
    # The sources have to stay open until the writer has produced its output,
    # so they are collected in an ExitStack that closes all of them afterwards
    # (also when an error occurs).
    with ExitStack() as stack:
        temp = stack.enter_context(open(test_path, 'rb'))
        pdf = PdfFileReader(temp)
        pdf_writer.addPage(pdf.getPage(0))
        for file_path in each_section:
            f = stack.enter_context(open(file_path, 'rb'))
            pdf_reader = PdfFileReader(f)
            # Skip each section's own first page.
            for page in pdf_reader.pages[1:]:
                pdf_writer.addPage(page)
        local_filename = os.path.join(CONST_LOCAL, new_filename)
        with open(local_filename, 'wb') as out:
            pdf_writer.write(out)
def pdf_merge(files_paths, file_name):
    """Merge the readable PDFs among ``files_paths`` into ``file_name``.

    The inputs are opened through ``get_files_descriptors`` and always
    released through ``close_file_descriptors``. Files that cannot be read
    are reported and skipped; pages that cannot be added and a failing write
    are reported as well (best effort, no exception is propagated for them).

    Args:
        files_paths: Paths of the PDFs to merge, in output order.
        file_name: Path of the merged PDF.
    """
    pdf_writer = PdfFileWriter()
    descs = get_files_descriptors(files_paths)
    try:
        for fd in descs:
            try:
                pdf_reader = PdfFileReader(fd)
            except Exception:
                print('Cant read file')
                # Skip this input: carrying on would either fail on an
                # undefined reader or merge the previous file a second time.
                continue
            for page in range(pdf_reader.getNumPages()):
                try:
                    pdf_writer.addPage(pdf_reader.getPage(page))
                except Exception:
                    print('Cant merge file')
        with open(file_name, 'wb') as f:
            try:
                pdf_writer.write(f)
            except Exception:
                print('Cant write file')
    finally:
        # Release the inputs on every path, including a failure to open the
        # output file.
        close_file_descriptors(descs)
        print('All descriptors are closed')
def test():
    """Store every word of ``dan_brown.pdf`` in the ``letters`` table.

    The document info and page count are printed, then the text of each page
    is split into whitespace-separated pieces. Every piece is printed and
    inserted through the module-level ``mycursor``; the transaction is
    committed on ``mydb`` at the end.
    """
    sql = "INSERT INTO letters (words) VALUES (%s)"
    filename = 'dan_brown.pdf'
    print('...Starting..........')
    # The context manager closes the PDF once all pages have been processed.
    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        print(pdf.getDocumentInfo())
        page_count = pdf.getNumPages()
        print("Numbers of pages {} ". format(page_count))
        for i in range(page_count):
            text = pdf.getPage(i).extractText()
            lines = (line.strip() for line in text.splitlines())
            # Break each line into its pieces.
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # Drop empty pieces.
            text = '\n'.join(chunk for chunk in chunks if chunk)
            for line in text.split("\n"):
                for letter in line.split(" "):
                    print("--{}--". format(letter))
                    # Parameterised query: the value is never formatted into
                    # the SQL text.
                    mycursor.execute(sql, (letter, ))
    print('Finish')
    mydb.commit()
def encrypt_single_pdf(input_file, output_file, password):
    """Encrypt a single PDF and save it as another file.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the encrypted PDF to write.
        password: Password used for the encryption.

    Returns:
        None
    """
    with open(input_file, 'rb') as input_stream:
        reader = PdfFileReader(input_stream)
        writer = PdfFileWriter()
        for page in range(reader.getNumPages()):
            writer.addPage(reader.getPage(page))
        writer.encrypt(password)
        # The former second loop only fetched each page into an unused
        # variable (a disabled scaling experiment) and was removed.
        # The output is written while the input is still open.
        with open(output_file, 'wb') as output_stream:
            writer.write(output_stream)
def remove_noise(inputFile):
    """Blank the text operands of a PDF and store it as a template.

    The source is read from ``./static/img/<inputFile>``. In every page's
    content stream the first operand of each ``Tf`` and ``Tj`` operator is
    replaced by an empty text string. The result is written to
    ``./static/template/<template_id>.<extension of inputFile>``.

    Args:
        inputFile: File name (not path) of the uploaded PDF.

    Returns:
        The generated template id: ``"TID"`` followed by the ``node`` field
        of a random UUID.
    """
    template_id = "TID" + str(uuid.uuid4().node)
    outputFile = template_id + '.' + inputFile.split('.')[-1]
    with open('./static/img/' + inputFile, "rb") as f:
        # The second positional parameter of PdfFileReader is ``strict``; the
        # file mode "rb" used to be passed there by mistake. Being truthy it
        # acted as strict=True, which is now spelled out.
        source = PdfFileReader(f, strict=True)
        output = PdfFileWriter()
        text_operators = (b_("Tf"), b_("Tj"))
        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, source)
            for operands, operator in content.operations:
                if operator in text_operators:
                    operands[0] = TextStringObject('')
            page[NameObject('/Contents')] = content
            output.addPage(page)
        # The output is written while the source file is still open.
        with open('./static/template/' + outputFile, "wb") as outputStream:
            output.write(outputStream)
    return template_id
def SaveButton(self, event):
    """Rotate the selected PDF and save the result where the user chooses.

    An error dialog is shown when no input PDF is selected. Otherwise a save
    dialog asks for the target; if it is confirmed, every page is rotated
    clockwise by the chosen number of degrees, written there, and a success
    dialog is shown. The save dialog is destroyed and ``self.clear_func()``
    is called at the end in every case.

    Args:
        event: The wx event that triggered the handler (unused).
    """
    dlg = wx.FileDialog(
        self,
        message="Save file as...",
        defaultDir=os.getcwd(),
        defaultFile="",
        wildcard=pdfs,
        style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT,
    )

    if not self.input_path:
        error_dlg = wx.MessageDialog(
            self,
            "Please select a pdf to rotate",
            "Something went wrong...",
            wx.OK | wx.ICON_ERROR,
        )
        error_dlg.ShowModal()
        error_dlg.Destroy()
    elif dlg.ShowModal() == wx.ID_OK:
        chosen = self.set_degrees.GetSelection()
        degrees = int(self.degrees_list[chosen])
        output_path = dlg.GetPath()
        writer = PdfFileWriter()
        reader = PdfFileReader(self.input_path)
        page_total = reader.getNumPages()
        for number in range(page_total):
            rotated = reader.getPage(number).rotateClockwise(degrees)
            writer.addPage(rotated)
        with open(output_path, 'wb') as target:
            writer.write(target)
        success_text = f"""You created {dlg.GetFilename()} and saved it at {dlg.GetPath()}."""
        success_dlg = wx.MessageDialog(
            self, success_text, "Success!", wx.OK | wx.ICON_INFORMATION)
        success_dlg.ShowModal()
        success_dlg.Destroy()

    dlg.Destroy()
    self.clear_func()
def merge_doc(self):
    """Merge each cover with its document.

    ``self.doc_codes`` and ``self.name_lists`` are zipped into pairs. The two
    files of a pair are concatenated (in that order) and written to the path
    of the second file with ``"input"`` replaced by ``"output"``. A failing
    pair is reported in a message box, its traceback is appended to
    ``error.txt`` in the current directory, and processing continues with the
    next pair.
    """
    from contextlib import ExitStack

    file_lists = list(zip(self.doc_codes, self.name_lists))
    for pdfnames in file_lists:
        try:
            output = PdfFileWriter()
            # Keep both inputs open until the merged file is written, then
            # close them; the output is closed even if writing fails.
            with ExitStack() as stack:
                for pdfname in pdfnames:
                    source = stack.enter_context(open(pdfname, "rb"))
                    reader = PdfFileReader(source, strict=False)
                    for iPage in range(reader.getNumPages()):
                        output.addPage(reader.getPage(iPage))
                pdfoutname = str(pdfnames[1]).replace("input", "output")
                with open(pdfoutname, "wb") as outputStream:
                    output.write(outputStream)
            print("文件合并完成:", pdfoutname)
        except Exception as err:
            # Per-pair boundary: report, log the traceback and move on.
            messagebox.showerror("Warning!", err)
            with open(os.path.join(os.getcwd(), "error.txt"), "a") as f:
                traceback.print_exc(file=f)
            print(err)
            continue
    print("文件合并完成!")
    print("=><=" * 25)
# can_2.drawString(85, 141, "Torino") # # can_2.drawString(180, 209, user["Cognome"].capitalize()) # can_2.drawString(350, 209, user["Nome"].capitalize()) # # can_2.save() # # packet_2.seek(0) # new_pdf_2 = PdfFileReader(packet_2) # read your existing PDF existing_pdf = PdfFileReader(open("./assets/partecip.pdf", "rb")) output = PdfFileWriter() # add the "watermark" (which is the new pdf) on the existing page_1 page_1 = existing_pdf.getPage(0) page_1.mergePage(new_pdf_1.getPage(0)) output.addPage(page_1) # page_2 = existing_pdf.getPage(1) # page_2.mergePage(new_pdf_2.getPage(0)) # output.addPage(page_2) # output.addPage(existing_pdf.getPage(1)) # finally, write "output" to a real file right_folder = "other/" if box != "crossfittorino" else "cfto/" outputStream = open( "assets/" + right_folder + user["Cognome"].replace(" ", "").lower() + user["Nome"].replace(" ", "") + ".pdf", "wb")