def __init__(self, pdf_file_path, mode=PDFHandleMode.COPY):
    """
    Initialise the handler from a PDF file.

    :param pdf_file_path: path of the PDF file to load
    :param mode: how the writable document is populated; defaults to
        PDFHandleMode.COPY. COPY clones the whole source document,
        NEWLY inserts the source pages one by one, and any other value
        leaves the writable document empty.
    """
    # Read-only PDF object.
    source_pdf = reader(pdf_file_path)
    self.__pdf = source_pdf

    # File name of the PDF without its directory part.
    self.file_name = os.path.basename(pdf_file_path)

    # NOTE(review): in the flattened source these three assignments follow
    # bare '#' markers; they are treated as live statements because
    # ``self.pages_num`` is read further down -- confirm against the file.
    self.metadata = source_pdf.getXmpMetadata()
    self.doc_info = source_pdf.getDocumentInfo()
    self.pages_num = source_pdf.getNumPages()

    # Writable PDF object, filled according to the requested mode.
    target_pdf = writer()
    self.__writeable_pdf = target_pdf
    if mode == PDFHandleMode.COPY:
        target_pdf.cloneDocumentFromReader(source_pdf)
    elif mode == PDFHandleMode.NEWLY:
        for page_index in range(self.pages_num):
            target_pdf.insertPage(source_pdf.getPage(page_index), page_index)
def rotate(filenames, angle, overwrite):
    """
    Rotate every page of each listed PDF clockwise and save the result.

    :param filenames: object whose ``get()`` returns a comma-separated
        string of PDF paths
    :param angle: object whose ``get()`` returns the clockwise rotation in
        degrees (anything ``int()`` accepts)
    :param overwrite: object whose ``get()`` is truthy when the rotated
        document should replace the input file; otherwise the output goes
        next to the input with ``_rotate`` appended to the base name
    """
    import os

    # Read the settings once instead of once per page / per file.
    degrees = int(angle.get())
    replace_source = overwrite.get()

    for inFp in filenames.get().split(','):
        inPdf = reader(inFp)
        outPdf = writer()
        for i in range(inPdf.getNumPages()):
            outPdf.addPage(inPdf.getPage(i).rotateClockwise(degrees))

        if replace_source:
            outFp = inFp
        else:
            # Build the name from the real extension: a plain
            # str.replace('.pdf', ...) leaves names such as 'A.PDF'
            # unchanged (silently overwriting the source) and also rewrites
            # '.pdf' occurring inside directory names.
            base, ext = os.path.splitext(inFp)
            outFp = base + '_rotate' + ext

        # Close the output file deterministically so the data is flushed.
        with open(outFp, 'wb') as out_file:
            outPdf.write(out_file)

    showinfo(message="完成,保存在源文件所在文件夹")
def __init__(self, pdf_file_path):
    """
    Initialise from the path of a PDF file.

    :param pdf_file_path: path of the PDF file to read
    """
    # Only the page list of the reader is kept; the reader object itself
    # is not stored on the instance.
    self.pages = reader(pdf_file_path).pages

    # Writer object used for output.
    self.pdf_writer = writer()

    # Total number of pages saved so far; nothing has been saved yet.
    self.saved_page_num = 0
def main():
    """Clone ./book.pdf, add one bookmark and save it as ./book-with-bookmark.pdf."""
    source_path = './book.pdf'
    target_path = './book-with-bookmark.pdf'

    source_book = reader(source_path)
    output = writer()
    output.cloneDocumentFromReader(source_book)

    # Add the bookmark.
    # Page indices are zero-based. Per the original author's notes, the
    # title should be a unicode string to avoid garbled text, and an index
    # beyond the last page raises IndexError.
    bookmark_title = u'Hello World! 你好,世界!'
    bookmark_page_index = 2
    output.addBookmark(bookmark_title, bookmark_page_index)

    # The output must be opened in binary 'wb' mode; per the original
    # author's notes, any other mode produces a garbled file.
    with open(target_path, 'wb') as out_file:
        output.write(out_file)
def split(path, name, part=500):
    """
    Split a large PDF into consecutive chunks of at most ``part`` pages.

    Documents with ``part`` pages or fewer are left untouched. Otherwise a
    directory named after the text before the first '.' in ``name`` is
    created inside ``path`` and each chunk is written there as
    ``<chunk index>_<name>``.

    :param path: directory containing the PDF
    :param name: file name of the PDF inside ``path``
    :param part: maximum number of pages per chunk (default 500)
    :raises ValueError: if ``part`` is smaller than 1
    """
    if part < 1:
        raise ValueError("part must be a positive page count")

    source = reader(f'{path}/{name}')
    page = source.getNumPages()
    if page <= part:
        return

    newPath = f'{path}/{name.split(".")[0]}'
    # exist_ok lets the function be re-run on the same file.
    os.makedirs(newPath, exist_ok=True)

    # Stepping by ``part`` also yields the final, shorter chunk holding the
    # remainder pages.
    for i, start in enumerate(range(0, page, part)):
        # A fresh writer per chunk, so chunks do not accumulate the pages
        # of the preceding ones.
        chunk = writer()
        for p in range(start, min(start + part, page)):
            chunk.addPage(source.getPage(p))
        with open(f'{newPath}/{i}_{name}', 'wb') as out_file:
            chunk.write(out_file)
def main():
    """Clone ./book.pdf, bookmark page index 3 and save it as ./book-with-bookmark.pdf."""
    # Read the PDF file through a reader object.
    original = reader('./book.pdf')

    # Create the writer object and initialise it from the reader.
    copy = writer()
    copy.cloneDocumentFromReader(original)

    # Add the bookmark.
    # Page indices are zero-based, so 3 is passed as the target page index.
    # Per the original author's notes, the title should be a unicode string
    # to avoid garbled text, and an index beyond the last page raises
    # IndexError.
    target_page_index = 3
    copy.addBookmark(u'Hello World! 你好,世界!', target_page_index)

    # Save the modified PDF content to a file.
    with open('./book-with-bookmark.pdf', 'wb') as destination:
        copy.write(destination)
def splitByCsv():
    """
    Write every page of ``f.pdf`` to its own single-page PDF.

    Page ``i`` is named after the second comma-separated column of line
    ``i + 1`` of ``f.csv`` (line 0 is skipped), with '.pdf' appended. If a
    file of that name already exists, '_1' is appended to the name.
    """
    pdf = reader('f.pdf')
    n = pdf.getNumPages()
    with open('f.csv', 'r', encoding='utf-8') as csv_file:
        rows = csv_file.readlines()

    for i in range(n):
        p = pdf.getPage(i)
        name = rows[i + 1].split(",")[1] + '.pdf'
        print(name)
        if os.path.isfile(name):
            # NOTE(review): the suffix lands after the extension
            # ('x.pdf_1'); kept as-is so existing output names do not change.
            name += '_1'
        w = writer()
        w.addPage(p)
        with open(name, 'wb') as out_file:
            w.write(out_file)


def split(path, name, part=500):
    """
    Split a large PDF into consecutive chunks of at most ``part`` pages.

    Documents with ``part`` pages or fewer are left untouched. Otherwise a
    directory named after the text before the first '.' in ``name`` is
    created inside ``path`` and each chunk is written there as
    ``<chunk index>_<name>``.

    :param path: directory containing the PDF
    :param name: file name of the PDF inside ``path``
    :param part: maximum number of pages per chunk (default 500)
    :raises ValueError: if ``part`` is smaller than 1
    """
    if part < 1:
        raise ValueError("part must be a positive page count")

    source = reader(f'{path}/{name}')
    page = source.getNumPages()
    if page <= part:
        return

    newPath = f'{path}/{name.split(".")[0]}'
    # exist_ok lets the function be re-run on the same file.
    os.makedirs(newPath, exist_ok=True)

    # Stepping by ``part`` also yields the final, shorter chunk holding the
    # remainder pages.
    for i, start in enumerate(range(0, page, part)):
        # A fresh writer per chunk, so chunks do not accumulate the pages
        # of the preceding ones.
        chunk = writer()
        for p in range(start, min(start + part, page)):
            chunk.addPage(source.getPage(p))
        with open(f'{newPath}/{i}_{name}', 'wb') as out_file:
            chunk.write(out_file)


def main():
    """Entry point: split ``f.pdf`` into per-page files named from ``f.csv``."""
    # Disabled alternative kept from the original: split every file found
    # in the '301' and '501' directories.
    # for fd in ['301','501']:
    #     ls=os.listdir(fd)
    #     for f in ls:
    #         split(fd,f)
    splitByCsv()


if __name__ == '__main__':
    main()
def main(base_path="需要合并的pdf文件目录路径"):
    """
    Merge every PDF found directly in ``base_path`` into ``merged.pdf``.

    The inputs are processed in sorted file-name order and the result is
    written to ``<base_path>/merged.pdf``.

    :param base_path: directory holding the PDFs to merge; the default is
        the placeholder string from the original script
    """
    output_name = "merged.pdf"
    merged_pdf = writer()
    page_idx = 0

    # os.listdir returns entries in arbitrary order; sorting makes the page
    # order of the merged document deterministic.
    for f in sorted(os.listdir(base_path)):
        # Skip anything that is not a PDF, and skip the output of an earlier
        # run so it is not merged into itself.
        if f == output_name or not f.lower().endswith(".pdf"):
            continue
        print(f)
        pdf = reader(base_path + "/" + f)
        for idx in range(pdf.getNumPages()):
            merged_pdf.insertPage(pdf.getPage(idx), page_idx)
            page_idx += 1

    with open(base_path + "/" + output_name, 'wb') as pdf_file:
        merged_pdf.write(pdf_file)
def img2pdf(self):
    """
    Convert the cropped images to PDF in bulk and merge them.

    Each image in ``self.crop_imgs`` is saved as ``img_pdf<k>.pdf`` (k
    starting at 1) inside ``<self.outfile>\\PDF_files``, and all of them are
    appended, in order, to ``totalpdf.pdf`` in the same folder.
    """
    from contextlib import ExitStack

    path = self.outfile + r"\PDF_files"

    # Create the output folder; a failure (for example because it already
    # exists) is reported and processing continues, as before.
    try:
        os.makedirs(path)
    except OSError as e:
        print("创建文件夹出错,错误为----->", e)

    out_pdf = writer()

    # The per-page files are handed to the reader as open streams, so they
    # are kept open until the merged document has been written; the
    # ExitStack then closes every one of them, even on error.
    with ExitStack() as stack:
        for i, img in enumerate(self.crop_imgs, start=1):
            outpath_pdf = path + r"\img_pdf" + str(i) + ".pdf"
            img.save(outpath_pdf, 'PDF')
            page_file = stack.enter_context(open(outpath_pdf, 'rb'))
            out_pdf.appendPagesFromReader(reader(page_file))

        with open(path + r"\totalpdf.pdf", 'wb') as merged_file:
            out_pdf.write(merged_file)

    print("PDF文件保存在{}文件夹里".format(path))
def __init__(self, pdf_file_path, mode=PDFHandleMode.COPY):
    """
    Load a PDF and prepare a writable counterpart.

    :param pdf_file_path: path of the PDF file to load
    :param mode: how the writable document is filled; defaults to
        PDFHandleMode.COPY. COPY clones the source document, NEWLY inserts
        its pages one at a time, and any other value leaves the writable
        document empty.
    """
    loaded = reader(pdf_file_path)
    self.__pdf = loaded

    # PDF file name (without the path).
    self.file_name = os.path.basename(pdf_file_path)

    # NOTE(review): in the flattened source these three assignments follow
    # bare '#' markers; they are treated as live statements because
    # ``self.pages_num`` is read further down -- confirm against the file.
    self.metadata = loaded.getXmpMetadata()
    self.doc_info = loaded.getDocumentInfo()
    self.pages_num = loaded.getNumPages()

    editable = writer()
    self.__writeable_pdf = editable
    if mode == PDFHandleMode.COPY:
        editable.cloneDocumentFromReader(loaded)
    elif mode == PDFHandleMode.NEWLY:
        for position in range(self.pages_num):
            current_page = loaded.getPage(position)
            editable.insertPage(current_page, position)
def getbookmark(fullfile: str, outfile: str = None) -> list:
    """
    Extract the bookmarks of a PDF file as a list of (title, page) tuples.

    Top-level bookmarks keep their title unchanged; a bookmark nested
    ``d`` levels deep has ``d`` tab characters prepended to its title.
    The page is whatever ``getDestinationPageNumber`` returns for the
    bookmark.

    :param fullfile: path of the PDF file
    :param outfile: optional path; when given, the bookmarks are also
        written there, one ``title@page`` line per bookmark
    :return: list of (title, page) tuples in outline order
    """
    pdffile = reader(fullfile)
    content = []

    def collect(entries, depth):
        # Nested lists hold the children of the preceding bookmark; recurse
        # so outlines deeper than one level are handled instead of failing
        # on ``list.title``.
        for entry in entries:
            if isinstance(entry, list):
                collect(entry, depth + 1)
            else:
                page = pdffile.getDestinationPageNumber(entry)
                content.append(('\t' * depth + entry.title, page))

    collect(pdffile.getOutlines(), 0)

    # When outfile is given, store the bookmarks in it as well.
    if outfile is not None:
        with open(outfile, 'w') as f:
            f.writelines('{}@{}\n'.format(title, page) for title, page in content)
    return content


# lis = getbookmark('.test.pdf','./files/tt')
# lis = getbookmark('test.pdf')
# for t,n in lis:
#     print(t,n)
def img2pdf(self):
    """
    Convert every downloaded image to PDF and merge them into one file.

    Each image in ``self.img_datas`` is saved as ``<k>.pdf`` (k starting at
    1) inside ``./漫画`` and all of them are appended, in order, to
    ``./漫画/总的.pdf``.
    """
    from contextlib import ExitStack

    files_path = './漫画'

    # Create the folder. It may already exist, which raises; that is
    # reported and processing continues. Only OSError is caught so that
    # KeyboardInterrupt and programming errors are no longer swallowed.
    try:
        os.makedirs('./漫画')
    except OSError:
        print('文件夹已经存在!')
    else:
        # NOTE(review): the message names an 'allimgs' folder although the
        # directory created above is './漫画' -- confirm the intended text.
        print("所爬取的内容保存在程序所在文件夹下的allimgs文件夹里!")

    # Generate the per-image PDFs and merge them.
    pdf_writer = writer()
    pdf_path = files_path + '/总的.pdf'

    # The per-page files are handed to the reader as open streams, so they
    # are kept open until the merged document has been written; the
    # ExitStack then closes every one of them, even on error.
    with ExitStack() as stack:
        for i, img in enumerate(self.img_datas, start=1):
            img_path = files_path + '/' + str(i) + '.pdf'
            img.save(img_path, 'PDF')
            page_file = stack.enter_context(open(img_path, 'rb'))
            pdf_writer.appendPagesFromReader(reader(page_file))

        with open(pdf_path, 'wb') as merged_file:
            pdf_writer.write(merged_file)

    print('漫画保存在{}路径下'.format(pdf_path))
def open_pdf(input_pdf):
    """
    Return a PDF reader for ``input_pdf``.

    The file is opened in binary mode for the duration of the call, but the
    handle itself is not used: the reader is constructed from the path, and
    the handle is closed again before the reader is returned.

    :param input_pdf: path of the PDF file
    :return: the reader object created from ``input_pdf``
    """
    with open(input_pdf, 'rb'):
        pdf = reader(input_pdf)
    return pdf
import os

from PyPDF2 import PdfFileReader as reader

# Parse existing user information: walk the registration-form folder and
# print the text form fields of every PDF found in it.
path = '/Users/Ejmin/Desktop/FEB182018/ADSRegistrationForms'
i = 0
userId = 1000
for root, dirs, files in os.walk(path):
    for file in files:
        # Match on the extension only; a substring test would also accept
        # names such as 'form.pdf.bak'.
        if not file.lower().endswith(".pdf"):
            continue
        userId += 1

        # Read the form fields while the file is open, then close it right
        # away instead of leaking one handle per form.
        with open(os.path.join(root, file), 'rb') as form_file:
            studentFile = reader(form_file)
            studentFields = studentFile.getFormTextFields()

        print("-----------------------------------")
        print(studentFields["studentName"] + " " + studentFields["studentLastName"])
        print(studentFields["phoneNumber"])
        print(studentFields["email"])
        # The parent name is only printed when the form contains that field.
        if "parentFullName" in studentFields:
            print("Parent name : " + studentFields["parentFullName"])
        print(studentFields["className"])
        # Disabled debug output in the original also referenced userId, the
        # "emergencyContact" and "todaysDate" fields and the whole
        # studentFields mapping.
# store our table of data here currentRow = [] allRows = [] # possible states IGNORE = 0 FIRST_COL = 1 DEFAULT = 2 # state variable, start out ignoring cells state = IGNORE # extract all of the text from the pdf, store in s f = open('current.pdf','rb') r = reader(f) p = r.getPage(0) s = p.extractText() f.close() #parse through each of the cells for cell in s.split('\n'): cell = cell.strip().upper() # ignore until we see a class type if (state is IGNORE and cell in classtypes): state = DEFAULT currentRow.append(cell) # consume cells elif (state is DEFAULT and not cell in pftypes): currentRow.append(cell) # consume cells until the pftype and then go to FIRST_COL
from os import getcwd, walk

from PyPDF2 import PdfFileReader as reader

# Count the pages of every PDF directly inside the current working
# directory, write "<file name>, <pages>" lines sorted by page count to
# files_by_page_number.csv, and print the grand total.
fileList = []
totalNumPages = 0
filesbyPageNumber = []
path = getcwd()

# Only the files directly inside ``path`` are wanted, so take the first
# entry produced by walk() instead of traversing the whole tree.
fileList += next(walk(path), (path, [], []))[2]

for file in fileList:
    # Match on the extension only; a substring test would also accept
    # names such as 'notes.pdf.txt'.
    if file.lower().endswith(".pdf"):
        numPages = reader(file).numPages
        totalNumPages += numPages
        filesbyPageNumber.append((file, numPages))
        print('{} completed'.format(file))

filesbyPageNumber.sort(key=lambda x: x[1])

# Format each row directly; slicing the repr() of the tuple mangled names
# containing quotes or backslashes.
final_strings = [
    '{}, {}\n'.format(name, pages) for name, pages in filesbyPageNumber
]

with open('files_by_page_number.csv', 'w') as out_file:
    for string in final_strings:
        out_file.write(string)
        print(string)

print('Total number of pdf pages: ', totalNumPages)