def basic_Op(readFile="1.pdf"):
    """Print the basic document properties of a PDF and walk its pages.

    Args:
        readFile: Path of the PDF to inspect. Defaults to ``"1.pdf"``, the
            file this function always read before the path became a
            parameter, so existing zero-argument calls behave the same.

    Prints the document info, page layout, page mode, XMP metadata and page
    count, then for every page the type of its page object and the page
    number the reader reports for it. Returns nothing.
    """
    # A path can be handed to the reader directly; passing
    # open(readFile, 'rb') is the equivalent file-object form.
    pdfFileReader = PdfFileReader(readFile)

    # Document information dictionary.
    documentInfo = pdfFileReader.getDocumentInfo()
    print('documentInfo = %s' % documentInfo)

    # Page layout.
    pageLayout = pdfFileReader.getPageLayout()
    print('pageLayout = %s ' % pageLayout)

    # Page mode.
    pageMode = pdfFileReader.getPageMode()
    print('pageMode = %s' % pageMode)

    xmpMetadata = pdfFileReader.getXmpMetadata()
    print('xmpMetadata = %s ' % xmpMetadata)

    # Number of pages in the document.
    pageCount = pdfFileReader.getNumPages()
    print('pageCount = %s' % pageCount)

    for index in range(pageCount):
        # Page object for the given zero-based index.
        pageObj = pdfFileReader.getPage(index)
        print('index = %d , pageObj = %s' % (index, type(pageObj)))

        # Ask the reader which page number this page object has.
        pageNumber = pdfFileReader.getPageNumber(pageObj)
        print('pageNumber = %s ' % pageNumber)
def info_page(readFile):
    """Print document-level properties of ``readFile`` and dump every page.

    Args:
        readFile: Path handed to ``PdfFileReader`` (an already opened binary
            file object is the alternative form).

    Output goes to stdout: document info, page layout, page mode, XMP
    metadata, page count, and then one pair of lines per page showing the
    page object itself and the page number the reader reports for it.
    """
    reader = PdfFileReader(readFile)

    # Document-wide properties, printed in a fixed order.
    print('documentInfo = %s' % reader.getDocumentInfo())
    print('pageLayout = %s ' % reader.getPageLayout())
    print('pageMode = %s' % reader.getPageMode())
    print('xmpMetadata = %s ' % reader.getXmpMetadata())

    total = reader.getNumPages()
    print('pageCount = %s' % total)

    # Per-page dump: fetch the page by index, then map it back to its number.
    for idx in range(total):
        page = reader.getPage(idx)
        print('index = %d , pageObj = %s' % (idx, page))
        print('pageNumber = %s ' % reader.getPageNumber(page))
def get_metadata():
    """Print document and XMP metadata for every ``.pdf`` file under ``pdf/``.

    Walks the ``pdf`` directory (relative to the current working directory)
    recursively. For each file whose extension is ``.pdf`` (case-insensitive)
    it prints a header, every entry of the document information dictionary,
    the page count, the page layout, the XMP properties that are present,
    and finally the file size in bytes.

    Returns nothing. Errors raised while opening or parsing a file are not
    caught here and propagate to the caller.
    """
    # (printed label, XMP attribute) pairs, emitted in this order when present.
    xmp_fields = (
        ('Contributor', 'dc_contributor'),
        ('Identifier', 'dc_identifier'),
        ('Date', 'dc_date'),
        ('Source', 'dc_source'),
        ('Subject', 'dc_subject'),
        ('ModifyDate', 'xmp_modifyDate'),
        ('MetadataDate', 'xmp_metadataDate'),
        ('DocumentId', 'xmpmm_documentId'),
        ('InstanceId', 'xmpmm_instanceId'),
        ('PDF-Keywords', 'pdf_keywords'),
        ('PDF-Version', 'pdf_pdfversion'),
    )

    for dirpath, _dirnames, files in os.walk("pdf"):
        for data in files:
            # splitext avoids treating an extension-less file named "pdf"
            # as a PDF, which a plain split on "." would do.
            if os.path.splitext(data)[1].lower() != '.pdf':
                continue

            path = os.path.join(dirpath, data)
            print("[--- Metadata : %s " % path)
            print(
                "------------------------------------------------------------------------------------"
            )

            # Keep the file open for every reader call and close it
            # afterwards, even if parsing fails part-way.
            with open(path, 'rb') as stream:
                pdfReader = PdfFileReader(stream)

                # Guard against a missing info dictionary.
                info = pdfReader.getDocumentInfo() or {}
                for metaItem in info:
                    # str() because values are not guaranteed to be strings.
                    print('[+] ' + metaItem.strip('/') + ': ' + str(info[metaItem]))

                pages = pdfReader.getNumPages()
                print('[+] Pages:', pages)

                layout = pdfReader.getPageLayout()
                print('[+] Layout: ' + str(layout))

                xmpinfo = pdfReader.getXmpMetadata()
                for label, attr in xmp_fields:
                    if hasattr(xmpinfo, attr):
                        print('[+] %s:' % label, getattr(xmpinfo, attr))

                if hasattr(xmpinfo, 'dc_publisher'):
                    for publisher in xmpinfo.dc_publisher:
                        if publisher:
                            print("[+] Publisher:\t" + publisher)

            print('[+] Size:', os.stat(path).st_size, 'bytes \n\n')
def get_info_pdf(filename):
    """Read basic properties of the PDF at ``filename``.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    The function queries the document info, the page count, the page at
    index 1, the page layout and the page number of that page. The values
    are only bound to locals; nothing is returned or printed.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    # The context manager closes the file on every exit path.
    with open(filename, 'rb') as file_stream:
        # Reader instance bound to the open stream.
        pdf_reader = PdfFileReader(file_stream)
        # Document information dictionary.
        document_info = pdf_reader.getDocumentInfo()
        # Total number of pages.
        pdf_page_nums = pdf_reader.getNumPages()
        # A single page object. NOTE(review): index 1 looks like the second
        # page (indices appear zero-based), so a one-page file would
        # presumably fail here - confirm.
        single_page = pdf_reader.getPage(1)
        # Page layout of the document.
        pdf_layout = pdf_reader.getPageLayout()
        # Page number the reader reports for that page object.
        page_num = pdf_reader.getPageNumber(single_page)
# encoding:utf-8
from PyPDF2 import PdfFileReader, PdfFileWriter

# Script: print the document-level properties of ks.pdf, then list its pages.
readFile = 'ks.pdf'

# The reader accepts a path directly; PdfFileReader(open(readFile, 'rb'))
# is the file-object alternative.
pdfFileReader = PdfFileReader(readFile)

# Document information dictionary.
documentInfo = pdfFileReader.getDocumentInfo()
print('documentInfo = %s' % documentInfo)

# Page layout.
pageLayout = pdfFileReader.getPageLayout()
print('pageLayout = %s ' % pageLayout)

# Page mode.
pageMode = pdfFileReader.getPageMode()
print('pageMode = %s' % pageMode)

# XMP metadata.
xmpMetadata = pdfFileReader.getXmpMetadata()
print('xmpMetadata = %s ' % xmpMetadata)

# Number of pages.
pageCount = pdfFileReader.getNumPages()
print('pageCount = %s' % pageCount)

for index in range(pageCount):
    # Page object stored at this index.
    pageObj = pdfFileReader.getPage(index)
    print('index = %d , pageObj = %s' % (index, type(pageObj)))

    # Page number the reader reports for that page object.
    pageNumber = pdfFileReader.getPageNumber(pageObj)
    print('pageNumber = %s ' % pageNumber)
def add_page_numbers(self, input_path, output_path, mask, total_pages_flag, bottom_margin):
    """Stamp a page-number label onto every page of a PDF.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the numbered PDF is written to.
        mask: Optional text placed before the page number; skipped when falsy.
        total_pages_flag: When equal to ``'Y'``, ``" of <page count>"`` is
            appended to the label.
        bottom_margin: Vertical position handed to ``drawString`` for the label.

    For each page a one-page watermark PDF holding the label is drawn in
    memory and merged onto the page, using the rotation looked up for that
    page (0 when none is recorded). The result is written to ``output_path``.
    """
    page_rotations_dict = PdfTask.get_page_rotations(input_path)
    logger.info('Page rotations: {}'.format(page_rotations_dict))

    output = PdfFileWriter()

    # The input stays open until the output has been written (same ordering
    # as before); the context manager also closes it when a step raises.
    with open(input_path, "rb") as input_pdf:
        reader = PdfFileReader(input_pdf)
        page_ct = reader.getNumPages()

        logger.info('doc info: ' + str(reader.documentInfo))
        logger.info('page layout:' + str(reader.getPageLayout()))
        logger.info('page mode:' + str(reader.getPageMode()))
        logger.info('xmp metadata:' + str(reader.getXmpMetadata()))

        for page_num in range(page_ct):
            # Inspect the current input page.
            page = reader.getPage(page_num)
            page_rect = page.mediaBox
            logger.info('page media box: Page {num}: {dim}'.format(num=page_num, dim=page_rect))

            # Dimensions for a letter sized sheet of paper are [0, 0, 612, 792];
            # 72 pt = 1 inch.
            page_dimensions = {
                'lower_left': page_rect.getLowerLeft(),
                'lower_right': page_rect.getLowerRight(),
                'upper_left': page_rect.getUpperLeft(),
                'upper_right': page_rect.getUpperRight(),
            }
            logger.info('Page dimensions: {}'.format(page_dimensions))

            # Build the label: "[mask ]<n>[ of <total>]".
            txt = str(page_num + 1)
            if mask:
                txt = mask + " " + txt
            if total_pages_flag == 'Y':
                txt = txt + " of " + str(page_ct)

            page_width = page_rect.getWidth()
            page_height = page_rect.getHeight()

            # Draw the label into an in-memory single-page PDF.
            packet = io.BytesIO()
            c = canvas.Canvas(packet, pagesize=(0, 0))
            c.drawString(page_width / 2, bottom_margin, txt)
            c.save()
            packet.seek(0)
            new_pdf = PdfFileReader(packet)

            # Merge the watermark page onto the original page.
            wm = new_pdf.getPage(0)
            page_rotation = page_rotations_dict.get(page_num) or 0
            page.mergeRotatedTranslatedPage(
                wm,
                rotation=page_rotation,
                tx=page_width / 2,
                ty=page_height / 2,
                expand=True,
            )
            # Scale back to the size measured before the merge.
            page.scaleTo(page_width, page_height)
            page.compressContentStreams()
            output.addPage(page)

        with open(output_path, "wb") as outputStream:
            output.write(outputStream)

    logger.debug('Successfully added page numbers to {}'.format(input_path))
def test_get_page_layout(src, expected):
    """Check the page layout reported for a resource PDF.

    Args:
        src: File name relative to ``RESOURCE_ROOT``.
        expected: Value ``getPageLayout()`` must return for that file.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    assert PdfFileReader(pdf_path).getPageLayout() == expected
# Exercise================================================================
# 1. Get PDF file information
from PyPDF2 import PdfFileReader, PdfFileWriter
# Alternative: import PyPDF2

pdffile = r'/Users/martychen/Documents/Python/water.pdf'
pfr = PdfFileReader(pdffile)

# Document information dictionary.
documentInfo = pfr.getDocumentInfo()
print('documentInfo = %s' % documentInfo)

# Page layout.
pageLayout = pfr.getPageLayout()
print('pagelayout = %s' % pageLayout)

# Page mode.
pagemode = pfr.getPageMode()
print('pagemode = %s' % pagemode)

# XMP metadata.
xmpmetadata = pfr.getXmpMetadata()
print('xmpmetadata = %s' % xmpmetadata)

# Number of pages.
pagecount = pfr.getNumPages()
print('pagecount = %s' % pagecount)