Exemple #1
0
def basic_Op():
    """Print document-level and per-page details of the PDF file "1.pdf".

    The document info, page layout, page mode, XMP metadata and page count
    are printed first. Then, for each page index, the type of the page object
    and the page number the reader reports for that object are printed.
    """
    # The reader is constructed straight from a path string here.
    reader = PdfFileReader("1.pdf")

    # Document-level details, fetched and printed one at a time.
    print('documentInfo = %s' % reader.getDocumentInfo())
    print('pageLayout = %s ' % reader.getPageLayout())
    print('pageMode = %s' % reader.getPageMode())
    print('xmpMetadata  = %s ' % reader.getXmpMetadata())

    total = reader.getNumPages()
    print('pageCount = %s' % total)

    for idx in range(total):
        # Fetch the page object stored at this index.
        page = reader.getPage(idx)
        print('index = %d , pageObj = %s' % (idx, type(page)))
        # Ask the reader which page number it assigns to that object.
        print('pageNumber = %s ' % reader.getPageNumber(page))
Exemple #2
0
def info_page(readFile):
    """Print document-level and per-page details of the PDF at *readFile*.

    *readFile* is handed to ``PdfFileReader`` unchanged. The document info,
    page layout, page mode, XMP metadata and page count are printed, followed
    by each page object and the page number the reader reports for it.
    """
    reader = PdfFileReader(readFile)

    # Document-level details, fetched and printed one at a time.
    print('documentInfo = %s' % reader.getDocumentInfo())
    print('pageLayout = %s ' % reader.getPageLayout())
    print('pageMode = %s' % reader.getPageMode())
    print('xmpMetadata  = %s ' % reader.getXmpMetadata())

    total = reader.getNumPages()
    print('pageCount = %s' % total)

    for idx in range(total):
        # Fetch the page object stored at this index; the object itself is printed.
        page = reader.getPage(idx)
        print('index = %d , pageObj = %s' % (idx, page))
        # Ask the reader which page number it assigns to that object.
        print('pageNumber = %s ' % reader.getPageNumber(page))
def get_metadata():
    """Print metadata for the PDF files found under the ``pdf`` directory.

    The ``pdf`` directory (relative to the current working directory) is
    walked recursively. For every file whose extension is ``pdf``
    (case-insensitive) the document info entries, the page count, the page
    layout and a fixed set of XMP attributes are printed. The size in bytes is
    printed for every file visited, whether or not it is a PDF.

    Returns:
        None. All results are written to standard output.
    """
    # (XMP attribute name, label printed for it), in output order.
    xmp_fields = (
        ('dc_contributor', 'Contributor'),
        ('dc_identifier', 'Identifier'),
        ('dc_date', 'Date'),
        ('dc_source', 'Source'),
        ('dc_subject', 'Subject'),
        ('xmp_modifyDate', 'ModifyDate'),
        ('xmp_metadataDate', 'MetadataDate'),
        ('xmpmm_documentId', 'DocumentId'),
        ('xmpmm_instanceId', 'InstanceId'),
        ('pdf_keywords', 'PDF-Keywords'),
        ('pdf_pdfversion', 'PDF-Version'),
    )

    for dirpath, _dirnames, files in os.walk("pdf"):
        for data in files:
            path = dirpath + os.path.sep + data
            ext = data.lower().rsplit('.', 1)[-1]
            if ext == 'pdf':
                # Format the path into the header instead of passing it as a
                # second print argument, which left a literal "%s" in the output.
                print("[--- Metadata : %s " % path)
                print(
                    "------------------------------------------------------------------------------------"
                )
                # The context manager closes the file even if parsing fails.
                with open(path, 'rb') as stream:
                    pdfReader = PdfFileReader(stream)

                    # Fall back to an empty mapping when no document info is available.
                    info = pdfReader.getDocumentInfo() or {}
                    for metaItem in info:
                        # %-formatting accepts values that are not plain str.
                        print('[+] %s: %s' % (metaItem.strip('/'), info[metaItem]))

                    pages = pdfReader.getNumPages()
                    print('[+] Pages:', pages)

                    layout = pdfReader.getPageLayout()
                    print('[+] Layout: ' + str(layout))

                    xmpinfo = pdfReader.getXmpMetadata()
                    for attr, label in xmp_fields:
                        if hasattr(xmpinfo, attr):
                            print('[+] %s:' % label, getattr(xmpinfo, attr))

                    if hasattr(xmpinfo, 'dc_publisher'):
                        # The loop variable now matches the name used in the body.
                        for publisher in xmpinfo.dc_publisher:
                            if publisher:
                                print("[+] Publisher:\t" + publisher)

            # Reported for every walked file, not only PDFs.
            fsize = os.stat(path)
            print('[+] Size:', fsize.st_size, 'bytes \n\n')
Exemple #4
0
def get_info_pdf(filename):
    """Read basic information from the PDF file at *filename*.

    The document info, page count, page layout and, when the document has at
    least two pages, the page object at index 1 together with its page number
    are retrieved. The values are not returned; the function returns None.

    Args:
        filename: Path of the PDF file; opened in binary mode.

    Raises:
        OSError: If *filename* cannot be opened.
    """
    # The context manager closes the file even if the reader raises.
    with open(filename, 'rb') as file_stream:
        # Reader instance bound to the open stream.
        pdf_reader = PdfFileReader(file_stream)

        # Document information.
        document_info = pdf_reader.getDocumentInfo()

        # Total number of pages.
        pdf_page_nums = pdf_reader.getNumPages()

        # Page layout.
        pdf_layout = pdf_reader.getPageLayout()

        # Index 1 is the second page; skip the lookup for shorter documents
        # instead of failing on them.
        if pdf_page_nums > 1:
            single_page = pdf_reader.getPage(1)

            # Page number the reader reports for that page object.
            page_num = pdf_reader.getPageNumber(single_page)
# encoding:utf-8
from PyPDF2 import PdfFileReader, PdfFileWriter

# Path of the PDF to inspect; the reader is built directly from this path.
readFile = 'ks.pdf'
pdfFileReader = PdfFileReader(readFile)

# Document information.
documentInfo = pdfFileReader.getDocumentInfo()
print('documentInfo = %s' % documentInfo)

# Page layout.
pageLayout = pdfFileReader.getPageLayout()
print('pageLayout = %s ' % pageLayout)

# Page mode.
pageMode = pdfFileReader.getPageMode()
print('pageMode = %s' % pageMode)

# XMP metadata.
xmpMetadata = pdfFileReader.getXmpMetadata()
print('xmpMetadata  = %s ' % xmpMetadata)

# Number of pages in the document.
pageCount = pdfFileReader.getNumPages()
print('pageCount = %s' % pageCount)

for index in range(pageCount):
    # Page object stored at this index; only its type is printed.
    pageObj = pdfFileReader.getPage(index)
    print(
        'index = %d , pageObj = %s'
        % (index, type(pageObj))
    )
    # Page number the reader reports for that page object.
    pageNumber = pdfFileReader.getPageNumber(pageObj)
    print('pageNumber = %s ' % pageNumber)
    def add_page_numbers(self, input_path, output_path, mask, total_pages_flag, bottom_margin):
        """Write a copy of the PDF at *input_path* with a page-number label merged onto each page.

        Args:
            input_path: Path of the PDF to read.
            output_path: Path the resulting PDF is written to.
            mask: Text placed before the page number, separated by a space;
                skipped when falsy.
            total_pages_flag: When equal to ``'Y'``, ``" of <page count>"`` is
                appended to the label.
            bottom_margin: Y coordinate passed to ``drawString`` for the label
                (presumably in points, 72 per inch -- confirm with callers).
        """
        page_rotations_dict = PdfTask.get_page_rotations(input_path)
        logger.info('Page rotations: {}'.format(page_rotations_dict))
        output = PdfFileWriter()

        # The context manager closes the input even if a step below raises.
        # The pages added to the writer come from this reader, so the output
        # is written before the input is closed.
        with open(input_path, "rb") as input_pdf:
            reader = PdfFileReader(input_pdf)
            page_ct = reader.getNumPages()

            logger.info('doc info: ' + str(reader.documentInfo))
            logger.info('page layout:' + str(reader.getPageLayout()))
            logger.info('page mode:' + str(reader.getPageMode()))
            logger.info('xmp metadata:' + str(reader.getXmpMetadata()))

            for page_num in range(page_ct):
                #   inspect the current input PDF page
                page = reader.getPage(page_num)
                page_rect = page.mediaBox
                logger.info('page media box: Page {num}: {dim}'.format(num=page_num, dim=page_rect))
                #   dimensions for a letter sized sheet of paper are [0, 0, 612, 792]
                #   72 pt = 1 inch

                # Each corner is read from its own accessor; 'lower_left' now
                # uses getLowerLeft() rather than repeating the upper-left value.
                page_dimensions = {
                    'lower_left': page_rect.getLowerLeft(),
                    'lower_right': page_rect.getLowerRight(),
                    'upper_left': page_rect.getUpperLeft(),
                    'upper_right': page_rect.getUpperRight(),
                }
                logger.info('Page dimensions: {}'.format(page_dimensions))

                #   build the label text: "[mask ]<n>[ of <total>]", n is 1-based
                txt = str(page_num + 1)

                if mask:
                    txt = mask + " " + txt
                if total_pages_flag == 'Y':
                    txt = txt + " of " + str(page_ct)

                #   create a new in-memory PDF containing the label with Reportlab
                packet = io.BytesIO()

                page_width = page_rect.getWidth()
                page_height = page_rect.getHeight()

                # NOTE(review): the zero-size canvas looks deliberate (it may keep
                # expand=True below from growing the page) -- confirm before changing.
                c = canvas.Canvas(packet, pagesize=(0, 0))
                c.drawString(page_width / 2, bottom_margin, txt)

                c.save()
                packet.seek(0)
                new_pdf = PdfFileReader(packet)
                #   merge new watermark pdf with the original
                wm = new_pdf.getPage(0)
                # Pages missing from the rotation dict (or mapped to a falsy value) use 0.
                page_rotation = page_rotations_dict.get(page_num) or 0
                page.mergeRotatedTranslatedPage(
                    wm,
                    rotation=page_rotation,
                    tx=page_width / 2,
                    ty=page_height / 2,
                    expand=True,
                )
                # Scale back to the dimensions read from the media box before the merge.
                page.scaleTo(page_width, page_height)
                page.compressContentStreams()
                output.addPage(page)

            with open(output_path, "wb") as outputStream:
                output.write(outputStream)

        logger.debug('Successfully added page numbers to {}'.format(input_path))
Exemple #7
0
def test_get_page_layout(src, expected):
    """Check that the page layout read from resource *src* equals *expected*.

    *src* is joined onto ``RESOURCE_ROOT`` to build the path given to the reader.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    layout = PdfFileReader(pdf_path).getPageLayout()
    assert layout == expected
Exemple #8
0

# Exercise================================================================

1. 取得PDF 檔案資訊

from PyPDF2 import PdfFileReader , PdfFileWriter #import PyPDF2
# Path of the PDF to inspect; the reader is built directly from this path.
pdffile = r'/Users/martychen/Documents/Python/water.pdf'
pfr = PdfFileReader(pdffile)

# Document information.
documentInfo = pfr.getDocumentInfo()
print('documentInfo = %s' % documentInfo)

# Page layout.
pageLayout = pfr.getPageLayout()
print('pagelayout = %s' % pageLayout)

# Page mode.
pagemode = pfr.getPageMode()
print('pagemode = %s' % pagemode)

# XMP metadata.
xmpmetadata = pfr.getXmpMetadata()
print('xmpmetadata = %s' % xmpmetadata)

# Number of pages in the document.
pagecount = pfr.getNumPages()
print('pagecount = %s' % pagecount)