Python reader Examples, PyPDF2.reader Python Examples

Example #1

0

Show file

    def __init__(self, pdf_file_path, mode=PDFHandleMode.COPY):
        """Set up the handler from a PDF file on disk.

        :param pdf_file_path: path of the PDF file to load
        :param mode: how the writable copy is populated; defaults to
            ``PDFHandleMode.COPY``
        """
        # Read-only view of the source document.
        source = reader(pdf_file_path)
        self.__pdf = source

        # Bare file name, with the directory part stripped.
        self.file_name = os.path.basename(pdf_file_path)
        # Values reported by the reader for the source document.
        self.metadata = source.getXmpMetadata()
        self.doc_info = source.getDocumentInfo()
        self.pages_num = source.getNumPages()

        # Writable counterpart; how it gets filled depends on ``mode``.
        target = writer()
        self.__writeable_pdf = target
        if mode == PDFHandleMode.COPY:
            target.cloneDocumentFromReader(source)
            return
        if mode == PDFHandleMode.NEWLY:
            for position in range(self.pages_num):
                target.insertPage(source.getPage(position), position)

Example #2

0

Show file

def rotate(filenames, angle, overwrite):
    """Rotate every page of each listed PDF clockwise and save the result.

    :param filenames: object whose ``get()`` returns a comma-separated string
        of PDF paths
    :param angle: object whose ``get()`` returns a value accepted by ``int``;
        the result is passed to ``rotateClockwise``
    :param overwrite: object whose ``get()`` is truthy to write back to the
        input path; otherwise ``'.pdf'`` in the path is replaced by
        ``'_rotate.pdf'`` to form the output path
    """
    # Parse the angle once instead of once per page.
    degrees = int(angle.get())
    for inFp in filenames.get().split(','):
        inPdf = reader(inFp)
        outPdf = writer()
        outFp = inFp if overwrite.get() else inFp.replace(
            '.pdf', '_rotate.pdf')

        for i in range(inPdf.getNumPages()):
            outPdf.addPage(inPdf.getPage(i).rotateClockwise(degrees))

        # Write once, after all pages are collected. The previous version
        # rewrote the output for every page and never closed the handles.
        with open(outFp, 'wb') as out_file:
            outPdf.write(out_file)
    showinfo(message="完成，保存在源文件所在文件夹")

Example #3

0

Show file

File: cut_pdf.py Project: zj1123581321/py-project

 def __init__(self, pdf_file_path):
     """Prepare the cutter for the PDF stored at ``pdf_file_path``.

     :param pdf_file_path: path of the PDF file to read
     """
     # Keep only the page list of the source document.
     self.pages = reader(pdf_file_path).pages
     # Writer that receives the pages, plus the running count of pages
     # saved so far (nothing has been saved yet).
     self.pdf_writer, self.saved_page_num = writer(), 0

Example #4

0

Show file

File: book.py Project: hugooood/AddPDF

def main():
    """Clone ``./book.pdf``, add one bookmark and save the copy."""
    source_path, target_path = './book.pdf', './book-with-bookmark.pdf'
    bookmark_title, bookmark_page = u'Hello World! 你好，世界！', 2

    source = reader(source_path)
    output = writer()
    output.cloneDocumentFromReader(source)

    # Author's notes on the bookmark call: page numbers are zero-based,
    # non-ASCII titles must be unicode strings or they come out garbled,
    # and a page number beyond the last page raises IndexError.
    output.addBookmark(bookmark_title, bookmark_page)

    # Binary mode ('wb') is required; otherwise the written file is garbage.
    with open(target_path, 'wb') as fout:
        output.write(fout)

Example #5

0

Show file

def split(path, name, part=500):
    """Split the PDF ``path/name`` into files of at most ``part`` pages.

    Nothing is written when the document has ``part`` pages or fewer.
    Otherwise a directory named after the file (the text before its first
    ``'.'``) is created under ``path`` and chunk ``i`` is saved there as
    ``{i}_{name}``.

    :param path: directory containing the PDF
    :param name: file name of the PDF inside ``path``
    :param part: maximum number of pages per output file (default 500)
    :raises ValueError: if ``part`` is not positive
    """
    if part <= 0:
        raise ValueError('part must be a positive page count')

    f = reader(path + '/' + name)
    page = f.getNumPages()
    if page <= part:
        return

    newPath = path + '/' + name.split('.')[0]
    # Tolerate a directory left over from an earlier run.
    os.makedirs(newPath, exist_ok=True)

    # Ceiling division so the trailing partial chunk is written as well.
    div = (page + part - 1) // part
    for i in range(div):
        start = i * part
        end = min(start + part, page)  # exclusive upper bound
        # A fresh writer per chunk; sharing one writer made every later file
        # also contain the pages of all earlier chunks.
        w = writer()
        for p in range(start, end):
            # Index by the page number ``p``, not by the chunk number ``i``.
            w.addPage(f.getPage(p))
        with open(f'{newPath}/{i}_{name}', 'wb') as out_file:
            w.write(out_file)

Example #6

0

Show file

File: add_bookmark.py Project: An-jazhuang/pdf-helper

def main():
    """Copy ``./book.pdf`` into a writer, bookmark one page and save it."""
    # Writer pre-populated with everything the reader loaded from disk.
    pdf = writer()
    pdf.cloneDocumentFromReader(reader('./book.pdf'))

    # Author's notes: page numbers are zero-based, non-ASCII titles must be
    # unicode strings or they come out garbled, and a page number past the
    # end of the document raises IndexError.
    # The bookmark targets index 3. The original note calls this "page 3";
    # with the zero-based numbering above that would be the fourth page.
    # TODO confirm which was intended.
    target_page = 3
    pdf.addBookmark(u'Hello World! 你好，世界！', target_page)

    # Persist the modified document.
    with open('./book-with-bookmark.pdf', 'wb') as fout:
        pdf.write(fout)

Example #7

0

Show file

def splitByCsv():
    """Write each page of ``f.pdf`` to its own PDF named from ``f.csv``.

    Page ``i`` is named after the second comma-separated field of line
    ``i + 1`` of the CSV, with ``'.pdf'`` appended. Line 0 is skipped
    (presumably a header row; TODO confirm).
    """
    pdf = reader('f.pdf')
    n = pdf.getNumPages()
    with open('f.csv', 'r', encoding='utf-8') as csv_file:
        rows = csv_file.readlines()

    # The loop body was not indented under the ``for`` before, which is an
    # IndentationError; everything below belongs to the per-page loop.
    for i in range(n):
        p = pdf.getPage(i)
        # Drop the line ending first so it cannot end up inside the file
        # name when the wanted field is the last one on the line.
        name = rows[i + 1].rstrip('\r\n').split(",")[1] + '.pdf'
        print(name)
        if os.path.isfile(name):
            # NOTE(review): the suffix lands after the extension
            # ("x.pdf_1"); kept as-is. Confirm whether "x_1.pdf" was meant.
            name += '_1'
        w = writer()
        w.addPage(p)
        with open(name, 'wb') as out_file:
            w.write(out_file)


def split(path, name, part=500):
    """Split the PDF ``path/name`` into files of at most ``part`` pages.

    Nothing is written when the document has ``part`` pages or fewer.
    Otherwise a directory named after the file (the text before its first
    ``'.'``) is created under ``path`` and chunk ``i`` is saved there as
    ``{i}_{name}``.

    :param path: directory containing the PDF
    :param name: file name of the PDF inside ``path``
    :param part: maximum number of pages per output file (default 500)
    :raises ValueError: if ``part`` is not positive
    """
    if part <= 0:
        raise ValueError('part must be a positive page count')

    f = reader(path + '/' + name)
    page = f.getNumPages()
    if page <= part:
        return

    newPath = path + '/' + name.split('.')[0]
    # Tolerate a directory left over from an earlier run.
    os.makedirs(newPath, exist_ok=True)

    # Ceiling division so the trailing partial chunk is written as well.
    div = (page + part - 1) // part
    for i in range(div):
        start = i * part
        end = min(start + part, page)  # exclusive upper bound
        # A fresh writer per chunk; sharing one writer made every later file
        # also contain the pages of all earlier chunks.
        w = writer()
        for p in range(start, end):
            # Index by the page number ``p``, not by the chunk number ``i``.
            w.addPage(f.getPage(p))
        with open(f'{newPath}/{i}_{name}', 'wb') as out_file:
            w.write(out_file)

def main():
    """Script entry point: run the CSV-driven per-page split."""
    # The directory-wide ``split`` pass is currently switched off:
    # for fd in ['301','501']:
    #     ls=os.listdir(fd)
    #     for f in ls:
    #         split(fd,f)
    splitByCsv()


if __name__ == '__main__':
    main()

Example #8

0

Show file

def main():
    """Merge the PDFs found in ``base_path`` into ``merged.pdf`` there.

    Files are merged in sorted file-name order. Entries that do not end in
    ``.pdf`` and a ``merged.pdf`` left by an earlier run are skipped.
    """
    base_path = "需要合并的pdf文件目录路径"
    output_name = "merged.pdf"

    # Keep the readers referenced until the merged file has been written.
    pdf_part = []
    # os.listdir returns entries in arbitrary order; sort so the page order
    # of the result is deterministic.
    for f in sorted(os.listdir(base_path)):
        # Only real inputs: skip non-PDF entries and our own previous output.
        if f == output_name or not f.lower().endswith(".pdf"):
            continue
        print(f)
        pdf_part.append(reader(base_path + "/" + f))

    merged_pdf = writer()
    for pdf in pdf_part:
        for idx in range(pdf.getNumPages()):
            # Appending is what inserting at a running end index amounted to.
            merged_pdf.addPage(pdf.getPage(idx))

    with open(base_path + "/" + output_name, 'wb') as pdf_file:
        merged_pdf.write(pdf_file)

Example #9

0

Show file

File: img_split.py Project: Hoemr/small-projects

 def img2pdf(self):
     """Save every cropped image as a PDF and merge them into one file.

     Each image in ``self.crop_imgs`` is saved as ``img_pdf<N>.pdf`` (N
     counts from 1) inside the ``PDF_files`` folder under ``self.outfile``.
     The combined document is written to ``totalpdf.pdf`` in that folder.
     """
     path = self.outfile + r"\PDF_files"

     # Create the output folder; a failure (e.g. it already exists) is
     # reported but does not abort the conversion.
     try:
         os.makedirs(path)
     except OSError as e:
         print("创建文件夹出错，错误为----->", e)

     out_pdf = writer()
     # Handles given to the reader are kept open until the merged file has
     # been written, then closed here instead of being leaked.
     opened = []
     try:
         for i, img in enumerate(self.crop_imgs, start=1):
             outpath_pdf = path + r"\img_pdf" + str(i) + ".pdf"
             img.save(outpath_pdf, 'PDF')
             page_file = open(outpath_pdf, 'rb')
             opened.append(page_file)
             out_pdf.appendPagesFromReader(reader(page_file))
         with open(path + r"\totalpdf.pdf", 'wb') as merged_file:
             out_pdf.write(merged_file)
     finally:
         for page_file in opened:
             page_file.close()
     print("PDF文件保存在{}文件夹里".format(path))

Example #10

0

Show file

File: pdf_utils.py Project: TingyiLi/PDF_editor_py

    def __init__(self, pdf_file_path, mode=PDFHandleMode.COPY):
        """Load a PDF and prepare a writable companion document.

        :param pdf_file_path: path of the PDF file to open
        :param mode: ``PDFHandleMode.COPY`` (the default) clones the whole
            document into the writer; ``PDFHandleMode.NEWLY`` inserts the
            pages one at a time; any other value leaves the writer empty
        """
        pdf = reader(pdf_file_path)
        self.__pdf = pdf

        # File name only, without the directory part.
        self.file_name = os.path.basename(pdf_file_path)
        # Document-level values as reported by the reader.
        self.metadata = pdf.getXmpMetadata()
        self.doc_info = pdf.getDocumentInfo()
        self.pages_num = pdf.getNumPages()

        out = writer()
        self.__writeable_pdf = out
        if mode == PDFHandleMode.COPY:
            out.cloneDocumentFromReader(pdf)
        elif mode == PDFHandleMode.NEWLY:
            page_index = 0
            while page_index < self.pages_num:
                out.insertPage(pdf.getPage(page_index), page_index)
                page_index += 1

Example #11

0

Show file

def getbookmark(fullfile: str, outfile: str = None) -> list:
    """Extract the bookmarks of a PDF as ``(title, page_number)`` tuples.

    Top-level bookmarks keep their title unchanged. Each level of nesting
    prefixes the title with one tab, so first-level children get a single
    leading tab as before and deeper levels get one more per level.

    :param fullfile: path of the PDF file
    :param outfile: optional path; when given, one ``title@page`` line per
        bookmark is written to it
    :return: list of ``(title, page_number)`` tuples in outline order
    """
    pdffile = reader(fullfile)
    content = []

    def collect(entries, depth):
        # A list entry holds the children of the preceding bookmark and may
        # itself contain further lists. Recurse instead of assuming a single
        # level, which failed on ``.title`` of a nested list.
        for bm in entries:
            if isinstance(bm, list):
                collect(bm, depth + 1)
            else:
                thispage = pdffile.getDestinationPageNumber(bm)
                content.append(('\t' * depth + bm.title, thispage))

    collect(pdffile.getOutlines(), 0)

    # Dump the bookmarks to ``outfile`` when one was requested.
    # NOTE(review): the file is opened with the platform default encoding,
    # as before; confirm whether consumers would prefer an explicit one.
    if outfile is not None:
        with open(outfile, 'w') as f:
            for title, page in content:
                f.write('{}@{}\n'.format(title, page))
    return content


# lis = getbookmark('.test.pdf','./files/tt')
# lis = getbookmark('test.pdf')
# for t,n in lis:
#     print(t,n)

Example #12

0

Show file

File: Demo.py Project: Hoemr/small-projects

    def img2pdf(self):
        """Convert every image to a PDF and merge them into one document.

        Each image in ``self.img_datas`` is saved as ``./漫画/<n>.pdf`` (n
        counts from 1) and appended to a combined document that is written
        to ``./漫画/总的.pdf``.
        """
        files_path = './漫画'

        # Create the output folder. It may already exist, in which case
        # makedirs raises; that case is only reported.
        try:
            os.makedirs(files_path)
        except OSError:
            print('文件夹已经存在！')
        else:
            print("所爬取的内容保存在程序所在文件夹下的allimgs文件夹里!")

        pdf_writer = writer()
        # Handles given to the reader are kept open until the merged file
        # has been written, then closed here instead of being leaked.
        opened = []
        try:
            for i, img in enumerate(self.img_datas, start=1):
                # Save the image as a single-page PDF ...
                img_path = files_path + '/' + str(i) + '.pdf'
                img.save(img_path, 'PDF')
                # ... and append that page to the combined document.
                page_file = open(img_path, 'rb')
                opened.append(page_file)
                pdf_writer.appendPagesFromReader(reader(page_file))
            pdf_path = files_path + '/总的.pdf'
            with open(pdf_path, 'wb') as merged_file:
                pdf_writer.write(merged_file)
        finally:
            for page_file in opened:
                page_file.close()
        print('漫画保存在{}路径下'.format(pdf_path))

Example #13

0

Show file

 def open_pdf(input_pdf):
     """Return a ``reader`` for the PDF at path ``input_pdf``.

     The reader is constructed from the path itself. The previous version
     also opened the file in a ``with`` block but never used the handle,
     so that redundant open has been removed.
     """
     return reader(input_pdf)

Example #14

0

Show file

File: totalStudents.py Project: ejminmehranian/PDFParsingScripts

import os
from PyPDF2 import PdfFileReader as reader

# Parse existing user information from every registration form under `path`.
path = '/Users/Ejmin/Desktop/FEB182018/ADSRegistrationForms'
userId = 1000
for root, dirs, files in os.walk(path):
    for file in files:
        if ".pdf" not in file:
            continue
        userId += 1
        #print(userId)
        # Close each form as soon as its text fields have been extracted;
        # the handle was previously left open for every file visited.
        with open(os.path.join(root, file), 'rb') as form:
            studentFile = reader(form)
            studentFields = studentFile.getFormTextFields()
        print("-----------------------------------")
        print(studentFields["studentName"] + " " +
              studentFields["studentLastName"])
        print(studentFields["phoneNumber"])
        print(studentFields["email"])
        #print(studentFields["emergencyContact"])
        # The parent name is only printed when the form carries that field.
        if "parentFullName" in studentFields:
            print("Parent name : " + studentFields["parentFullName"])

        print(studentFields["className"])
        #print(studentFields["todaysDate"])
        #print(studentFields)

Example #15

0

Show file

File: pdfscrape.py Project: nturley/livestock

# store our table of data here
currentRow = []
allRows = []

# possible states
IGNORE = 0
FIRST_COL = 1
DEFAULT = 2

# state variable, start out ignoring cells
state = IGNORE

# extract all of the text from the first page of the pdf, store in s;
# the context manager closes the file even if extraction raises
with open('current.pdf', 'rb') as f:
    r = reader(f)
    p = r.getPage(0)
    s = p.extractText()

# parse through each of the cells
for cell in s.split('\n'):
    cell = cell.strip().upper()
    # States are plain ints, so compare by value (==) rather than identity.
    # ignore until we see a class type
    if state == IGNORE and cell in classtypes:
        state = DEFAULT
        currentRow.append(cell)
    # consume cells
    elif state == DEFAULT and cell not in pftypes:
        currentRow.append(cell)
    # consume cells until the pftype and then go to FIRST_COL

Example #16

0

Show file

File: count_page_numbers.py Project: jicruz96/PresupuestosPR.com

from PyPDF2 import PdfFileReader as reader
from os import getcwd, walk

# Count the pages of every PDF in the current working directory, list the
# files sorted by page count in files_by_page_number.csv, and print the total.
fileList = []
totalNumPages = 0
filesbyPageNumber = []
path = getcwd()

# walk() is top-down, so its first result is `path` itself. Stop there
# instead of traversing the whole tree only to discard the sub-directories.
for dirpath, _dirnames, files in walk(path):
    fileList += files
    break

for file in fileList:
    if ".pdf" in file:
        numPages = reader(file).numPages
        totalNumPages += numPages
        filesbyPageNumber.append((file, numPages))
        print('{} completed'.format(file))

# Fewest pages first.
filesbyPageNumber.sort(key=lambda x: x[1])

# One "name, pages" line per file. Formatting the fields directly replaces
# slicing the tuple's repr, which mangled names containing quotes or
# backslashes.
final_strings = []
for name, pages in filesbyPageNumber:
    final_strings.append('{}, {}\n'.format(name, pages))

with open('files_by_page_number.csv', 'w') as file:
    for string in final_strings:
        file.write(string)
        print(string)

print('Total number of pdf pages: ', totalNumPages)